# Import Libraries

In [1]:
!python --version

Python 3.10.0


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
from datetime import datetime 

In [3]:
# Better visualization
plt.style.use('ggplot')

## Declare Constants

In [4]:
KENYA_RAW_DATA = '../raw-data/Kenya_training.csv'

### Logging progress

In [5]:
def log_progress(message): 
    timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second 
    now = datetime.now() # get current timestamp 
    timestamp = now.strftime(timestamp_format) 
    with open("./log/kenya_log.txt","a") as f: 
        f.write(timestamp + ' : ' + message + '\n')

# 1. Load Data

In [31]:
# Kenya Training

log_progress('Preliminaries complete. Initiating main process')

try: 
    # Try to load the cleaned data (if it exists)
    kenya_train = pickle.load(open("../cleaned-data/kenya_training_cleaned.pkl", 'rb'))

    print("Loaded cleaned data from checkpoint")
    
except FileNotFoundError:
    # If the cleaned data doesn't exist, load the raw data
    kenya_train = pd.read_csv(KENYA_RAW_DATA, index_col=0)

    print("Loading raw data...")

display(kenya_train.head())

log_progress('Data extraction complete. Initiating EDA process')


Loading raw data...


Unnamed: 0_level_0,lon,lat,blue_p50,green_p50,nir_p50,nira_p50,re1_p50,re2_p50,re3_p50,red_p50,swir1_p50,swir2_p50,VV_p50,VH_p50,TARGET
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,37.059054,0.046039,2731.0,3035.0,5418.0,5306.5,2958.0,4610.5,5099.5,2994.0,4108.5,3187.5,-6.861458,-12.162713,1
2,37.059503,0.047296,2921.0,3329.0,5834.0,6472.0,4144.5,5782.0,6302.0,3344.0,5269.0,4471.5,-8.082717,-14.619608,1
3,37.057527,0.047656,3828.0,4163.0,6420.0,7890.5,5408.5,6695.5,7302.0,4380.0,6905.0,6905.5,-7.767043,-14.880353,1
4,37.057976,0.047925,2556.0,2839.0,4822.0,5276.5,3201.0,4643.0,5069.5,2787.0,3832.0,2867.5,-7.773118,-15.587653,1
5,37.056359,0.048554,3790.0,4009.0,6476.0,6199.5,4285.0,5711.0,6079.5,3939.0,4892.5,3702.5,-6.104467,-15.431915,1


# 2. Exploratory Data Analysis (EDA)

### 2.1.0 

- Shape of DataFrame
- Data Types

In [35]:
print("Kenya Training Shape:", kenya_train.shape)

kenya_train.dtypes

Kenya Training Shape: (998, 15)


lon          float64
lat          float64
blue_p50     float64
green_p50    float64
nir_p50      float64
nira_p50     float64
re1_p50      float64
re2_p50      float64
re3_p50      float64
red_p50      float64
swir1_p50    float64
swir2_p50    float64
VV_p50       float64
VH_p50       float64
TARGET         int64
dtype: object

### 2.1.1 Missing Values

In [36]:
kenya_missing_values = kenya_train.isnull().sum()
print("Kenya Training Data Missing Values:", kenya_missing_values)

Kenya Training Data Missing Values: lon          0
lat          0
blue_p50     0
green_p50    0
nir_p50      0
nira_p50     0
re1_p50      0
re2_p50      0
re3_p50      0
red_p50      0
swir1_p50    0
swir2_p50    0
VV_p50       0
VH_p50       0
TARGET       0
dtype: int64


### 2.2 Descriptive Statistics

In [37]:
display("Kenya Training Data Summary:\n", kenya_train.describe())

'Kenya Training Data Summary:\n'

Unnamed: 0,lon,lat,blue_p50,green_p50,nir_p50,nira_p50,re1_p50,re2_p50,re3_p50,red_p50,swir1_p50,swir2_p50,VV_p50,VH_p50,TARGET
count,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0
mean,37.252425,0.075933,2170.822645,2384.236473,4084.263527,4238.900301,2766.331663,3814.640782,4136.091683,2375.989479,3358.880261,2617.702405,-8.163459,-15.192528,1.513026
std,0.112068,0.034866,691.245929,651.399984,880.874587,886.329809,622.303905,770.130322,876.365757,687.701428,633.248322,702.969272,2.231359,2.224911,0.500081
min,37.026894,0.005794,1253.5,1381.0,2061.0,2139.5,1617.5,2083.5,2121.5,1243.0,1579.5,1347.5,-14.470079,-26.436514,1.0
25%,37.164022,0.047229,1502.25,1791.625,3455.5,3615.75,2252.0,3269.625,3522.625,1779.25,3004.625,2216.375,-9.328339,-16.148069,1.0
50%,37.260501,0.073258,2188.0,2355.0,3985.0,4138.75,2719.0,3739.75,4016.25,2408.0,3281.25,2451.5,-8.280302,-15.00821,2.0
75%,37.328369,0.099017,2633.875,2815.75,4676.0,4846.875,3153.625,4338.125,4763.75,2829.0,3577.5,2821.75,-7.269487,-13.982456,2.0
max,37.49806,0.172881,6130.0,6066.0,9248.0,9507.0,6839.0,8542.0,9235.0,6140.0,10014.0,10692.0,10.258422,-4.934835,2.0


In [38]:
TARGET = kenya_train['TARGET'].to_list()
TARGET

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


### 2.3 Data Visualization

In [None]:
# Histograms for each feature
for column in kenya_train.columns:
    if column != "TARGET":  # Exclude the target column
        plt.figure(figsize=(8, 4))
        sns.histplot(kenya_train[column], kde=True)
        plt.title(f"Distribution of {column}")
        plt.show()

In [None]:
# Box Plots to Identify Outliers
for column in kenya_train.columns:
    if column != "TARGET":  # Exclude the target column
        plt.figure(figsize=(8, 4))
        sns.boxplot(y=kenya_train[column])
        plt.title(f"Boxplot of {column}")
        plt.show()

### 2.4 Correlation Analysis

In [None]:
# Correlation Matrix
correlation_matrix = kenya_train.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix for Kenya Training Data")
plt.show()

In [None]:
# Scatter Plots
# Example: Scatter Plot of 'nir_p50' vs. 'TARGET'
plt.figure(figsize=(8, 4))
sns.scatterplot(x=kenya_train["nir_p50"], y=kenya_train["TARGET"])
plt.title("Scatter Plot of nir_p50 vs. TARGET")
plt.xlabel("nir_p50")
plt.ylabel("TARGET")
plt.show()

In [None]:
# Save checkpoint after loading and initial EDA
pickle.dump(kenya_train, open("../cleaned-data/kenya_training_cleaned.pkl", 'wb'))

log_progress('Explorative Data Analysis complete')

# 3. Feature Selection

Most influential columns for classification, based on EDA:

In [None]:
log_progress('Feature selection complete')

# 4. Data Cleaning (Optional)

Remove outliers or handle missing values

In [None]:
# Save checkpoint after cleaning
pickle.dump(kenya_train, open("../cleaned-data/kenya_training_cleaned.pkl", 'wb'))

log_progress('Data cleaning complete')

# 5. Save Cleaned Data

Save the cleaned Kenya training data to the cleaned-data folder

In [None]:

kenya_train.to_csv("../cleaned-data/kenya_training_cleaned.csv", index=False)

log_progress('Data saved to CSV file')