In [43]:
#  1. Import Libraries
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import joblib

In [2]:
#  2. Load Data
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

In [3]:
#  Save test IDs for submission
test_ids = test_data['ID']

In [4]:
#  3. Preprocessing
# Drop ID columns
train_data = train_data.drop('ID', axis=1)
test_data = test_data.drop('ID', axis=1)


In [5]:
#Exploratory Data Analysis (EDA)
# Quick overview
print(train_data.shape)
print(train_data.info())
print(train_data.describe())

# Visualize missing values
plt.figure(figsize=(10,6))
sns.heatmap(train_data.isnull(), cbar=False, cmap='Reds')
plt.title("Missing Values Heatmap")
plt.show()


(21454, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21454 entries, 0 to 21453
Data columns (total 18 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   country                                  21454 non-null  object 
 1   year                                     21454 non-null  int64  
 2   urban_or_rural                           21454 non-null  object 
 3   ghsl_water_surface                       21454 non-null  float64
 4   ghsl_built_pre_1975                      21454 non-null  float64
 5   ghsl_built_1975_to_1990                  21454 non-null  float64
 6   ghsl_built_1990_to_2000                  21454 non-null  float64
 7   ghsl_built_2000_to_2014                  21454 non-null  float64
 8   ghsl_not_built_up                        21454 non-null  float64
 9   ghsl_pop_density                         21454 non-null  float64
 10  landcover_crops_fraction          

  plt.show()


In [6]:
# Target class distribution
sns.countplot(x='Target', data=train_data)
plt.title("Target Class Distribution")
plt.show()

  plt.show()


In [10]:
# Drop 'ID' column safely
train_data.drop(columns='ID', axis=1, inplace=True, errors='ignore')
test_data.drop(columns='ID', axis=1, inplace=True, errors='ignore')

In [13]:
# Safe drop: no need to specify axis when using 'columns' parameter
train_data.drop(columns=['ID'], inplace=True, errors='ignore')
test_data.drop(columns=['ID'], inplace=True, errors='ignore')

In [15]:
# Drop 'ID' if it exists
for col in ['ID']:
    if col in train_data.columns:
        train_data.drop(columns=[col], inplace=True)
    if col in test_data.columns:
        test_data.drop(columns=[col], inplace=True)

In [18]:
print("Train columns:", train_data.columns.tolist())
print("Test columns:", test_data.columns.tolist())

Train columns: ['country', 'year', 'urban_or_rural', 'ghsl_water_surface', 'ghsl_built_pre_1975', 'ghsl_built_1975_to_1990', 'ghsl_built_1990_to_2000', 'ghsl_built_2000_to_2014', 'ghsl_not_built_up', 'ghsl_pop_density', 'landcover_crops_fraction', 'landcover_urban_fraction', 'landcover_water_permanent_10km_fraction', 'landcover_water_seasonal_10km_fraction', 'nighttime_lights', 'dist_to_capital', 'dist_to_shoreline', 'Target']
Test columns: ['country', 'year', 'urban_or_rural', 'ghsl_water_surface', 'ghsl_built_pre_1975', 'ghsl_built_1975_to_1990', 'ghsl_built_1990_to_2000', 'ghsl_built_2000_to_2014', 'ghsl_not_built_up', 'ghsl_pop_density', 'landcover_crops_fraction', 'landcover_urban_fraction', 'landcover_water_permanent_10km_fraction', 'landcover_water_seasonal_10km_fraction', 'nighttime_lights', 'dist_to_capital', 'dist_to_shoreline']


In [19]:
# Safely drop 'ID' column if it exists
for df in [train_data, test_data]:
    if 'ID' in df.columns:
        df.drop(columns=['ID'], inplace=True)

In [23]:
for df in [train_data, test_data]:
    df.drop(columns=[col for col in df.columns if col == 'ID'], inplace=True)

In [25]:
#4. Data Preprocessing


# Convert boolean columns to integers
bool_cols = train_data.select_dtypes('bool').columns
train_data[bool_cols] = train_data[bool_cols].astype(int)
test_data[bool_cols] = test_data[bool_cols].astype(int)

# One-hot encode categorical variables
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

# Align test data with training features
X = train_data.drop('Target', axis=1)
y = (train_data['Target'] > 0.5).astype(int)  # Convert to binary

test_data = test_data.reindex(columns=X.columns, fill_value=0)


In [26]:
#  5. Train/Test Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
# 6. Model Training – Logistic Regression
log_model = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    random_state=42
)
log_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [28]:
# 7. Evaluation
y_pred = log_model.predict(X_val)

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))

Accuracy: 0.8625029130738755
Confusion Matrix:
 [[2808  409]
 [ 181  893]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.87      0.90      3217
           1       0.69      0.83      0.75      1074

    accuracy                           0.86      4291
   macro avg       0.81      0.85      0.83      4291
weighted avg       0.88      0.86      0.87      4291



In [30]:
# 8. Feature Importance (Logistic Coefficients)
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': log_model.coef_[0],
    'Abs_Coefficient': np.abs(log_model.coef_[0])
}).sort_values(by='Abs_Coefficient', ascending=False)

# Plot top 15 features
plt.figure(figsize=(10,6))
sns.barplot(x='Abs_Coefficient', y='Feature', data=coef_df.head(15), palette='viridis')
plt.title("Top 15 Most Important Features")
plt.tight_layout()
plt.show()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Abs_Coefficient', y='Feature', data=coef_df.head(15), palette='viridis')
  plt.show()


In [35]:
# 9. Grid Search – Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1
)

grid_search.fit(X_train, y_train)
best_log_model = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy (CV):", grid_search.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits




Best Parameters: {'C': 10, 'class_weight': None, 'solver': 'liblinear'}
Best Accuracy (CV): 0.8839947628834899


In [38]:
#10. Random Forest + Grid Search (Using Recall)
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced', 'balanced_subsample']
}

rf_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    scoring='recall',
    cv=3,
    verbose=2,
    n_jobs=-1
)

rf_search.fit(X_train, y_train)
best_rf = rf_search.best_estimator_

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [40]:
#11. Evaluate Best Random Forest
val_preds = best_rf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print("Confusion Matrix:\n", confusion_matrix(y_val, val_preds))
print("Classification Report:\n", classification_report(y_val, val_preds))

Validation Accuracy: 0.8848753204381263
Confusion Matrix:
 [[2863  354]
 [ 140  934]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.89      0.92      3217
           1       0.73      0.87      0.79      1074

    accuracy                           0.88      4291
   macro avg       0.84      0.88      0.86      4291
weighted avg       0.90      0.88      0.89      4291



In [44]:
# 12. Predict on Test Data + Save Submission
final_test_preds = best_rf.predict(test_data)

submission = pd.DataFrame({
    'ID': test_ids,
    'Target': final_test_preds
})

submission.to_csv('submission.csv', index=False)
print("✅ Submission file created: submission.csv")

✅ Submission file created: submission.csv


In [45]:
# 13. Save Model + Column References
joblib.dump(best_rf, 'best_random_forest_model.pkl', compress=3)
joblib.dump(X.columns.tolist(), 'training_columns.pkl')

['training_columns.pkl']

In [46]:
# Quick overview
print(train_data.shape)
print(train_data.info())
print(train_data.describe())

# Visualize missing values
plt.figure(figsize=(10,6))
sns.heatmap(train_data.isnull(), cbar=False, cmap='Reds')
plt.title("Missing Values Heatmap")
plt.show()

# Target class distribution
sns.countplot(x='Target', data=train_data)
plt.title("Target Class Distribution")
plt.show()

# Correlation matrix
corr_matrix = train_data.corr()
plt.figure(figsize=(12,8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

(21454, 36)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21454 entries, 0 to 21453
Data columns (total 36 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   year                                     21454 non-null  int64  
 1   ghsl_water_surface                       21454 non-null  float64
 2   ghsl_built_pre_1975                      21454 non-null  float64
 3   ghsl_built_1975_to_1990                  21454 non-null  float64
 4   ghsl_built_1990_to_2000                  21454 non-null  float64
 5   ghsl_built_2000_to_2014                  21454 non-null  float64
 6   ghsl_not_built_up                        21454 non-null  float64
 7   ghsl_pop_density                         21454 non-null  float64
 8   landcover_crops_fraction                 21454 non-null  float64
 9   landcover_urban_fraction                 21454 non-null  float64
 10  landcover_water_permanent_10km_fra

  plt.show()
  plt.show()
  plt.show()


In [47]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

# Set up GridSearch
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1
)

# Run the search
grid_search.fit(X_train, y_train)

# Best results
print("Best parameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits




Best parameters: {'C': 10, 'class_weight': None, 'solver': 'liblinear'}
Best CV accuracy: 0.8839947628834899
