In [None]:
#  1. Import Libraries
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# 2. Load data
# Read the training and test CSV files (paths are relative to the working
# directory the notebook is run from) into two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv')
)

In [3]:
# Keep the test-set ID column aside: it is dropped from the feature frame in
# the next step but is needed again when the submission file is assembled.
test_ids = test_data.loc[:, 'ID']

In [4]:
# 3. Preprocessing
# Remove the ID column from both frames so that it is not passed to the model
# as an input feature.
train_data = train_data.drop(columns='ID')
test_data = test_data.drop(columns='ID')


In [5]:
# Cast boolean columns to integers (True -> 1, False -> 0).
# The list of boolean columns is taken from the training frame and the same
# list is then applied to the test frame, so both follow one schema.
bool_cols = train_data.select_dtypes(include='bool').columns
for frame in (train_data, test_data):
    frame[bool_cols] = frame[bool_cols].astype(int)

In [6]:
# One-hot encode the categorical columns of each frame.
# The two frames are encoded independently, so their dummy columns may differ
# at this point; the alignment step that follows reconciles them.
train_data, test_data = map(pd.get_dummies, (train_data, test_data))

In [7]:
# Give the test frame exactly the training feature columns, in the same order.
# Columns the test frame lacks are created and filled with 0; columns that do
# not appear among the training features are discarded by the reindex.
feature_columns = train_data.columns.drop('Target')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)


In [8]:
# 4. Feature/target split
# The target is binarised: 1 where 'Target' is strictly greater than 0.5,
# 0 otherwise. Every remaining column is used as a feature.
y = train_data['Target'].gt(0.5).astype(int)
X = train_data.drop(columns='Target')


In [9]:
# 5. Train/validation split
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# shuffle reproducible across runs.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)


In [10]:
# 6. Model training
# Baseline: L2-regularised logistic regression. `class_weight='balanced'`
# reweights the two classes inversely to their frequency in y_train.
logreg_params = {
    'penalty': 'l2',
    'C': 1.0,
    'solver': 'liblinear',
    'class_weight': 'balanced',
    'max_iter': 1000,
    'random_state': 42,
}
model = LogisticRegression(**logreg_params)

# Left as a bare expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [11]:
# 7. Model evaluation
# Score the fitted logistic regression on the held-out validation split.
y_pred = model.predict(X_val)

# Each entry pairs the printed label with the metric function that fills it.
validation_metrics = (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
)
for label, metric_fn in validation_metrics:
    print(label, metric_fn(y_val, y_pred))

Accuracy: 0.8625029130738755
Confusion Matrix:
 [[2808  409]
 [ 181  893]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.87      0.90      3217
           1       0.69      0.83      0.75      1074

    accuracy                           0.86      4291
   macro avg       0.81      0.85      0.83      4291
weighted avg       0.88      0.86      0.87      4291



In [12]:
# 8. Predict on test data
# Label the column-aligned test features with the baseline logistic regression.
test_preds = model.predict(X=test_data)

In [13]:
# 9. Prepare submission file
# Pair every saved test ID with its predicted label, then write the table
# without the DataFrame index so the CSV holds only the two columns.
submission = pd.DataFrame({'ID': test_ids}).assign(Target=test_preds)
submission.to_csv('submission.csv', index=False)
print("✅ Submission file created: submission.csv")

✅ Submission file created: submission.csv


In [14]:
# Coefficients of the fitted logistic regression, one per feature column.
coefficients = model.coef_[0]
feature_names = X.columns

# Rank features by coefficient magnitude, largest first.
# NOTE(review): no feature scaling is visible before the fit, so magnitudes of
# features on different scales may not be directly comparable - confirm.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .assign(Abs_Coefficient=lambda df: df['Coefficient'].abs())
    .sort_values(by='Abs_Coefficient', ascending=False)
)

# Show the 15 features with the largest absolute coefficients.
importance_df.head(15)

Unnamed: 0,Feature,Coefficient,Abs_Coefficient
16,country_Central African Republic,-2.321384,2.321384
2,ghsl_built_pre_1975,-2.208836,2.208836
29,country_Sierra Leone,-2.152057,2.152057
26,country_Nigeria,1.570075,1.570075
33,urban_or_rural_R,-1.559742,1.559742
30,country_Swaziland,1.419974,1.419974
19,country_Ghana,1.32529,1.32529
20,country_Guinea,-1.16211,1.16211
17,country_Cote d'Ivoire,1.119371,1.119371
34,urban_or_rural_U,1.044011,1.044011


In [15]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot top 15 features
# Horizontal bars of the 15 largest absolute logistic-regression coefficients.
# `hue` is set to the same variable as `y` because seaborn (>= 0.13) deprecates
# passing `palette` without `hue` and will drop it in v0.14.0; `legend=False`
# suppresses the legend, which would only repeat the y-axis labels.
top_features = importance_df.head(15)

plt.figure(figsize=(10, 6))
sns.barplot(
    x='Abs_Coefficient',
    y='Feature',
    hue='Feature',
    data=top_features,
    palette='viridis',
    legend=False
)
plt.title('Top 15 Most Important Features (Logistic Regression)')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
  plt.show()


In [16]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the logistic regression search:
# 4 values of C x 2 solvers x 2 class-weight settings = 16 candidates.
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    solver=['liblinear', 'saga'],
    class_weight=[None, 'balanced'],
)

# Exhaustive search over `param_grid`, scored by accuracy with 5-fold
# cross-validation on the training split only.
base_estimator = LogisticRegression(max_iter=1000, random_state=42)
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

# Fit every candidate on every fold.
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best parameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits




Best parameters: {'C': 10, 'class_weight': None, 'solver': 'liblinear'}
Best CV accuracy: 0.8839947628834899


In [17]:
# Take the best estimator found by the grid search and score it on the
# validation split, which the search itself never saw.
best_model = grid_search.best_estimator_
val_preds = best_model.predict(X_val)

# Each entry pairs the printed label with the metric function that fills it.
tuned_metrics = (
    ("Validation Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
)
for label, metric_fn in tuned_metrics:
    print(label, metric_fn(y_val, val_preds))

Validation Accuracy: 0.8750873922162666
Confusion Matrix:
 [[3019  198]
 [ 338  736]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92      3217
           1       0.79      0.69      0.73      1074

    accuracy                           0.88      4291
   macro avg       0.84      0.81      0.83      4291
weighted avg       0.87      0.88      0.87      4291



In [18]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest, trained and validated on the same split as the
# logistic regression above.
rf_params = dict(
    n_estimators=100,         # number of trees in the forest
    max_depth=None,           # no limit on tree depth
    random_state=42,
    class_weight='balanced',  # reweight classes inversely to their frequency
)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Predict on the validation split and report the same three metrics.
rf_preds = rf_model.predict(X_val)

rf_metrics = (
    ("Random Forest Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
)
for label, metric_fn in rf_metrics:
    print(label, metric_fn(y_val, rf_preds))

Random Forest Accuracy: 0.8965276159403403
Confusion Matrix:
 [[3042  175]
 [ 269  805]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.95      0.93      3217
           1       0.82      0.75      0.78      1074

    accuracy                           0.90      4291
   macro avg       0.87      0.85      0.86      4291
weighted avg       0.89      0.90      0.89      4291



In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest search:
# 2 x 3 x 2 x 2 x 2 = 48 candidate combinations.
# Note: this rebinds `param_grid`, replacing the logistic regression grid.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    class_weight=['balanced', 'balanced_subsample'],
)

In [20]:
# Tune the random forest for recall on the positive class (label 1) rather
# than overall accuracy.
# Note: this rebinds `grid_search`, replacing the logistic regression search.
rf = RandomForestClassifier(random_state=42)

search_options = dict(
    scoring='recall',  # recall of class 1 for the binary target
    cv=3,
    verbose=2,
    n_jobs=-1,         # run the fits in parallel on all available cores
)
grid_search = GridSearchCV(rf, param_grid, **search_options)

# Left as a bare expression so the notebook displays the fitted search object.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'class_weight': ['balanced', 'balanced_subsample'], 'max_depth': [None, 10, ...], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 5], ...}"
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
from sklearn.metrics import classification_report

# Summarise the search: the winning hyper-parameters and their mean
# cross-validated recall.
print("Best Parameters:", grid_search.best_params_)
print("Best Recall Score (cv):", grid_search.best_score_)

# Score the refitted best forest on the held-out validation split.
best_rf = grid_search.best_estimator_
val_preds = best_rf.predict(X_val)
validation_report = classification_report(y_val, val_preds)
print("Validation Report:\n", validation_report)

Best Parameters: {'class_weight': 'balanced_subsample', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best Recall Score (cv): 0.8706866816663528
Validation Report:
               precision    recall  f1-score   support

           0       0.95      0.89      0.92      3217
           1       0.73      0.87      0.79      1074

    accuracy                           0.88      4291
   macro avg       0.84      0.88      0.86      4291
weighted avg       0.90      0.88      0.89      4291



In [22]:
# Label the column-aligned test features with the tuned random forest.
final_test_preds = best_rf.predict(X=test_data)

In [23]:
# Pair every saved test ID with the tuned random forest's prediction.
submission = pd.DataFrame({'ID': test_ids}).assign(Target=final_test_preds)

# Write the table without the index. This uses the same file name as the
# earlier logistic-regression submission, so that file is overwritten.
submission.to_csv('submission.csv', index=False)
print("✅ Submission file created: submission.csv")

✅ Submission file created: submission.csv


In [24]:
import joblib
# Persist the tuned forest, plus the ordered list of training feature names
# so that new data can be reindexed to the same columns before predicting.
training_columns = X_train.columns.to_list()
joblib.dump(value=best_rf, filename='best_random_forest_model.pkl')
joblib.dump(value=training_columns, filename='training_columns.pkl')

['training_columns.pkl']