In [1]:
# importing relevant libraries
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# importing machine learning modules
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [3]:
# importing the data(train and test)
training_data = pd.read_csv('training_data.csv')
test_data = pd.read_csv('test_data.csv')

training_data = training_data.drop('Unnamed: 0', axis=1)
test_data = test_data.drop('Unnamed: 0', axis=1)

In [4]:
training_data.columns

Index(['amount_tsh', 'days_since_recorded', 'funder', 'gps_height',
       'installer', 'basin', 'subvillage', 'population', 'public_meeting',
       'scheme_management', 'permit', 'construction_year', 'extraction_type',
       'management_group', 'payment_type', 'water_quality', 'quantity_group',
       'source_type', 'source_class', 'waterpoint_type', 'status_group'],
      dtype='object')

In [5]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14381 entries, 0 to 14380
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             14381 non-null  float64
 1   days_since_recorded    14381 non-null  int64  
 2   funder                 14381 non-null  object 
 3   gps_height             14381 non-null  int64  
 4   installer              14381 non-null  object 
 5   basin                  14381 non-null  object 
 6   subvillage             14381 non-null  object 
 7   population             14381 non-null  int64  
 8   public_meeting         14381 non-null  bool   
 9   scheme_management      14381 non-null  object 
 10  permit                 13737 non-null  object 
 11  construction_year      14381 non-null  int64  
 12  extraction_type        14381 non-null  object 
 13  management_group       14381 non-null  object 
 14  payment_type           14381 non-null  object 
 15  wa

1. Pre-processing:
* Transform the categorical data 
* One-hot encode the categorical variables, such as funder, installer, basin, etc. to handle non-numeric data.
* Split the data into training and testing sets.

In [6]:
# Select the categorical columns you want to encode
categorical_columns = ["funder", "installer", "scheme_management",'management_group','payment_type','water_quality','permit','public_meeting']
# Use Pandas get_dummies method to create the OHE data
df_ohe = pd.get_dummies(training_data, columns=categorical_columns, prefix=categorical_columns, drop_first=True)


In [7]:
# Split the data into features (X) and target (y) variables
X = df_ohe.drop([ 'basin', 'subvillage',  'subvillage','quantity_group',
       'source_type', 'source_class', 'waterpoint_type','extraction_type','status_group'], axis=1)
y = df_ohe['status_group']


In [8]:
X

Unnamed: 0,amount_tsh,days_since_recorded,gps_height,population,construction_year,funder_Aar,funder_Abas Ka,funder_Abasia,funder_Abc-ihushi Development Cent,funder_Abd,...,payment_type_unknown,water_quality_fluoride,water_quality_fluoride abandoned,water_quality_milky,water_quality_salty,water_quality_salty abandoned,water_quality_soft,water_quality_unknown,permit_True,public_meeting_True
0,6000.0,995,1390,109,1999,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0.0,272,1399,280,2010,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
2,25.0,281,686,250,2009,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
3,0.0,309,263,58,1986,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
4,0.0,874,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59252,10.0,214,1210,125,1999,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
59253,4700.0,941,1212,56,1996,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
59254,0.0,967,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
59255,0.0,1001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1


In [9]:
# Normalize the feature data using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=0)



2. Logistic Regression:
* Choose logistic regression as the baseline model, since it's a simple and interpretable model.
* Fit the model on the training data and evaluate the performance on the testing data.
* Hyperparameters for logistic regression include the regularization term (e.g. L1 or L2) and the regularization strength (e.g. C).
* Grid search or random search can be used to find the best hyperparameters that result in the best performance on the testing data.

In [11]:
# Train the Logistic Regression model on the training data
logreg_train = LogisticRegression()
logreg_train.fit(X_train, y_train)

# Predict the target values for the training data
y_train_pred = logreg_train.predict(X_train)

# Evaluate the model accuracy using the training data
train_accuracy = logreg_train.score(X_train, y_train)
print("Training Accuracy:", train_accuracy)

# Train the Logistic Regression model on the test data
logreg_test = LogisticRegression()
logreg_test.fit(X_test, y_test)

# Predict the target values for the test data
y_test_pred = logreg_test.predict(X_test)

# Evaluate the model accuracy using the test data
test_accuracy = logreg_test.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy: 0.7017825123932074


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test Accuracy: 0.7176847789402633


3. Decision Trees:
* Since logistic regression may not capture complex relationships between features and the target variable, try a decision tree model.
* Choose the optimal hyperparameters using grid search or random search.
* Evaluate the performance on the testing data.
* Hyperparameters for decision trees include the maximum depth of the tree and the minimum number of samples required to split a node.


In [12]:
# Initialize the decision tree classifier
clf = DecisionTreeClassifier(random_state = 0)

# Train the model
clf.fit(X_train, y_train)

# Predict the target values for the training data
y_train_pred = logreg_train.predict(X_train)

# Evaluate the model accuracy using the training data
train_accuracy = logreg_train.score(X_train, y_train)
print("Training Accuracy:", train_accuracy)


# Make predictions on the test set
y_test_pred = clf.predict(X_test)

# Evaluate the model's test accuracy
acc = accuracy_score(y_test, y_test_pred)
print('Test Accuracy:', acc)

Training Accuracy: 0.7017825123932074
Test Accuracy: 0.7011474856564293


In [13]:
# Define the hyperparameters to tune and the values to try
param_grid = {'max_depth': [2, 4, 6, 8, 10], 'min_samples_split': [2, 4, 6, 8, 10]}

# Create a grid search object with a 5-fold cross-validation
grid_search = GridSearchCV(clf, param_grid, cv=5)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)



Best parameters: {'max_depth': 10, 'min_samples_split': 10}
Best score: 0.6734310726716591


4. Random Forest:
* Random Forest is an extension of decision trees, where multiple trees are built and combined to make a prediction.
* Choose the optimal hyperparameters using grid search or random search.
* Evaluate the performance on the testing data.
* Hyperparameters for random forests include the number of trees in the forest, the maximum depth of each tree, and the minimum number of samples required to split a node.


In [14]:
# Create the Random Forest classifier
clf = RandomForestClassifier(random_state=0)

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Predict the target variable on the train data
y_train_pred = clf.predict(X_train)

# Evaluate the accuracy of the classifier on the train data
accuracy = clf.score(X_train,y_train_pred)
print("Training ccuracy:", accuracy)


# Predict the target variable on the test data
y_test_pred = clf.predict(X_test)

# Evaluate the accuracy of the classifier on the test data
accuracy = clf.score(X_test, y_test_pred)
print("Testing ccuracy:", accuracy)

Training ccuracy: 1.0
Testing ccuracy: 1.0


> This data is overfiting

In [None]:
# Define the hyperparameter grid to search
param_grid = {
    "n_estimators": [10, 50],
    "max_depth": [None, 5],
    "min_samples_split": [2, 10],
    "min_samples_leaf": [1, 2]
}

# Create the Random Forest classifier
clf = RandomForestClassifier(random_state=0)

# Create the Grid Search object
grid_search = GridSearchCV(clf, param_grid, cv=3, scoring="accuracy")

# Fit the Grid Search to the training data
grid_search.fit(X_train, y_train)

# Predict the target variable on the test data
y_pred = grid_search.predict(X_test)

# Print the best hyperparameters found by Grid Search
print("Best hyperparameters:", grid_search.best_params_)

# Evaluate the accuracy of the classifier on the test data
accuracy = grid_search.score(X_test, y_test)
print("Accuracy:", accuracy)


5. Model Comparison:
* Compare the performance of the logistic regression, decision tree, and random forest models to choose the best one.
* Evaluate the performance using metrics such as accuracy, precision, recall, F1-score, AUC-ROC, etc.
