In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

# Load each CSV file
submission_format = pd.read_csv("SubmissionFormat.csv")
testset_values = pd.read_csv("testsetvalues.csv")
trainingset_labels = pd.read_csv("trainingsetlabels.csv")
trainingset_values = pd.read_csv("trainingsetvalues.csv")

# Merge the training values and labels based on the 'id'
train_data = pd.merge(trainingset_values, trainingset_labels, on='id')

# Display the first few rows of the combined training data
train_data.head()


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [15]:
# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
train_data_imputed = pd.DataFrame(imputer.fit_transform(train_data), columns=train_data.columns)

# Encode categorical variables
label_encoders = {}
for column in train_data_imputed.select_dtypes(include=['object']).columns:
    if column != 'status_group':  # Exclude the target variable
        label_encoders[column] = LabelEncoder()
        train_data_imputed[column] = label_encoders[column].fit_transform(train_data_imputed[column])

# Encode target variable
target_encoder = LabelEncoder()
train_data_imputed['status_group'] = target_encoder.fit_transform(train_data_imputed['status_group'])

# Drop irrelevant columns (e.g., 'id', 'wpt_name')
X = train_data_imputed.drop(['id', 'wpt_name', 'status_group'], axis=1)
y = train_data_imputed['status_group']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [16]:
# Initialize a Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model on the training data
rf_model.fit(X_train_scaled, y_train)

# Predict on the validation set
y_pred = rf_model.predict(X_val_scaled)


In [17]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred, target_names=target_encoder.classes_))


Accuracy: 0.8106060606060606

Confusion Matrix:
 [[5753  174  525]
 [ 445  290  128]
 [ 904   74 3587]]

Classification Report:
                          precision    recall  f1-score   support

             functional       0.81      0.89      0.85      6452
functional needs repair       0.54      0.34      0.41       863
         non functional       0.85      0.79      0.81      4565

               accuracy                           0.81     11880
              macro avg       0.73      0.67      0.69     11880
           weighted avg       0.80      0.81      0.80     11880



In [None]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Predict using the best model
y_pred_best = best_model.predict(X_val_scaled)

# Evaluate the tuned model
print("\nTuned Model Accuracy:", accuracy_score(y_val, y_pred_best))
print("\nTuned Model Confusion Matrix:\n", confusion_matrix(y_val, y_pred_best))
print("\nTuned Model Classification Report:\n", classification_report(y_val, y_pred_best, target_names=target_encoder.classes_))


Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
# Preprocess the test set
testset_values_imputed = pd.DataFrame(imputer.transform(testset_values), columns=testset_values.columns)

# Encode categorical variables
for column in testset_values_imputed.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        testset_values_imputed[column] = label_encoders[column].transform(testset_values_imputed[column])

# Feature scaling
X_test = testset_values_imputed.drop(['id', 'wpt_name'], axis=1)
X_test_scaled = scaler.transform(X_test)

# Make predictions
test_predictions = best_model.predict(X_test_scaled)

# Convert predictions back to original labels
test_predictions_labels = target_encoder.inverse_transform(test_predictions)

# Prepare the submission file
submission_format['status_group'] = test_predictions_labels
submission_format.to_csv("submission.csv", index=False)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred_best)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=target_encoder.classes_, yticklabels=target_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()


In [None]:
# Visualize the distribution of the predicted classes
unique, counts = np.unique(y_pred_best, return_counts=True)
predicted_class_distribution = dict(zip(target_encoder.inverse_transform(unique), counts))

plt.figure(figsize=(8, 6))
sns.barplot(x=list(predicted_class_distribution.keys()), y=list(predicted_class_distribution.values()), palette='viridis')
plt.title('Predicted Class Distribution')
plt.xlabel('Classes')
plt.ylabel('Number of Predictions')
plt.show()


In [None]:

import numpy as np

# Compute the predicted class distribution
unique, counts = np.unique(y_pred_best, return_counts=True)
predicted_class_distribution = dict(zip(target_encoder.inverse_transform(unique), counts))

# Output the predicted class distribution in text format
for class_label, count in predicted_class_distribution.items():
    print(f"Class '{class_label}': {count} predictions")

