In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset
file_path = '/content/wbg_merged_cleaned_for_analysis.csv'
df = pd.read_csv(file_path)

# Clean the data by removing irrelevant columns
df_cleaned = df.drop(columns=['project_id_merged', 'countryname', 'boardapprovaldate', 'country_economy_fcs_status'], errors='ignore')

# Handle missing values separately for categorical and numerical columns
# Categorical columns: fill with the mode (most frequent value)
categorical_columns = df_cleaned.select_dtypes(include=['object']).columns
df_cleaned[categorical_columns] = df_cleaned[categorical_columns].fillna(df_cleaned[categorical_columns].mode().iloc[0])

# Numerical columns: fill with the median
numerical_columns = df_cleaned.select_dtypes(include=['number']).columns
df_cleaned[numerical_columns] = df_cleaned[numerical_columns].fillna(df_cleaned[numerical_columns].median())

# Define the target (outcome_binary) and features
df_cleaned['outcome_binary'] = df_cleaned['outcome'].apply(lambda x: 1 if x in ['Satisfactory', 'Moderately Satisfactory'] else 0)  # Binary outcome
X = df_cleaned.drop(columns=['outcome', 'outcome_binary'])  # Features
y = df_cleaned['outcome_binary']  # Target variable

# One-hot encode categorical variables
X = pd.get_dummies(X)

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data (important for models sensitive to feature scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_rf = rf.predict(X_test_scaled)

# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred_rf)
class_report = classification_report(y_test, y_pred_rf)
accuracy = accuracy_score(y_test, y_pred_rf)

print("Random Forest Classifier")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
print(f"Accuracy Score: {accuracy:.4f}")


Random Forest Classifier
Confusion Matrix:
[[171 160]
 [131 771]]

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.52      0.54       331
           1       0.83      0.85      0.84       902

    accuracy                           0.76      1233
   macro avg       0.70      0.69      0.69      1233
weighted avg       0.76      0.76      0.76      1233

Accuracy Score: 0.7640


In [6]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
class_report_log_reg = classification_report(y_test, y_pred_log_reg)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)

print("Logistic Regression")
print("Confusion Matrix:\n", conf_matrix_log_reg)
print("\nClassification Report:\n", class_report_log_reg)
print(f"Accuracy Score: {accuracy_log_reg:.4f}\n")

Logistic Regression
Confusion Matrix:
 [[178 153]
 [113 789]]

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.54      0.57       331
           1       0.84      0.87      0.86       902

    accuracy                           0.78      1233
   macro avg       0.72      0.71      0.71      1233
weighted avg       0.78      0.78      0.78      1233

Accuracy Score: 0.7843



In [9]:
#XGBoost
import xgboost as xgb
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train_scaled, y_train)
y_pred_xgb = xgb_model.predict(X_test_scaled)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
class_report_xgb = classification_report(y_test, y_pred_xgb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(" XGBoost Classifier")
print("Confusion Matrix:\n", conf_matrix_xgb)
print("\nClassification Report:\n", class_report_xgb)
print(f"Accuracy Score: {accuracy_xgb:.4f}\n")

 XGBoost Classifier
Confusion Matrix:
 [[161 170]
 [108 794]]

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.49      0.54       331
           1       0.82      0.88      0.85       902

    accuracy                           0.77      1233
   macro avg       0.71      0.68      0.69      1233
weighted avg       0.76      0.77      0.77      1233

Accuracy Score: 0.7745



In [10]:
#Support Vector Machine (SVM)
from sklearn.svm import SVC
svm = SVC(kernel='linear', probability=True, random_state=42)
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Support Vector Machine (SVM)")
print("Confusion Matrix:\n", conf_matrix_svm)
print("\nClassification Report:\n", class_report_svm)
print(f"Accuracy Score: {accuracy_svm:.4f}")

Support Vector Machine (SVM)
Confusion Matrix:
 [[211 120]
 [145 757]]

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.64      0.61       331
           1       0.86      0.84      0.85       902

    accuracy                           0.79      1233
   macro avg       0.73      0.74      0.73      1233
weighted avg       0.79      0.79      0.79      1233

Accuracy Score: 0.7851
