- Import your data and perform a basic data exploration phase
- Display general information about the dataset
- Create a pandas profiling report to gain insights into the dataset
- Handle Missing and corrupted values
- Remove duplicates, if they exist
- Handle outliers, if they exist
- Encode categorical features
- Select your target variable and the features
- Split your dataset into training and test sets
- Based on your data exploration phase, select an ML classification algorithm and train it on the training set
- Assess your model performance on the test set using relevant evaluation metrics
- Discuss with your cohort alternative ways to improve your model performance

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
#import linear algebra and data manipulation libraries
import numpy as np
import pandas as pd

#import standard visualization
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import xgboost

from sklearn.model_selection import train_test_split #split
from sklearn.metrics import accuracy_score #metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#tools for hyperparameters search
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# import the label Encoder library 
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("African_crises_dataset - African_crises_dataset.csv")

In [None]:
# Show the freshly loaded DataFrame as the cell output.
df

In [None]:
# Print the per-column summary (non-null counts and dtypes).
df.info()

In [None]:
# Encode the target column as integer class codes.
# LabelEncoder numbers the classes in sorted order; `le.classes_` maps each
# code back to its original label.
le = LabelEncoder()
df["banking_crisis"] = le.fit(df["banking_crisis"]).transform(df["banking_crisis"])

In [None]:
# Names of the columns still stored with the object dtype.
categorical_features = df.select_dtypes(include=['object']).columns
categorical_features

In [None]:
# Number of rows per distinct country_code value.
df["country_code"].value_counts()

In [None]:
# Number of rows per distinct country value.
df["country"].value_counts()

In [None]:
%matplotlib inline
# Draw one count plot per categorical column on a single shared figure.
n_cols = 4
# Keep the original 4x4 grid, but add rows when there are more than 16 columns
# so plt.subplot never receives an index outside the grid.
n_rows = max(4, (len(categorical_features) + n_cols - 1) // n_cols)

plt.figure(figsize=(25, 25))
for position, feature in enumerate(categorical_features, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.countplot(x=df[feature], palette='viridis')
    plt.title(feature, fontsize=30)
    plt.xlabel(' ')
    # Category labels are drawn vertically.
    plt.xticks(rotation=90)

# The layout only has to be computed once, after every subplot exists.
plt.tight_layout()

In [None]:
# Names of all columns with a numeric dtype.
numerical_features = df.select_dtypes(include=['number']).columns
numerical_features

In [None]:
# How many numeric columns were found.
len(numerical_features)

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()

In [None]:
# Draw one box plot per numeric column on a single shared figure.
n_cols = 3
# Keep the original 4x3 grid, but add rows when there are more than 12 numeric
# columns; a fixed grid would make plt.subplot raise ValueError in that case.
n_rows = max(4, (len(numerical_features) + n_cols - 1) // n_cols)

plt.figure(figsize=(25, 25))
for position, feature in enumerate(numerical_features, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.boxplot(x=df[feature], palette='viridis')
    plt.title(feature, fontsize=30)
    plt.xlabel(' ')

# The layout only has to be computed once, after every subplot exists.
plt.tight_layout()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
correlation_matrix = df[numerical_features].corr()

plt.figure(figsize=(30, 20))
sns.heatmap(
    correlation_matrix,
    annot=True,   # write the coefficient inside every cell
    fmt='.2f',    # two decimal places
    cmap='coolwarm',
)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Rank the numeric columns by the strength of their correlation with the
# target and collect the weakly correlated ones in `low_corr_columns`.
numerical_features = df.select_dtypes(include='number').columns
correlation = df[numerical_features].corr().abs()
correlation = correlation[['banking_crisis']].sort_values(['banking_crisis'], ascending=False)

# Columns whose absolute correlation with banking_crisis is below this value
# count as low-correlation columns.
threshold = 0.2

# Names of the columns below the threshold. A NaN correlation compares as
# False here, so such a column is not included in the list.
low_corr_columns = correlation[correlation['banking_crisis'] < threshold].index.tolist()

# Print the list of columns with low correlation
print(f"Columns with correlation less than {threshold}:\n", low_corr_columns)

In [None]:
# Remove the weakly correlated columns collected in low_corr_columns.
df = df.drop(low_corr_columns, axis=1)


In [None]:
# Show the DataFrame after the low-correlation columns were dropped.
df

In [None]:
# country_code is removed; the country column is still present at this point.
df = df.drop(columns=["country_code"])

In [None]:
# One-hot encode the country column and append the indicator columns to df
# (joined on the row index).
df = df.join(pd.get_dummies(df["country"]))

In [None]:
# Remove the original country column.
df = df.drop(columns=["country"])

In [None]:
# Show the DataFrame after the country columns were encoded and dropped.
df

In [None]:
# Target vector: the banking_crisis column.
y = df['banking_crisis']
# Feature matrix: every remaining column.
X = df.drop('banking_crisis', axis=1)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training rows only.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Selecting Algorithm

### 1. XGBoost Classifier

In [None]:
import xgboost

# Gradient-boosted trees, trained on the unscaled feature matrix.
xgb = xgboost.XGBClassifier(
    n_estimators=100,
    learning_rate=0.08,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)
xgb.fit(X_train, y_train.squeeze().values)

# Score the fitted model on the held-out test rows.
y_pred_xgb = xgb.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
class_report_xgb = classification_report(y_test, y_pred_xgb)

print("Extreme Gradient Boost Classifier:")
print("Accuracy: {:.4f}".format(accuracy_xgb))
print("Confusion Matrix:", conf_matrix_xgb, sep="\n")
print("Classification Report:", class_report_xgb, sep="\n")

### 2. Support Vector Classifier (SVC)

In [None]:
# Standardise the features with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF-kernel support vector classifier trained on the scaled features.
svc_class = SVC(kernel='rbf')
svc_class.fit(X_train_scaled, y_train)

# Score the fitted model on the held-out test rows.
y_pred_svc = svc_class.predict(X_test_scaled)
accuracy_svc = accuracy_score(y_test, y_pred_svc)
conf_matrix_svc = confusion_matrix(y_test, y_pred_svc)
class_report_svc = classification_report(y_test, y_pred_svc)

print("Support Vector Classification:")
print("Accuracy: {:.4f}".format(accuracy_svc))
print("Confusion Matrix:", conf_matrix_svc, sep="\n")
print("Classification Report:", class_report_svc, sep="\n")

### 3. Decision Tree Classifier

In [None]:
# Standardise the features with statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seed the tree so repeated runs of this cell report the same metrics; the
# value matches the seed used for the train/test split.
tree_clf = DecisionTreeClassifier(random_state=42)

tree_clf.fit(X_train_scaled, y_train)

# Score the fitted model on the held-out test rows.
y_pred_tree = tree_clf.predict(X_test_scaled)
accuracy_tree = accuracy_score(y_test, y_pred_tree)
conf_matrix_tree = confusion_matrix(y_test, y_pred_tree)
class_report_tree = classification_report(y_test, y_pred_tree)

print("Decision Tree Classifier:")
print(f"Accuracy: {accuracy_tree:.4f}")
print("Confusion Matrix:")
print(conf_matrix_tree)
print("Classification Report:")
print(class_report_tree)

### 4. Logistic Regression

In [None]:
# Logistic regression on the scaled features. X_train_scaled / X_test_scaled
# are not created in this cell; they come from an earlier scaling cell.
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

# Predictions for the test rows (scored below) and for the training rows.
y_pred_log = log_reg.predict(X_test_scaled)
y_train_pred_log = log_reg.predict(X_train_scaled)

accuracy_log = accuracy_score(y_test, y_pred_log)
conf_matrix_log = confusion_matrix(y_test, y_pred_log)
class_report_log = classification_report(y_test, y_pred_log)

# The printed label previously read "Logiistic Regression:".
print("Logistic Regression:")
print(f"Accuracy: {accuracy_log:.4f}")
print("Confusion Matrix:")
print(conf_matrix_log)
print("Classification Report:")
print(class_report_log)

### 5. KNeighbors Classifier

In [None]:
# k-nearest-neighbours with the library defaults, trained on the scaled
# features produced by an earlier scaling cell.
KN = KNeighborsClassifier()
KN.fit(X_train_scaled, y_train)

# Predictions for the test rows (scored below) and for the training rows.
y_pred_KN = KN.predict(X_test_scaled)
y_train_pred_KN = KN.predict(X_train_scaled)

accuracy_KN = accuracy_score(y_test, y_pred_KN)
conf_matrix_KN = confusion_matrix(y_test, y_pred_KN)
class_report_KN = classification_report(y_test, y_pred_KN)

print("KNeighbors Classifier:")
print("Accuracy: {:.4f}".format(accuracy_KN))
print("Confusion Matrix:", conf_matrix_KN, sep="\n")
print("Classification Report:", class_report_KN, sep="\n")

### 6. Random Forest Classifier

In [None]:
# Standardise the features with statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seed the forest so repeated runs of this cell report the same metrics; the
# value matches the seed used for the train/test split.
forest_clf = RandomForestClassifier(random_state=42)

forest_clf.fit(X_train_scaled, y_train)

# Score the fitted model on the held-out test rows.
y_pred_forest = forest_clf.predict(X_test_scaled)
accuracy_forest = accuracy_score(y_test, y_pred_forest)
conf_matrix_forest = confusion_matrix(y_test, y_pred_forest)
class_report_forest = classification_report(y_test, y_pred_forest)

print("Random Forest Classifier:")
print(f"Accuracy: {accuracy_forest:.4f}")
print("Confusion Matrix:")
print(conf_matrix_forest)
print("Classification Report:")
print(class_report_forest)