- Import your data and perform a basic data exploration phase
- Display general information about the dataset
- Create a pandas profiling report to gain insights into the dataset
- Handle missing and corrupted values
- Remove duplicates, if they exist
- Handle outliers, if they exist
- Encode categorical features
- Select your target variable and the features
- Split your dataset into training and test sets
- Based on your data exploration phase select a ML classification algorithm and train it on the training set
- Assess your model performance on the test set using relevant evaluation metrics
- Discuss with your cohort alternative ways to improve your model performance

In [None]:
# Silence every Python warning for the rest of the session so cell output stays
# uncluttered. NOTE(review): this also hides deprecation warnings raised by the
# libraries used below.
import warnings
warnings.filterwarnings("ignore")

In [None]:
#import linear algebra and data manipulation libraries
import numpy as np
import pandas as pd

#import standard visualization
import matplotlib.pyplot as plt
import seaborn as sns

#import machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import xgboost

from sklearn.model_selection import train_test_split #split
from sklearn.metrics import accuracy_score #metrics

#tools for hyperparameters search
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Shared LabelEncoder instance; a later cell uses it to turn the
# banking_crisis column into integer codes.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [None]:
# Load the dataset into a DataFrame; the CSV path is relative to the
# notebook's working directory.
afcrises=pd.read_csv("African_crises_dataset.csv")

In [None]:
# Show the loaded DataFrame (a notebook renders the last expression of a cell).
afcrises

In [None]:
# Summary statistics, one row per numeric column. Printed explicitly because a
# notebook only auto-displays the LAST expression of a cell; as a bare
# expression this table was silently discarded.
print(afcrises.describe().T)

# Number of rows recorded for each country.
afcrises["country"].value_counts()

In [None]:
# Print column names, non-null counts and dtypes to spot missing values and
# non-numeric columns.
afcrises.info()

In [None]:
# Number of rows for each value of country_number.
afcrises["country_number"].value_counts()

In [None]:
# Count plot (rows per category) for each categorical column, placed in the
# first slots of a 4x4 subplot grid.
categorical_features = ["country_number", "country"]

plt.figure(figsize=(25, 25))
for position, feature in enumerate(categorical_features, start=1):
    plt.subplot(4, 4, position)
    sns.countplot(x=afcrises[feature], palette='viridis')
    plt.title(feature, fontsize=30)
    plt.xlabel(' ')
    # Rotate tick labels so long category names do not overlap.
    plt.xticks(rotation=90)

# One layout pass after all subplots exist; repeating it per iteration was
# redundant work.
plt.tight_layout()

In [None]:
# Names of all numeric-dtype columns; reused by the plotting and correlation
# cells below.
numerical_features = afcrises.select_dtypes(include='number').columns
numerical_features

In [None]:
# One box plot per numeric column to eyeball outliers.
# The grid keeps the original 3-column, 5-row layout as a minimum and adds rows
# when there are more than 15 numeric columns; plt.subplot raises ValueError
# for a position beyond rows * columns.
n_plot_cols = 3
n_plot_rows = max(5, -(-len(numerical_features) // n_plot_cols))  # ceiling division

plt.figure(figsize=(25, 25))
for position, feature in enumerate(numerical_features, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, position)
    sns.boxplot(x=afcrises[feature], palette='viridis')
    plt.title(feature, fontsize=30)
    plt.xlabel(' ')

# One layout pass after all subplots exist; repeating it per iteration was
# redundant work.
plt.tight_layout()

In [None]:
# Pairwise correlations between the numeric columns (DataFrame.corr defaults).
correlation_matrix = afcrises[numerical_features].corr()

# Annotated heatmap: each cell shows the coefficient rounded to two decimals.
plt.figure(figsize=(15, 7.5))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Replace the banking_crisis labels with integer codes. LabelEncoder numbers
# the classes in sorted order, so the codes follow the ordering of the raw
# values rather than any "crisis = 1" convention.
# NOTE(review): presumably the raw values are strings such as
# "crisis" / "no_crisis" — confirm which label ends up as 0.
afcrises["banking_crisis"] = label_encoder.fit_transform(afcrises["banking_crisis"])

In [None]:
# Class balance of systemic_crisis. The call parentheses are required: without
# them the cell only shows the bound method object, not the counts.
afcrises["systemic_crisis"].value_counts()

In [None]:
# Rank every numeric column by the absolute value of its correlation with
# banking_crisis, strongest first.
numerical_features = afcrises.select_dtypes(include='number').columns
correlation = afcrises[numerical_features].corr().abs()
correlation = correlation[['banking_crisis']].sort_values('banking_crisis', ascending=False)
# A bare expression in the middle of a cell is never displayed, so print it.
print(correlation)

# Columns whose absolute correlation with banking_crisis is below this value
# are treated as uninformative and collected for removal.
threshold = 0.2

# Never mark the modelling target for removal: a later cell reads
# afcrises["currency_crises"] and would fail with KeyError if it were dropped.
# NOTE(review): features are ranked against banking_crisis while the model
# predicts currency_crises — confirm this is intended.
target_column = "currency_crises"

weakly_correlated = correlation[correlation['banking_crisis'] < threshold].index
low_corr_columns = [column for column in weakly_correlated if column != target_column]

# Print the list of columns with low correlation
print(f"Columns with correlation less than {threshold}:\n", low_corr_columns)

In [None]:
# Remove the columns collected in low_corr_columns.
afcrises = afcrises.drop(low_corr_columns, axis=1)

In [None]:
# Remove the country_code column from the working DataFrame.
afcrises = afcrises.drop(columns="country_code")

In [None]:
# Preview the first rows to check which columns remain after the drops above.
afcrises.head()

In [None]:
# One-hot encode country: the original text column is removed and one
# indicator column per country is appended on the right.
afcrises = afcrises.drop(columns="country").join(pd.get_dummies(afcrises["country"]))

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Target is currency_crises; every other remaining column is a feature.
X, y = afcrises.drop("currency_crises", axis=1), afcrises["currency_crises"]

# Seeded 80/20 split: reproducible across runs and identical to the split the
# model cells below use (test_size=0.2, random_state=42). The previous unseeded
# 15% split gave different rows on every run and was overwritten later anyway.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows, so no test information is used during fitting.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Summary statistics of the processed DataFrame, one row per numeric column.
afcrises.describe().T

In [None]:
# Import necessary libraries
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# X (features) and y (target) are expected to be fully preprocessed and
# encoded by the earlier cells.

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# XGBoost classifier with explicitly chosen hyperparameters.
xgb = xgboost.XGBClassifier(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.08,
    subsample=0.75,
    colsample_bytree=1,
    gamma=0,
)

# Fit on the raw (unscaled) training features; the target is passed as a
# plain NumPy array.
xgb.fit(X_train, y_train.squeeze().values)

# Predictions for both splits (only the test predictions are scored below).
y_train_preds, y_test_preds = xgb.predict(X_train), xgb.predict(X_test)

# Held-out evaluation: accuracy, confusion matrix and per-class report.
accuracy_xgb = accuracy_score(y_true=y_test, y_pred=y_test_preds)
conf_matrix_xgb = confusion_matrix(y_true=y_test, y_pred=y_test_preds)
class_report_xgb = classification_report(y_true=y_test, y_pred=y_test_preds)

# Emit all metrics, one item per line.
print(
    "xgboost:",
    f"Accuracy: {accuracy_xgb:.4f}",
    "Confusion Matrix:",
    conf_matrix_xgb,
    "Classification Report:",
    class_report_xgb,
    sep="\n",
)


In [None]:
# Standardise the features: learn the scaling statistics from the training
# split, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Support vector classifier with an RBF kernel, trained on the scaled features.
svc_reg = SVC(kernel='rbf').fit(X_train_scaled, y_train)

# Score the held-out rows.
y_pred_svc = svc_reg.predict(X_test_scaled)
accuracy_svc = accuracy_score(y_true=y_test, y_pred=y_pred_svc)
conf_matrix_svc = confusion_matrix(y_true=y_test, y_pred=y_pred_svc)
class_report_svc = classification_report(y_true=y_test, y_pred=y_pred_svc)

# Emit all metrics, one item per line.
print(
    "Support Vector Classification:",
    f"Accuracy: {accuracy_svc:.4f}",
    "Confusion Matrix:",
    conf_matrix_svc,
    "Classification Report:",
    class_report_svc,
    sep="\n",
)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Re-fit a fresh scaler on the training split and standardise both splits with it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Gradient boosting with scikit-learn's default hyperparameters.
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train_scaled, y_train)

# Evaluate on the held-out rows.
y_pred_gb = gb_clf.predict(X_test_scaled)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
conf_matrix_gb = confusion_matrix(y_test, y_pred_gb)
class_report_gb = classification_report(y_test, y_pred_gb)

# Report the metrics.
print("Gradient Boosting Classifier:")
print("Accuracy: {:.4f}".format(accuracy_gb))
print("Confusion Matrix:", conf_matrix_gb, sep="\n")
print("Classification Report:", class_report_gb, sep="\n")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# k-nearest-neighbours classifier voting over the 5 closest training rows.
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train_scaled, y_train)

# Evaluate on the held-out rows.
y_pred_knn = knn_clf.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
conf_matrix_knn = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
class_report_knn = classification_report(y_true=y_test, y_pred=y_pred_knn)

# Report the metrics as a single newline-separated block.
print("\n".join([
    "K-Nearest Neighbors Classifier:",
    f"Accuracy: {accuracy_knn:.4f}",
    "Confusion Matrix:",
    str(conf_matrix_knn),
    "Classification Report:",
    class_report_knn,
]))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fresh scaler: fitted on the training split, applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest with scikit-learn's default hyperparameters.
forest_clf = RandomForestClassifier().fit(X_train_scaled, y_train)

# Evaluate on the held-out rows.
y_pred_forest = forest_clf.predict(X_test_scaled)
accuracy_forest = accuracy_score(y_test, y_pred_forest)
conf_matrix_forest = confusion_matrix(y_test, y_pred_forest)
class_report_forest = classification_report(y_test, y_pred_forest)

# Emit all metrics, one item per line.
print(
    "Random Forest Classifier:",
    "Accuracy: {:.4f}".format(accuracy_forest),
    "Confusion Matrix:",
    conf_matrix_forest,
    "Classification Report:",
    class_report_forest,
    sep="\n",
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Standardise both splits using statistics learned from the training rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Single decision tree with scikit-learn's default hyperparameters.
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train_scaled, y_train)

# Evaluate on the held-out rows.
y_pred_tree = tree_clf.predict(X_test_scaled)
accuracy_tree = accuracy_score(y_true=y_test, y_pred=y_pred_tree)
conf_matrix_tree = confusion_matrix(y_true=y_test, y_pred=y_pred_tree)
class_report_tree = classification_report(y_true=y_test, y_pred=y_pred_tree)

# Report the metrics.
print("Decision Tree Classifier:")
print(f"Accuracy: {accuracy_tree:.4f}")
print("Confusion Matrix:", conf_matrix_tree, sep="\n")
print("Classification Report:", class_report_tree, sep="\n")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Logistic regression with default settings, trained on the scaled features
# that are already in scope (this cell does not re-fit a scaler).
log_reg = LogisticRegression().fit(X_train_scaled, y_train)

# Predictions for the test split and for the training split (the training
# predictions are kept but not scored here).
y_pred_log, y_train_pred_log = (
    log_reg.predict(X_test_scaled),
    log_reg.predict(X_train_scaled),
)

# Held-out evaluation.
accuracy_log = accuracy_score(y_true=y_test, y_pred=y_pred_log)
conf_matrix_log = confusion_matrix(y_true=y_test, y_pred=y_pred_log)
class_report_log = classification_report(y_true=y_test, y_pred=y_pred_log)

# Emit all metrics, one item per line.
print(
    "Logistic Regression:",
    f"Accuracy: {accuracy_log:.4f}",
    "Confusion Matrix:",
    conf_matrix_log,
    "Classification Report:",
    class_report_log,
    sep="\n",
)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with default settings, trained on the raw (unscaled) features.
Adaboost = AdaBoostClassifier().fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred_Ada = Adaboost.predict(X_test)
accuracy_Adaboost = accuracy_score(y_true=y_test, y_pred=y_pred_Ada)
conf_matrix_Adaboost = confusion_matrix(y_true=y_test, y_pred=y_pred_Ada)
class_report_Adaboost = classification_report(y_true=y_test, y_pred=y_pred_Ada)

# Report the metrics as a single newline-separated block.
print("\n".join([
    "Adaboost Classifier:",
    f"Accuracy: {accuracy_Adaboost:.4f}",
    "Confusion Matrix:",
    str(conf_matrix_Adaboost),
    "Classification Report:",
    class_report_Adaboost,
]))