In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.pipeline import make_pipeline
from data_preprocessing import get_cleaned_data


In [26]:
data = get_cleaned_data(convert_categorical=True)

In [27]:
# Take the first 3000 rows as data
data = data[:100_000]


In [28]:
# # Normalize the data to have values between 0 and 1
# data = (data - data.min()) / (data.max() - data.min())

In [29]:
# print the count of target = 1
print(data[data['TARGET'] == 1]["TARGET"].count())

8093


In [37]:
# split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# train, test = train_test_split(data, test_size=0.1)

X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["TARGET"]), data["TARGET"], test_size=0.1, random_state=42)

In [31]:

# split data into features and labels
train_features = train.drop(columns=["TARGET"])
train_labels = train["TARGET"]

test_features = test.drop(columns=["TARGET"])
test_labels = test["TARGET"]

In [32]:
tomek = TomekLinks()
X_train_tomek, y_train_tomek = tomek.fit_resample(train_features, train_labels)
print('Resampled dataset shape:', y_train_tomek.value_counts())

Resampled dataset shape: TARGET
0    79349
1     7313
Name: count, dtype: int64


In [33]:
smote_tomek = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=13)
pipeline = make_pipeline(smote_tomek, rf_classifier)

In [34]:
recall_scores = cross_val_score(pipeline, train_features, train_labels, scoring='recall', cv=5)
print("Cross Validation Recall Scores:", recall_scores)
print("Average Cross Validation Recall score:", recall_scores.mean())


Cross Validation Recall Scores: [0.02188782 0.02051984 0.03212577 0.01913876 0.0252905 ]
Average Cross Validation Recall score: 0.023792536932431815


In [35]:
pipeline.fit(train_features, train_labels)
y_pred = pipeline.predict(test_features)

In [36]:
# Confusion matrix and other scores
# cm = confusion_matrix(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(test_labels, y_pred)
recall = recall_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
f1 = f1_score(test_labels, y_pred)
accuracy = accuracy_score(test_labels, y_pred)

print("Confusion Matrix:\n", cm)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

# DataFrame to hold the scores
scores_df = pd.DataFrame({
    'Random Forest with': ['SMOTE + Tomek'],
    'Recall': [recall],
    'Precision': [precision],
    'F1 Score': [f1],
    'Accuracy': [accuracy]
})
print(scores_df)

Confusion Matrix:
 [[9126   94]
 [ 764   16]]
Recall: 0.020512820512820513
Precision: 0.14545454545454545
F1 Score: 0.03595505617977528
Accuracy: 0.9142
  Random Forest with    Recall  Precision  F1 Score  Accuracy
0      SMOTE + Tomek  0.020513   0.145455  0.035955    0.9142
