In [76]:
%run ./utility.ipynb

In [77]:
X_test = pd.read_csv('./data/cleaned_X_test.csv')
y_test = pd.read_csv('./data/cleaned_y_test.csv')

X_test.drop("Unnamed: 0", axis=1, inplace=True)
y_test.drop("Unnamed: 0", axis=1, inplace=True)

X_test.shape, y_test.shape

((1192, 19), (1192, 1))

# Balancing the target class

In [78]:
oversample = SMOTE()
balanced_X_train, balanced_y_train = oversample.fit_resample(X_train, y_train)

In [79]:
print(f"Value counts imbalanced: {y_train.value_counts()}")
print(f"Value counts balanced: {balanced_y_train.value_counts()}")

Value counts imbalanced: BAD
0      3816
1       950
Name: count, dtype: int64
Value counts balanced: BAD
0      3816
1      3816
Name: count, dtype: int64


# Training de algoritms on imbalanced, balanced and weighted classes

In [80]:
classifiers = [
    ("rfc", RandomForestClassifier()),
    ("log_reg", LogisticRegression())  
]

classifier_scores_imbalanced = []
classifier_scores_balanced = []

for classifier in classifiers:
    clf_name = classifier[0]
    clf = classifier[1]
        
    cv_scores_imbalanced = cross_val_score(clf, X_train.values, y_train.values, cv=5)
    mean_imbalanced_score = np.mean(cv_scores_imbalanced)
    classifier_scores_imbalanced.append((clf_name, clf ,mean_imbalanced_score))
    
    cv_scores_imbalanced = cross_val_score(clf, balanced_X_train.values, balanced_y_train.values, cv=5)
    mean_imbalanced_score = np.mean(cv_scores_imbalanced)
    classifier_scores_balanced.append((clf_name, clf ,mean_imbalanced_score))

In [81]:
classifiers_balanced = [
    ("rfc", RandomForestClassifier(class_weight="balanced")),
    ("log_reg", LogisticRegression(class_weight="balanced"))  
]

classifier_scores_weighted = []

for classifier in classifiers_balanced:
    clf_name = classifier[0]
    clf = classifier[1]
    
    cv_scores_imbalanced = cross_val_score(clf, X_train.values, y_train.values, cv=5)
    mean_imbalanced_score = np.mean(cv_scores_imbalanced)
    classifier_scores_weighted.append((clf_name, clf, mean_imbalanced_score))

In [82]:
classifier_scores_imbalanced

[('rfc', RandomForestClassifier(), 0.9225759545603534),
 ('log_reg', LogisticRegression(), 0.8333995481553342)]

In [83]:
classifier_scores_balanced

[('rfc', RandomForestClassifier(), 0.9634472032896719),
 ('log_reg', LogisticRegression(), 0.7670414839571849)]

In [84]:
classifier_scores_weighted

[('rfc', RandomForestClassifier(class_weight='balanced'), 0.910824253543373),
 ('log_reg', LogisticRegression(class_weight='balanced'), 0.7557671790066017)]

We see that the balanced dataset scores best in training

# Evaluation

In [85]:
print("Imbalanced classes")
for classifier in classifier_scores_imbalanced:
    classifier[1].fit(X_train, y_train)
    acc = classifier[1].score(X_test, y_test)
    
    print(f"{classifier[0]}: {acc}%")

Imbalanced classes
rfc: 0.9211409395973155%
log_reg: 0.8330536912751678%


In [86]:
print("balanced classes")
for classifier in classifier_scores_balanced:
    classifier[1].fit(balanced_X_train, balanced_y_train)
    acc = classifier[1].score(X_test, y_test)
    
    print(f"{classifier[0]}: {acc}%")

balanced classes
rfc: 0.7483221476510067%
log_reg: 0.7760067114093959%


In [88]:
print("weighted algorithms")
for classifier in classifier_scores_weighted:
    classifier[1].fit(X_train, y_train)
    acc = classifier[1].score(X_test, y_test)
    
    print(f"{classifier[0]}: {acc}%")

weighted algorithms
rfc: 0.9186241610738255%
log_reg: 0.7625838926174496%


In evaluation we see that the imbalanced and weighted algorithms score better than the smoted dataset