In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

income = pd.read_csv("C:/Users/47089/OneDrive/Desktop/marlabs/income.csv")

income.head()

Unnamed: 0,age,JobType,EdType,maritalstatus,occupation,relationship,race,gender,capitalgain,capitalloss,hoursperweek,nativecountry,SalStat
0,45,Private,HS-grad,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,28,United-States,"less than or equal to 50,000"
1,24,Federal-gov,HS-grad,Never-married,Armed-Forces,Own-child,White,Male,0,0,40,United-States,"less than or equal to 50,000"
2,44,Private,Some-college,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,"greater than 50,000"
3,27,Private,9th,Never-married,Craft-repair,Other-relative,White,Male,0,0,40,Mexico,"less than or equal to 50,000"
4,20,Private,Some-college,Never-married,Sales,Not-in-family,White,Male,0,0,35,United-States,"less than or equal to 50,000"


In [2]:
def income_model_prep(data):
    from sklearn.model_selection import train_test_split
    data = pd.get_dummies(
        data.assign(
            target = np.where(data["SalStat"] == " less than or equal to 50,000", 0, 1),
            nativecountry = data["nativecountry"].replace({" Holand-Netherlands": " Germany"}),
            occupation = data["occupation"].replace({" Armed-Forces": " ?"}),
            JobType = data["JobType"].replace({" Never-worked": " Without-pay"}),
        ).drop("SalStat", axis=1), 
        drop_first=True
    )
    X = data.drop("target", axis=1)
    y = data["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    return X_train, X_test, y_train, y_test

In [3]:
X_train, X_test, y_train, y_test = income_model_prep(income) 

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

lr = LogisticRegression(max_iter=5000)

lr.fit(X_train, y_train)

print(f"Accuracy: {lr.score(X_test, y_test)}")
print(f"F1: {f1_score(y_test, lr.predict(X_test))}")

Accuracy: 0.8539712320200125
F1: 0.6652329749103942


## Assignment 1: Sampling Methods

The following steps can be done one by one or in a single cell.

1. Undersample the data to a 2:1 ratio of 0s to 1s and fit a logistic regression - generate a confusion matrix and calculate common evaluation metrics (Accuracy, Precision Recall, F1). 

2. Oversample the data using random oversampling.Create 4x the the current number of 1s. and fit a logistic regression - generate a confusion matrix and calculate common evaluation metrics. 

3. Use SMOTE to oversample the data. Create 4x the the current number of 1s. Fit a logistic regression and generate a confusion matrix, as well as calculate common evaluation metrics. 

4. Which model sampling approach best for this data? Pick the one that gave the best performance at the default threshold, then tune the threshold and report optimized F1 score.

In [24]:
import imblearn.under_sampling as US, imblearn.over_sampling as OS
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score

In [19]:
# Undersample:
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
#print(n_pos, n_neg)
minority_pct = 1/3
RUS = US.RandomUnderSampler(
    sampling_strategy = (minority_pct) / (1-minority_pct),
    random_state = 101
)

X_train_rs, y_train_rs = RUS.fit_resample(X_train, y_train)
# print(np.mean(y_train))
# print(np.mean(y_train_rs))
lr_us = LogisticRegression(max_iter=5000)
lr_us.fit(X_train_rs, y_train_rs)

print('Confusion Matrix:\n', confusion_matrix(y_test, lr_us.predict(X_test)))
print('Accuracy: ', lr_us.score(X_test, y_test))
print('Precision: ', precision_score(y_test, lr_us.predict(X_test)))
print('Recall: ', recall_score(y_test, lr_us.predict(X_test)))
print('F1 score: ', f1_score(y_test, lr_us.predict(X_test)))

Confusion Matrix:
 [[4324  557]
 [ 424 1091]]
Accuracy:  0.8466228893058161
Precision:  0.6620145631067961
Recall:  0.7201320132013201
F1 score:  0.6898514068921909


In [22]:
# Oversample:
ratio = {1:n_pos * 4, 0:n_neg}
ROS = OS.RandomOverSampler(
    sampling_strategy = ratio,
    random_state = 101
)

X_train_os, y_train_os = ROS.fit_resample(X_train, y_train)
lr_os = LogisticRegression(max_iter=5000)
lr_os.fit(X_train_os, y_train_os)

print('Confusion Matrix:\n', confusion_matrix(y_test, lr_os.predict(X_test)))
print('Accuracy: ', lr_os.score(X_test, y_test))
print('Precision: ', precision_score(y_test, lr_os.predict(X_test)))
print('Recall: ', recall_score(y_test, lr_os.predict(X_test)))
print('F1 score: ', f1_score(y_test, lr_os.predict(X_test)))

Confusion Matrix:
 [[3648 1233]
 [ 150 1365]]
Accuracy:  0.7837711069418386
Precision:  0.5254041570438799
Recall:  0.900990099009901
F1 score:  0.6637490882567468


In [26]:
# SMOTE
from imblearn.over_sampling import SMOTE
ratio = {1:n_pos * 4, 0:n_neg}
smote = SMOTE(sampling_strategy = ratio)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
lr_smote = LogisticRegression(max_iter=5000)
lr_smote.fit(X_train_smote, y_train_smote)

print('Confusion Matrix:\n', confusion_matrix(y_test, lr_smote.predict(X_test)))
print('Accuracy: ', lr_smote.score(X_test, y_test))
print('Precision: ', precision_score(y_test, lr_smote.predict(X_test)))
print('Recall: ', recall_score(y_test, lr_smote.predict(X_test)))
print('F1 score: ', f1_score(y_test, lr_smote.predict(X_test)))

Confusion Matrix:
 [[4134  747]
 [ 378 1137]]
Accuracy:  0.824108818011257
Precision:  0.6035031847133758
Recall:  0.7504950495049505
F1 score:  0.6690203000882613


Since Undersample model has the highest F1 score, Undersample is best for the data.

In [28]:
# Tune the threshold
f1 = []
thresholds = np.linspace(0, 1, 100)

for threshold in thresholds:
    y_pred = (lr_us.predict_proba(X_test)[:,1] > threshold)
    f1.append(f1_score(y_test, y_pred))
print('Optimized F1 Score: ', max(f1))

Optimized F1 Score:  0.6943879034603082


## Assignment 2: Class Weights

1. Fit a regression with standard, balanced and 4:1 (minority vs majority) class weights. Calculate the AUC for each.
2. For the weighting that had the best AUC, tune the threshold to maximize F1 score.


In [30]:
from sklearn.metrics import roc_auc_score, precision_recall_curve

# standard
lr = LogisticRegression(max_iter=5000)
lr.fit(X_train, y_train)
auc_standard = roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1])
print('AUC Standard: ', auc_standard)

AUC Standard:  0.9068523939056475


In [35]:
# balanced
lr_balanced = LogisticRegression(class_weight = 'balance', max_iter = 5000)
lr_balanced.fit(X_train, y_train)
auc_balanced = roc_auc_score(y_test, lr_balanced.predict_proba(X_test)[:, 1])
print('AUC Balanced: ', auc_balanced)

AUC Balanced:  0.9068523939056475


In [34]:
# 4:1
lr_4x = LogisticRegression(class_weight = {1:4, 0:1}, max_iter = 5000)
lr_4x.fit(X_train, y_train)
auc_4x = roc_auc_score(y_test, lr_4x.predict_proba(X_test)[:, 1])
print('AUC 4x: ', auc_4x)

AUC 4x:  0.9090833115272192


In [42]:
# Best F1 Score
# lr_4x has the highest AUC
p_curve, r_curve, t_curve = precision_recall_curve(y_test, lr_4x.predict_proba(X_test)[:, 1])
f1 = 2 * r_curve * p_curve / (r_curve + p_curve)
print('Optimized F1 Score:', max(f1))

Optimized F1 Score: 0.7032710280373832
