In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
diabetes_df = pd.read_csv('../week_13/diabetes.csv')
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# to see if data is imbalanced or not, take what you're trying to predict and see what the distribution of the data is
#output shows that it's not too imbalanced, but you still want to make it as balanced as possible (500 w/o diabetes and 269 w/ diabetes)
diabetes_df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [4]:
# do this step before oversampling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

# example of why to use stratify: https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/#:~:text=We%20can%20achieve%20this%20by,the%20provided%20%E2%80%9Cy%E2%80%9D%20array.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42, stratify=y)

#Standardize
# you can standardize the X's before or after train_test_split
sc = StandardScaler()
X_train_scaler = sc.fit_transform(X_train)
X_test_scaler = sc.fit_transform(X_test)

# Oversampling

In [5]:
#Resample training data with RandomOversampler (naive approach)
from imblearn.over_sampling import RandomOverSampler

#instatiate the random oversampler and give a random state
ros = RandomOverSampler(random_state=42)

#this is where we apply the resampling technique to our data
#passing in the standardized data into the fit_resample function
# leaving y_train as is because we don't need to standardize what we're trying to predict
X_resampled, y_resampled = ros.fit_resample(X_train_scaler, y_train)

In [5]:
#train our model using resampled data. This done after preprocessing
model = LogisticRegression(random_state=42)
model.fit(X_resampled, y_resampled)


LogisticRegression(random_state=42)

In [6]:
#calculate accuracy
#the balanced_accuracy score is assuming that the model is balanced
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test_scaler)
balanced_accuracy_score(y_test, y_pred)

0.7575308641975309

In [8]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))
#a technique for improving recall is to resample because it gives us a 
#better understanding of the postives, thus improving our true positive rate

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.79      0.73      0.81      0.76      0.58       150
          1       0.65      0.73      0.79      0.69      0.76      0.57        81

avg / total       0.77      0.77      0.75      0.77      0.76      0.57       231



# Undersampling

In [9]:
#clusterCentroids uses k-means to reduce the number of samples
# it's going to find data thats in the same groups and reduce the number in those groups based on what it determined those groups are

from imblearn.under_sampling import ClusterCentroids
#instiate culstercentriods with the random state
rus = ClusterCentroids(random_state=42)
#getting our resampled data
X_resampled, y_resampled = rus.fit_resample(X_train_scaler, y_train)


In [10]:
# instiatiate model then fit it
model = LogisticRegression(random_state=42)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=42)

In [11]:
#get predicted values
y_pred = model.predict(X_test_scaler)

In [12]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.80      0.67      0.81      0.73      0.54       150
          1       0.64      0.67      0.80      0.65      0.73      0.53        81

avg / total       0.76      0.75      0.71      0.75      0.73      0.54       231



Oversampling is often a better approach than undersampling because it will improve the recall more. 