In [1]:
# import dependencies
from sklearn.model_selection import train_test_split
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix, balanced_accuracy_score
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.combine import SMOTEENN

In [2]:
# read in our cleaned, merged data
ML_df = pd.read_csv('../resources/superstore_ML_prepped.csv', index_col=[0])
ML_df

Unnamed: 0,Sales,Quantity,Discount,Profit,Shipping Cost,Returned,Profit Margin Percentage,Returned $ Amount,Days to Ship,Monday,...,Sub-Category_Machines,Sub-Category_Paper,Sub-Category_Phones,Sub-Category_Storage,Sub-Category_Supplies,Sub-Category_Tables,Order Priority_Critical,Order Priority_High,Order Priority_Low,Order Priority_Medium
0,221.98,2,0.0,62.15,40.770,0.0,27.998018,0.0,2,0,...,0,0,1,0,0,0,0,1,0,0
1,3709.40,9,0.1,-288.77,923.630,0.0,-7.784817,0.0,2,0,...,0,0,0,0,0,0,1,0,0,0
2,5175.17,9,0.1,919.97,915.490,0.0,17.776614,0.0,1,0,...,0,0,1,0,0,0,0,0,0,1
3,2892.51,5,0.1,-96.54,910.160,0.0,-3.337586,0.0,2,0,...,0,0,1,0,0,0,0,0,0,1
4,2832.96,8,0.0,311.52,903.040,0.0,10.996272,0.0,1,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,65.10,5,0.0,4.50,1.010,0.0,6.912442,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1
51286,16.72,5,0.2,3.34,1.930,0.0,19.976077,0.0,4,0,...,0,0,0,0,0,0,0,1,0,0
51287,8.56,2,0.0,2.48,1.580,0.0,28.971963,0.0,5,0,...,0,0,0,0,0,0,0,1,0,0
51288,13.44,2,0.0,2.40,1.003,0.0,17.857143,0.0,4,0,...,0,0,0,0,0,0,0,0,0,1


# Oversampling

In [3]:
# set target (Returned) and features
y = ML_df['Returned']
X = ML_df.drop(columns=['Returned', 'Returned $ Amount'])

In [4]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
Counter(y_train)

Counter({0.0: 36805, 1.0: 1662})

## Random Oversampling

In [5]:
# randomly oversample the minority (returned) class
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X_train, y_train)
Counter(y_ros)

Counter({0.0: 36805, 1.0: 36805})

In [6]:
# initiate new model & fit to the oversampled train data
model= LogisticRegression()
model.fit(X_ros, y_ros)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [7]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[6455, 5810],
       [ 261,  297]], dtype=int64)

In [8]:
# balanced accuracy score
ros_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the randomly oversampled logistic regression is {ros_acc_score}')

The balanced accuracy score for the randomly oversampled logistic regression is 0.5292761989926752


## Synthetic Minority Oversampling Technique (SMOTE)

In [9]:
# SMOTE the train data sets
X_SMOTE, y_SMOTE = SMOTE(sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_SMOTE)

Counter({0.0: 36805, 1.0: 36805})

In [10]:
# initiate new model & fit to the randomly oversampled train data
model= LogisticRegression()
model.fit(X_SMOTE, y_SMOTE)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [11]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[11173,  1092],
       [  510,    48]], dtype=int64)

In [12]:
# balanced accuracy score
SMOTE_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the SMOTE logistic regression is {SMOTE_acc_score}')

The balanced accuracy score for the SMOTE logistic regression is 0.4984938346286531


## Random Undersampling

In [13]:
# random under sample the train data sets
rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X_train, y_train)
Counter(y_rus)

Counter({0.0: 1662, 1.0: 1662})

In [14]:
# initiate new model & fit to the randomly oversampled train data
model= LogisticRegression()
model.fit(X_rus, y_rus)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [15]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[6409, 5856],
       [ 262,  296]], dtype=int64)

In [16]:
# balanced accuracy score
rus_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the randomly under sampled logistic regression is {rus_acc_score}')

The balanced accuracy score for the randomly under sampled logistic regression is 0.526504886854952


## Cluster Centroid Undersampling

In [17]:
# initiate resampling module & use to resample data
cc = ClusterCentroids()
X_cc, y_cc = cc.fit_resample(X_train, y_train)
#Counter(y_cc)

In [18]:
# initiate new model & fit to the undersampled train data
model= LogisticRegression()
model.fit(X_cc, y_cc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [19]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  545, 11720],
       [   27,   531]], dtype=int64)

In [20]:
# balanced accuracy score
cc_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the ClusterCentroid logistic regression is {cc_acc_score}')

The balanced accuracy score for the ClusterCentroid logistic regression is 0.4980241442341833


# SMOTE + Edited Nearest Neighbors (SMOTEENN)

In [21]:
# create instance of SMOTEENN and resample train data 
smote_enn = SMOTEENN()
X_SMOTEENN, y_SMOTEENN = smote_enn.fit_resample(X_train, y_train)

In [22]:
# initiate new model & fit to the resampled train data
model= LogisticRegression()
model.fit(X_SMOTEENN, y_SMOTEENN)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [23]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[6695, 5570],
       [ 297,  261]], dtype=int64)

In [24]:
# balanced accuracy score
SMOTEENN_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the SMOTEENN sampled logistic regression is {SMOTEENN_acc_score}')

The balanced accuracy score for the SMOTEENN sampled logistic regression is 0.5068020725116053


# Comparative Results

In [25]:
print(f'The balanced accuracy score for the randomly oversampled logistic regression is {ros_acc_score}')
print(f'The balanced accuracy score for the SMOTE logistic regression is {SMOTE_acc_score}')
print(f'The balanced accuracy score for the randomly under sampled logistic regression is {rus_acc_score}')
print(f'The balanced accuracy score for the ClusterCentroid logistic regression is {cc_acc_score}')
print(f'The balanced accuracy score for the SMOTEENN sampled logistic regression is {SMOTEENN_acc_score}')

The balanced accuracy score for the randomly oversampled logistic regression is 0.5292761989926752
The balanced accuracy score for the SMOTE logistic regression is 0.4984938346286531
The balanced accuracy score for the randomly under sampled logistic regression is 0.526504886854952
The balanced accuracy score for the ClusterCentroid logistic regression is 0.4980241442341833
The balanced accuracy score for the SMOTEENN sampled logistic regression is 0.5068020725116053
