In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.model_selection import cross_val_score 
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [2]:
dta = sm.datasets.fair.load_pandas().data
dta.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


Number of observations: 6366
Number of variables: 9
Variable name definitions:

    rate_marriage   : How rate marriage, 1 = very poor, 2 = poor, 3 = fair,
                    4 = good, 5 = very good
    age             : Age
    yrs_married     : No. years married. Interval approximations. See
                    original paper for detailed explanation.
    children        : No. children
    religious       : How relgious, 1 = not, 2 = mildly, 3 = fairly,
                    4 = strongly
    educ            : Level of education, 9 = grade school, 12 = high
                    school, 14 = some college, 16 = college graduate,
                    17 = some graduate school, 20 = advanced degree
    occupation      : 1 = student, 2 = farming, agriculture; semi-skilled,
                    or unskilled worker; 3 = white-colloar; 4 = teacher
                    counselor social worker, nurse; artist, writers;
                    technician, skilled worker, 5 = managerial,
                    administrative, business, 6 = professional with
                    advanced degree
    occupation_husb : Husband's occupation. Same as occupation.
    affairs         : measure of time spent in extramarital affairs

In [3]:
dta['affair'] = (dta.affairs > 0).astype(int)
dta.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs,affair
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111,1
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769,1
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4,1
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273,1
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666,1


In [4]:
y, X = dmatrices('affair ~ rate_marriage + age + yrs_married + children + religious + educ + C(occupation) + C(occupation_husb)',
dta, return_type="dataframe")

In [5]:
X = X.rename(columns =
{'C(occupation)[T.2.0]':'occ_2',

'C(occupation)[T.3.0]':'occ_3',
'C(occupation)[T.4.0]':'occ_4',
'C(occupation)[T.5.0]':'occ_5',
'C(occupation)[T.6.0]':'occ_6',
'C(occupation_husb)[T.2.0]':'occ_husb_2',
'C(occupation_husb)[T.3.0]':'occ_husb_3',
'C(occupation_husb)[T.4.0]':'occ_husb_4',
'C(occupation_husb)[T.5.0]':'occ_husb_5',
'C(occupation_husb)[T.6.0]':'occ_husb_6'})

In [6]:
y = np.ravel(y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state = 20)

In [8]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

In [9]:
scaler = MinMaxScaler(feature_range=(0,5))
X_train_scaled = scaler.fit_transform(X_train_under)

logReg = LogisticRegression()
logReg.fit(X_train_scaled,y_train_under)

LogisticRegression()

In [10]:
X_test_under, y_test_under = undersample.fit_resample(X_test, y_test)

In [11]:
X_test_scaled = scaler.fit_transform(X_test_under)
y_predict = logReg.predict(X_test_scaled)

In [12]:
print("Classification Report")
print(classification_report(y_test_under, y_predict))
print("\nConfusion Matrix")
print(confusion_matrix(y_test_under, y_predict))
print("\nROC AUC Score")
print(roc_auc_score(y_test_under, y_predict))

Classification Report
              precision    recall  f1-score   support

         0.0       0.67      0.69      0.68       613
         1.0       0.68      0.66      0.67       613

    accuracy                           0.68      1226
   macro avg       0.68      0.68      0.68      1226
weighted avg       0.68      0.68      0.68      1226


Confusion Matrix
[[426 187]
 [208 405]]

ROC AUC Score
0.6778140293637847


In [None]:
#SMOTE = SMOTE()
#X_train_SMOTE, y_train_SMOTE = SMOTE.fit_resample(X_train, y_train)

In [None]:
#scaler = MinMaxScaler(feature_range=(0,5))
#X_train_scaled = scaler.fit_transform(X_train_SMOTE)

#logReg = LogisticRegression()
#logReg.fit(X_train_scaled,y_train_SMOTE)

In [None]:
#X_test_SMOTE, y_test_SMOTE = SMOTE.fit_resample(X_test, y_test)

In [None]:
#X_test_scaled = scaler.fit_transform(X_test_SMOTE)
#y_predict = logReg.predict(X_test_scaled)

In [None]:
#print("Classification Report")
#print(classification_report(y_test_SMOTE, y_predict))
#print("\nConfusion Matrix")
#print(confusion_matrix(y_test_SMOTE, y_predict))
#print("\nROC AUC Score")
#print(roc_auc_score(y_test_SMOTE, y_predict))