## ```NoteBook Focus```
---
1. Train numerous models to select best models for hypertuning 

## ```Imports```
---

In [1]:
import pandas as pd
from classifiers_copy import classify
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

## ```Creating new dataframe with balanced classes```
---

In [2]:
drugs = pd.read_csv('../drugs_2020_simply_imputed.csv')
drugs.drop(columns=['Unnamed: 0','Unnamed: 0.1'], inplace=True)
drugs.columns = drugs.columns.str.lower()
drugs.head()

Unnamed: 0,accgdln,age,altdum,amendyr,amttotal,casetype,citwhere,combdrg2,crimhist,disposit,...,typemony,typeoths,unit1,mwgt1,wgt1,xcrhissr,xfolsor,xmaxsor,xminsor,sentrnge
0,1.0,20.0,0,2018.0,0,1.0,211.0,6.0,1.0,1,...,1.0,0,1.0,63560990.0,85104.433315,1.0,17.0,30.0,24.0,8.0
1,1.0,64.0,0,2018.0,0,1.0,211.0,1.0,1.0,1,...,1.0,0,1.0,1193400.0,5967.0,3.0,27.0,108.0,87.0,0.0
2,1.0,28.0,0,2018.0,0,1.0,211.0,3.0,1.0,1,...,1.0,0,2.0,2000000.0,2000.0,6.0,27.0,162.0,130.0,2.0
3,2.0,55.0,0,2018.0,0,1.0,211.0,77.0,1.0,1,...,1.0,0,1.0,10300.0,4.12,5.0,13.0,37.0,30.0,0.0
4,1.0,30.0,0,2018.0,0,1.0,211.0,6.0,1.0,1,...,1.0,0,1.0,169200.0,84.6,6.0,25.0,137.0,110.0,2.0


In [3]:
# checking for missing values Claire might've missed before modeling
drugs.isnull().sum()

accgdln     0
age         0
altdum      0
amendyr     0
amttotal    0
           ..
xcrhissr    0
xfolsor     0
xmaxsor     0
xminsor     0
sentrnge    0
Length: 65, dtype: int64

In [4]:
# concatting new df with equal classes for modeling

# separate all minor classes
df_0 = drugs[drugs['prisdum']==0]
print(df_0.shape)

# separate all majority class and sample 754 to match minor class
df_1 = drugs[drugs['prisdum']==1]
df_1_sample = df_1.sample(n=754, replace=False)
print(df_1_sample.shape)

# concat both df's
equal_class_df = pd.concat([df_0,df_1_sample], ignore_index=True)
print(equal_class_df.shape)

(754, 65)
(754, 65)
(1508, 65)


## ```Modeling```
---

In [5]:
# model with balanced classes

#set up X/y
X = equal_class_df.drop(columns='prisdum')
y = equal_class_df['prisdum']

# set train/test split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42, train_size=0.7, stratify=y)

# scale data
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.fit_transform(X_test)

# train multiple models
equal_class_scores = classify(X_train_ss,X_test_ss,y_train,y_test)

In [6]:
equal_class_scores

Unnamed: 0,Train Acc,Test Acc,Acc-diff,Train-F1,Test-F1,F1-diff,Train-Pres,Test-Pres,Pres-diff,Train_Recall,Test-Recall,Recall_diff
knn,0.987678,0.975717,0.01196,0.987724,0.975501,0.012223,0.984934,0.982063,0.002871,0.99053,0.969027,0.021504
logreg,0.999052,0.99117,0.007882,0.999054,0.991111,0.007943,0.99811,0.995536,0.002574,1.0,0.986726,0.013274
dt,1.0,0.503311,0.496689,1.0,0.667651,0.332349,1.0,0.501109,0.498891,1.0,1.0,0.0
bag,1.0,0.993377,0.006623,1.0,0.993348,0.006652,1.0,0.995556,0.004444,1.0,0.99115,0.00885
bag_knn,0.988626,0.977925,0.010701,0.988658,0.977778,0.01088,0.986792,0.982143,0.00465,0.99053,0.973451,0.017079
bag_log,0.996209,0.99117,0.005039,0.996226,0.991111,0.005115,0.992481,0.995536,0.003055,1.0,0.986726,0.013274
rf,1.0,0.995585,0.004415,1.0,0.995595,0.004405,1.0,0.991228,0.008772,1.0,1.0,0.0
et,1.0,0.997792,0.002208,1.0,0.997792,0.002208,1.0,0.995595,0.004405,1.0,1.0,0.0
ada,1.0,0.953642,0.046358,1.0,0.955603,0.044397,1.0,0.91498,0.08502,1.0,1.0,0.0
gboost,1.0,0.953642,0.046358,1.0,0.955603,0.044397,1.0,0.91498,0.08502,1.0,1.0,0.0


In [7]:
# import model
from sklearn.linear_model import LogisticRegression

# instantiate, train, evaluate
logreg = LogisticRegression()
logreg.fit(X_train_ss,y_train)
print(f"train acc: {logreg.score(X_train_ss,y_train)}")
print(f"test acc: {logreg.score(X_test_ss,y_test)}")

# create df to visualize coefs
logreg_scores = pd.DataFrame(columns=X.columns,data=logreg.coef_).T

train acc: 0.9990521327014218
test acc: 0.9911699779249448


In [8]:
logreg_scores[logreg_scores[0]>0].sort_values(by=0,ascending=False).head(10)

Unnamed: 0,0
suprdum,1.689471
timservc,0.764388
sensplt0,0.732293
senspcap,0.732293
suprel,0.703795
totchpts,0.483695
reas2,0.46578
accgdln,0.424791
supermin,0.389197
offguide,0.368115


In [9]:
# model with UN_balanced classes

#set up X/y
X = drugs.drop(columns='prisdum')
y = drugs['prisdum']

# set train/test split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42, train_size=0.7, stratify=y)

# scale data
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.fit_transform(X_test)

# train multiple models
unbalanced_class_scores = classify(X_train_ss,X_test_ss,y_train,y_test)

In [10]:
unbalanced_class_scores

Unnamed: 0,Train Acc,Test Acc,Acc-diff,Train-F1,Test-F1,F1-diff,Train-Pres,Test-Pres,Pres-diff,Train_Recall,Test-Recall,Recall_diff
knn,0.998217,0.998614,0.000396,0.999067,0.999275,0.000208,0.999022,0.998757,0.000265,0.999111,0.999793,0.000681
logreg,0.998387,0.998217,0.00017,0.999156,0.999067,8.9e-05,0.999111,0.99917,5.9e-05,0.9992,0.998963,0.000237
dt,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
bag,1.0,0.996039,0.003961,1.0,0.997931,0.002069,1.0,0.99587,0.00413,1.0,1.0,0.0
bag_knn,0.997963,0.998416,0.000453,0.998933,0.999171,0.000238,0.999022,0.998757,0.000265,0.998845,0.999585,0.000741
bag_log,0.997793,0.997821,2.8e-05,0.998845,0.99886,1.5e-05,0.998667,0.99855,0.000118,0.999022,0.999171,0.000148
rf,1.0,0.995841,0.004159,1.0,0.997827,0.002173,1.0,0.995869,0.004131,1.0,0.999793,0.000207
et,1.0,0.999208,0.000792,1.0,0.999585,0.000415,1.0,0.999378,0.000622,1.0,0.999793,0.000207
ada,1.0,0.996039,0.003961,1.0,0.997931,0.002069,1.0,0.99587,0.00413,1.0,1.0,0.0
gboost,1.0,0.996039,0.003961,1.0,0.997931,0.002069,1.0,0.99587,0.00413,1.0,1.0,0.0
