In [1]:

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
import pandas as pd
import sklearn
from imblearn.combine import SMOTETomek
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score,classification_report
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
pd.set_option('display.max_columns', 500)
import pickle as pkl

In [2]:

df1 = pd.read_csv('../prepeddata.csv',index_col='Unnamed: 0')
df1['stimulant'] = df1['stimulant'].apply(lambda x: 'low' if x <= 2 else 'high')

RandomForestClassifier()


RandomForestClassifier()

In [3]:
stimx = df1.drop(['stimulant','hallucinagen','depressant'],axis = 1)
 
stimy = df1.stimulant
stimx

sm = SMOTE()
tl = TomekLinks()
sampling = SMOTETomek(sampling_strategy='auto', random_state=None, smote=sm, tomek=tl, n_jobs=1)

stimx, stimy = sampling.fit_resample(stimx, stimy)

In [4]:
stimXtrain,stimXtest,stimYtrain,stimYtest = train_test_split(stimx,stimy)

In [5]:
stimscaler = StandardScaler()
stimXtrain = stimscaler.fit_transform(stimXtrain)
stimXtest = stimscaler.transform(stimXtest)
stimYtrain

268      low
1766     low
1887    high
1186    high
309      low
        ... 
329      low
263      low
1852     low
286      low
3182    high
Name: stimulant, Length: 2438, dtype: object

Logistic regression classifiers

In [6]:
logregparam_grid = {'C': [0.001,0.1,1, 10],'max_iter' :[1000],'class_weight':['auto'],'warm_start' :[True,False], 'solver': ['liblinear', 'rbf','sag']}
mod = GridSearchCV(LogisticRegression(), logregparam_grid, n_jobs=-1, cv=5, verbose=1)
mod.fit(stimXtrain,stimYtrain)
stimtrianlogypred = mod.predict(stimXtrain)
stimtestlogypred = mod.predict(stimXtest)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


        nan        nan        nan        nan 0.74569209 0.74569209
        nan        nan        nan        nan 0.74610193 0.74610193
        nan        nan        nan        nan 0.74610277 0.74610277]


In [7]:
standardmodel1 = mod.best_estimator_
standardmodel1

LogisticRegression(C=0.001, class_weight='auto', max_iter=1000, solver='sag',
                   warm_start=True)

In [8]:
f1_score(stimYtrain,stimtrianlogypred,average='weighted'),f1_score(stimYtest,stimtestlogypred,average='weighted')

(0.7495974723294041, 0.7673052205178671)

In [9]:
logregparam_grid2 = {'tol': [0,0.0001,0.001,0.1,1]
                     ,'C': [0,0.001,0.1,1]
                     ,'penalty': ['l1','l2','elasticnet']
                     ,'max_iter' :[1000],'class_weight':['auto']
                     ,'warm_start' :[True]
                     , 'solver': ['lbfgs','liblinear', 'rbf','sag','elasticnet','saga',]
                     ,'dual':[False,True]
                    }
mod2 = GridSearchCV(LogisticRegression(), logregparam_grid2, n_jobs=-1, cv=3, verbose=1)
mod2.fit(stimXtrain,stimYtrain)
stimtrianlog2ypred = mod2.predict(stimXtrain)
stimtestlog2ypred = mod2.predict(stimXtest)

Fitting 3 folds for each of 720 candidates, totalling 2160 fits


        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan       

In [10]:
standardmodel2 = mod2.best_estimator_
standardmodel2

LogisticRegression(C=1, class_weight='auto', max_iter=1000, solver='saga',
                   tol=0.1, warm_start=True)

In [14]:
f1_score(stimYtrain,stimtrianlog2ypred,average='weighted'),f1_score(stimYtest,stimtestlog2ypred,average='weighted')

(0.8556228407643084, 0.8597782366188155)

In [15]:
logregparam_grid3 = {'tol': [0,0.0001,0.001,0.1,1]
                     ,'C': [0,0.001,0.1,1]
                     ,'penalty': ['l1','l2','elasticnet']
                     ,'max_iter' :[5000],'class_weight':['auto']
                     
                     , 'solver': ['lbfgs','liblinear', 'rbf','sag','elasticnet','saga',]
                     ,'dual':[False,True],"l1_ratio":[None,0.01,0.1,0.2,0.3,0.4]
                    ,'fit_intercept':[True,False]}
mod3 = GridSearchCV(LogisticRegression(), logregparam_grid3, n_jobs=-1, cv=5, verbose=0)
mod3.fit(stimXtrain,stimYtrain)
stimtrianlog3ypred = mod3.predict(stimXtrain)
stimtestlog3ypred = mod3.predict(stimXtest)

  "(penalty={})".format(self.penalty))


In [16]:
standardmodel3 = mod3.best_estimator_
standardmodel3

LogisticRegression(C=1, class_weight='auto', fit_intercept=False, l1_ratio=0.2,
                   max_iter=5000, penalty='l1', solver='saga', tol=1)

In [17]:
f1_score(stimYtrain,stimtrianlog3ypred,average='weighted'),f1_score(stimYtest,stimtestlog3ypred,average='weighted')


(0.8536508699519856, 0.8657411971160501)

In [18]:
classification_report(stimYtrain,stimtrianlog3ypred)

'              precision    recall  f1-score   support\n\n        high       0.82      0.91      0.86      1225\n         low       0.90      0.80      0.85      1215\n\n    accuracy                           0.85      2440\n   macro avg       0.86      0.85      0.85      2440\nweighted avg       0.86      0.85      0.85      2440\n'

In [19]:
pkl.dump(standardmodel1,open('StandardModel1.pkl','wb'))
pkl.dump(standardmodel2,open('StandardModel2.pkl','wb'))
pkl.dump(standardmodel3,open('StandardModel3.pkl','wb'))

Decision Trees

In [20]:
parameters={'min_samples_split' : range(10,500,45),'max_depth': range(1,20,1),'ccp_alpha':[x/300 for x in list(range(0,60,1))],'criterion':['gini']}
dtc = GridSearchCV(DecisionTreeClassifier(DecisionTreeClassifier()),param_grid=parameters,verbose=True,n_jobs=-1)
dtc.fit(stimXtrain,stimYtrain)
dtcytrainpred = dtc.predict(stimXtrain)
dtcytestpred = dtc.predict(stimXtest)



Fitting 5 folds for each of 12540 candidates, totalling 62700 fits


In [21]:
f1_score(stimYtrain,dtcytrainpred,average='weighted'),f1_score(stimYtest,dtcytestpred,average='weighted')

(0.9044428483178419, 0.8660175551238114)

In [22]:
parameters={'min_samples_split' : range(10,500,45),'ccp_alpha':[x/60 for x in list(range(0,60,1))],'criterion':['entropy','gini']}
dtc2 = GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
dtc2.fit(stimXtrain,stimYtrain)
dtcytrainpred = dtc2.predict(stimXtrain)
dtcytestpred = dtc2.predict(stimXtest)

Fitting 5 folds for each of 1320 candidates, totalling 6600 fits


In [23]:
f1_score(stimYtrain,dtcytrainpred,average='weighted'),f1_score(stimYtest,dtcytestpred,average='weighted')


(0.9118850830988885, 0.8685545347085643)

In [24]:
parameters={'min_impurity_decrease':[0+x/100 for x in range(1,30,1)],'ccp_alpha':[x/60 for x in list(range(0,60,1))],'criterion':['entropy','gini']}
dtc3 = GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
dtc3.fit(stimXtrain,stimYtrain)
dtcytrainpred = dtc3.predict(stimXtrain)
dtcytestpred = dtc3.predict(stimXtest)

Fitting 5 folds for each of 3480 candidates, totalling 17400 fits


In [25]:
f1_score(stimYtrain,dtcytrainpred,average='weighted'),f1_score(stimYtest,dtcytestpred,average='weighted')


(0.8897538946679115, 0.8611544185832243)

In [26]:
standardmodel4 = dtc.best_estimator_
standardmodel5 = dtc2.best_estimator_
standardmodel6 = dtc3.best_estimator_

pkl.dump(standardmodel4,open('StandardModel4.pkl','wb'))
pkl.dump(standardmodel5,open('StandardModel5.pkl','wb'))
pkl.dump(standardmodel6,open('StandardModel6.pkl','wb'))




Knearest

In [27]:
# knn = KNeighborsClassifier(n_neighbors=43,weights='uniform')
# knn.fit(stimXtrain,stimYtrain)
# knntrainypred=knn.predict(stimXtrain)
# knntestypred=knn.predict(stimXtest)
# f1_score(stimYtrain,knntrainypred,average='weighted'),f1_score(stimYtest,knntestypred,average='weighted')
parameters={'n_neighbors':range(1,50)}
knn = GridSearchCV(KNeighborsClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
knn.fit(stimXtrain,stimYtrain)
knnytrainpred = knn.predict(stimXtrain)
knnytestpred = knn.predict(stimXtest)



Fitting 5 folds for each of 49 candidates, totalling 245 fits


In [28]:
parameters={'n_neighbors':range(50,1130)}
knn2 = GridSearchCV(KNeighborsClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
knn2.fit(stimXtrain,stimYtrain)
knn2ytrainpred = knn2.predict(stimXtrain)
knn2ytestpred = knn2.predict(stimXtest)


Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


In [29]:
standardmodel7 = knn.best_estimator_
standardmodel8 = knn2.best_estimator_

pkl.dump(standardmodel7,open('StandardModel7.pkl','wb'))
pkl.dump(standardmodel8,open('StandardModel8.pkl','wb'))

Random Forest

In [30]:
# rfc = RandomForestClassifier(criterion='gini',ccp_alpha=0.005,n_estimators=1000,max_features=6,class_weight='balanced',max_depth=6,oob_score=True)
# rfc.fit(stimXtrain,stimYtrain)
parameters={'ccp_alpha':[x/70 for x in list(range(0,30,1))],'class_weight':['balanced_subsample'],'criterion':['entropy'],'n_estimators':[100,300,500,750]}
rfc = GridSearchCV(RandomForestClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
rfc.fit(stimXtrain,stimYtrain)
rfcytrainpred = rfc.predict(stimXtrain)
rfcytestpred = rfc.predict(stimXtest)



Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [31]:
rfctrainypred = rfc.predict(stimXtrain)
rfctestypred = rfc.predict(stimXtest)
f1_score(rfctrainypred,stimYtrain,average='weighted'),f1_score(rfctestypred,stimYtest,average='weighted')

(1.0, 0.9250952501227626)

In [32]:
parameters={'ccp_alpha':[x/70 for x in list(range(0,10,1))]
                         ,'class_weight':['balanced_subsample'],'criterion':['gini']
                         ,'n_estimators':[100,300,500,750]}
rfc2 = GridSearchCV(RandomForestClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
rfc2.fit(stimXtrain,stimYtrain)
rfc2ytrainpred = rfc2.predict(stimXtrain)
rfc2ytestpred = rfc2.predict(stimXtest)

rfc2.best_estimator_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


RandomForestClassifier(class_weight='balanced_subsample', n_estimators=750)

In [33]:
rfc2trainypred = rfc2.predict(stimXtrain)
rfc2testypred = rfc2.predict(stimXtest)
f1_score(rfc2trainypred,stimYtrain,average='weighted'),f1_score(rfc2testypred,stimYtest,average='weighted')

(1.0, 0.9226314098543239)

In [34]:
standardmodel9 = rfc.best_estimator_
standardmodel10 = rfc2.best_estimator_

pkl.dump(standardmodel9,open('StandardModel9.pkl','wb'))
pkl.dump(standardmodel10,open('StandardModel10.pkl','wb'))

voting classifier

In [35]:
vtc = VotingClassifier([('model1',standardmodel1),('model2',standardmodel2),('model3',standardmodel3)
                        ,('model5',standardmodel5),('model6',standardmodel6),('model4',standardmodel4)
                        ,('model7',standardmodel7),('model8',standardmodel8),('model9',standardmodel9)
                        ,('model10',standardmodel10)],voting='soft',verbose = True,n_jobs=-1)
vtc.fit(stimXtrain,stimYtrain)

VotingClassifier(estimators=[('model1',
                              LogisticRegression(C=10, class_weight='auto',
                                                 max_iter=1000, solver='sag',
                                                 warm_start=True)),
                             ('model2',
                              LogisticRegression(C=0.1, class_weight='auto',
                                                 max_iter=1000, penalty='l1',
                                                 solver='saga', tol=0,
                                                 warm_start=True)),
                             ('model3',
                              LogisticRegression(C=1, class_weight='auto',
                                                 fit_intercept=False,
                                                 l1_ratio=0.2, max_iter=5000,
                                                 pe...
                              DecisionTreeClassifier(max_depth=5,
                    

In [36]:
vtcypred = vtc.predict(stimXtest)
vtcytrain = vtc.predict(stimXtrain)

print(f1_score(vtcytrain,stimYtrain,average='weighted'),f1_score(stimYtest,vtcypred,average='weighted'))


0.9488282087030745 0.8989299707303677


In [37]:
pkl.dump(vtc,open('vtc.pkl','wb'))

In [38]:
print(classification_report(stimYtrain, vtcytrain))

              precision    recall  f1-score   support

        high       0.92      0.98      0.95      1225
         low       0.98      0.92      0.95      1215

    accuracy                           0.95      2440
   macro avg       0.95      0.95      0.95      2440
weighted avg       0.95      0.95      0.95      2440



In [39]:
print(classification_report(stimYtest, vtcypred))

              precision    recall  f1-score   support

        high       0.85      0.96      0.90       402
         low       0.96      0.84      0.89       412

    accuracy                           0.90       814
   macro avg       0.91      0.90      0.90       814
weighted avg       0.91      0.90      0.90       814



In [40]:
confusion_matrix(stimYtrain, vtcytrain)

array([[1201,   24],
       [ 101, 1114]], dtype=int64)

In [41]:
confusion_matrix(stimYtest,vtcypred)

array([[387,  15],
       [ 67, 345]], dtype=int64)