In [1]:

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
import pandas as pd
import sklearn
from imblearn.combine import SMOTETomek
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score,classification_report
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
pd.set_option('display.max_columns', 500)
import pickle as pkl

In [2]:

df1 = pd.read_csv('prepeddata.csv',index_col='Unnamed: 0')
df1['stimulant'] = df1['stimulant'].apply(lambda x: 'low' if x <= 2 else 'high')

RandomForestClassifier()


RandomForestClassifier()

In [4]:
stimtestdata = df1.drop(['hallucinagen','depressant'],axis = 1)
stimx = df1.drop('stimulant',axis = 1)
stimy = df1.stimulant
stimx

sm = SMOTE()
tl = TomekLinks()
sampling = SMOTETomek(sampling_strategy='auto', random_state=None, smote=sm, tomek=tl, n_jobs=1)

stimx, stimy = sampling.fit_resample(stimx, stimy)

In [5]:
stimXtrain,stimXtest,stimYtrain,stimYtest = train_test_split(stimx,stimy)

In [6]:
stimscaler = StandardScaler()
stimXtrain = stimscaler.fit_transform(stimXtrain)
stimXtest = stimscaler.transform(stimXtest)
stimYtrain

2490    high
2479    high
606     high
2500    high
195      low
        ... 
947      low
435     high
726      low
785      low
3245    high
Name: stimulant, Length: 2439, dtype: object

Logistic regression classifiers

In [7]:
logregparam_grid = {'C': [0.001,0.1,1, 10],'max_iter' :[1000],'class_weight':['auto'],'warm_start' :[True,False], 'solver': ['liblinear', 'rbf','sag']}
mod = GridSearchCV(LogisticRegression(), logregparam_grid, n_jobs=-1, cv=5, verbose=1)
mod.fit(stimXtrain,stimYtrain)
stimtrianlogypred = mod.predict(stimXtrain)
stimtestlogypred = mod.predict(stimXtest)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    1.6s finished


In [8]:
standardmodel1 = mod.best_estimator_
standardmodel1

LogisticRegression(C=0.1, class_weight='auto', max_iter=1000, solver='sag',
                   warm_start=True)

In [9]:
f1_score(stimYtrain,stimtrianlogypred,average='weighted'),f1_score(stimYtest,stimtestlogypred,average='weighted')

(0.8596301515663851, 0.8460640704366895)

In [10]:
logregparam_grid2 = {'tol': [0,0.0001,0.001,0.1,1, 10]
                     ,'C': [0,0.001,0.1,1, 10]
                     ,'penalty': ['l1','l2','elasticnet']
                     ,'max_iter' :[1000],'class_weight':['auto']
                     ,'warm_start' :[True,False]
                     , 'solver': ['lbfgs','liblinear', 'rbf','sag','elasticnet','saga',]
                     ,'dual':[False,True],"l1_ratio":[None,0.01,0.1,0.2,0.3,0.4]
                    ,'fit_intercept':[True,False]}
mod2 = GridSearchCV(LogisticRegression(), logregparam_grid2, n_jobs=-1, cv=5, verbose=1)
mod2.fit(stimXtrain,stimYtrain)
stimtrianlog2ypred = mod2.predict(stimXtrain)
stimtestlog2ypred = mod2.predict(stimXtest)

Fitting 5 folds for each of 25920 candidates, totalling 129600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 3560 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 19560 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 35530 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 56656 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 65592 tasks      | elapsed:   54.0s
[Parallel(n_jobs=-1)]: Done 87640 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 114912 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 129600 out of 129600 | elapsed:  2.0min finished


In [11]:
standardmodel2 = mod2.best_estimator_
standardmodel2

LogisticRegression(C=10, class_weight='auto', fit_intercept=False,
                   max_iter=1000, penalty='l1', solver='saga', tol=10,
                   warm_start=True)

In [12]:
f1_score(stimYtrain,stimtrianlog2ypred,average='weighted'),f1_score(stimYtest,stimtestlog2ypred,average='weighted')

(0.8550900913885211, 0.8428046435966065)

In [13]:
logregparam_grid3 = {'tol': [0,0.0001,0.001,0.1,1, 10]
                     ,'C': [0,0.001,0.1,1, 10]
                     ,'penalty': ['l1','l2','elasticnet']
                     ,'max_iter' :[1000],'class_weight':['auto']
                     ,'warm_start' :[True,False]
                     , 'solver': ['lbfgs','liblinear', 'rbf','sag','elasticnet','saga',]
                     ,'dual':[False,True],"l1_ratio":[None,0.01,0.1,0.2,0.3,0.4]
                    ,'fit_intercept':[True,False]}
mod3 = GridSearchCV(LogisticRegression(), logregparam_grid3, n_jobs=-1, cv=5, verbose=0)
mod3.fit(stimXtrain,stimYtrain)
stimtrianlog3ypred = mod3.predict(stimXtrain)
stimtestlog3ypred = mod3.predict(stimXtest)



In [14]:
standardmodel3 = mod3.best_estimator_
standardmodel3

LogisticRegression(C=10, class_weight='auto', fit_intercept=False, l1_ratio=0.2,
                   max_iter=1000, solver='saga', tol=1)

In [15]:
f1_score(stimYtrain,stimtrianlog3ypred,average='weighted'),f1_score(stimYtest,stimtestlog3ypred,average='weighted')


(0.8538928118779693, 0.8431395574459944)

In [16]:
classification_report(stimYtrain,stimtrianlog3ypred)

'              precision    recall  f1-score   support\n\n        high       0.82      0.91      0.86      1225\n         low       0.90      0.79      0.84      1214\n\n    accuracy                           0.85      2439\n   macro avg       0.86      0.85      0.85      2439\nweighted avg       0.86      0.85      0.85      2439\n'

In [17]:
pkl.dump(standardmodel1,open('StandardModel1.pkl','wb'))
pkl.dump(standardmodel2,open('StandardModel2.pkl','wb'))
pkl.dump(standardmodel3,open('StandardModel3.pkl','wb'))

Decision Trees

In [85]:
parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,1),'ccp_alpha':[x/60 for x in list(range(0,60,1))],'max_features':range(1,30,1),'criterion':['gini']}
dtc = GridSearchCV(DecisionTreeClassifier(DecisionTreeClassifier()),param_grid=parameters,verbose=True,n_jobs=-1)
dtc.fit(stimXtrain,stimYtrain)
dtcytrainpred = dtc.predict(stimXtrain)
dtcytestpred = dtc.predict(stimXtest)



Fitting 5 folds for each of 826500 candidates, totalling 4132500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 1420 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 4452 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 8652 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done 19116 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 32316 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 47916 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 65916 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 86316 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 109116 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 134316 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 161916 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 191916 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 224316 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done

In [101]:
f1_score(stimYtrain,dtcytrainpred,average='weighted'),f1_score(stimYtest,dtcytestpred,average='weighted')

(0.5344461165030636, 0.4984071172586556)

In [87]:
parameters={'min_samples_split' : range(10,500,20),'ccp_alpha':[x/200 for x in list(range(0,100,1))],'max_features':range(1,30,1),'criterion':['entropy','gini']}
dtc2 = GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
dtc2.fit(stimXtrain,stimYtrain)
dtcytrainpred = dtc2.predict(stimXtrain)
dtcytestpred = dtc2.predict(stimXtest)

Fitting 5 folds for each of 1653000 candidates, totalling 8265000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 2600 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 10600 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 21800 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 36200 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done 53800 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 74600 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 98600 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 125800 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 156200 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 189800 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 226600 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 266600 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 309800 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: D

In [100]:
f1_score(stimYtrain,dtcytrainpred,average='weighted'),f1_score(stimYtest,dtcytestpred,average='weighted')


(0.5344461165030636, 0.4984071172586556)

In [102]:
parameters={'min_impurity_decrease':[0+x/100 for x in range(1,100)],'ccp_alpha':[x/100 for x in range(0,100,1)],'criterion':['entropy','gini']}
dtc3 = GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
dtc3.fit(stimXtrain,stimYtrain)
dtcytrainpred = dtc3.predict(stimXtrain)
dtcytestpred = dtc3.predict(stimXtest)

Fitting 5 folds for each of 19800 candidates, totalling 99000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 1420 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 17292 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 39692 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 68492 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 98736 tasks      | elapsed:   46.8s
[Parallel(n_jobs=-1)]: Done 99000 out of 99000 | elapsed:   46.9s finished


In [103]:
f1_score(stimYtrain,dtcytrainpred,average='weighted'),f1_score(stimYtest,dtcytestpred,average='weighted')


(0.5344461165030636, 0.4984071172586556)

In [104]:
standardmodel4 = dtc.best_estimator_
standardmodel5 = dtc2.best_estimator_
standardmodel6 = dtc3.best_estimator_

pkl.dump(standardmodel4,open('StandardModel4.pkl','wb'))
pkl.dump(standardmodel5,open('StandardModel5.pkl','wb'))
pkl.dump(standardmodel6,open('StandardModel6.pkl','wb'))




Knearest

In [105]:
# knn = KNeighborsClassifier(n_neighbors=43,weights='uniform')
# knn.fit(stimXtrain,stimYtrain)
# knntrainypred=knn.predict(stimXtrain)
# knntestypred=knn.predict(stimXtest)
# f1_score(stimYtrain,knntrainypred,average='weighted'),f1_score(stimYtest,knntestypred,average='weighted')
parameters={'n_neighbors':range(1,50)}
knn = GridSearchCV(KNeighborsClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
knn.fit(stimXtrain,stimYtrain)
knnytrainpred = knn.predict(stimXtrain)
knnytestpred = knn.predict(stimXtest)



Fitting 5 folds for each of 49 candidates, totalling 245 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 245 out of 245 | elapsed:    0.6s finished


In [107]:
parameters={'n_neighbors':range(50,1130)}
knn2 = GridSearchCV(KNeighborsClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
knn2.fit(stimXtrain,stimYtrain)
knn2ytrainpred = knn2.predict(stimXtrain)
knn2ytestpred = knn2.predict(stimXtest)


Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 748 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 2748 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 5400 out of 5400 | elapsed:   23.2s finished


In [109]:
standardmodel7 = knn.best_estimator_
standardmodel8 = knn2.best_estimator_

pkl.dump(standardmodel7,open('StandardModel7.pkl','wb'))
pkl.dump(standardmodel8,open('StandardModel8.pkl','wb'))

Random Forest

In [15]:
# rfc = RandomForestClassifier(criterion='gini',ccp_alpha=0.005,n_estimators=1000,max_features=6,class_weight='balanced',max_depth=6,oob_score=True)
# rfc.fit(stimXtrain,stimYtrain)
parameters={'ccp_alpha':[x/10 for x in list(range(0,10,1))],'class_weight':['balanced_subsamples'],'criterion':['entropy'],'n_estimators':[100,200,300,500,1000]}
rfc = GridSearchCV(RandomForestClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
rfc.fit(stimXtrain,stimYtrain)
rfcytrainpred = rfc.predict(stimXtrain)
rfcytestpred = rfc.predict(stimXtest)



Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   45.5s finished


In [None]:
rfctrainypred = rfc.predict(stimXtrain)
rfctestypred = rfc.predict(stimXtest)
f1_score(rfctrainypred,stimYtrain,average='weighted'),f1_score(rfctestypred,stimYtest,average='weighted')

In [45]:
parameters={'ccp_alpha':[x/10 for x in list(range(0,10,1))]
                         ,'class_weight':['balanced_subsamples'],'criterion':['gini']
                         ,}
rfc2 = GridSearchCV(RandomForestClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
rfc2.fit(stimXtrain,stimYtrain)
rfc2ytrainpred = rfc2.predict(stimXtrain)
rfc2ytestpred = rfc2.predict(stimXtest)

rfc2.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.4s finished


RandomForestClassifier(class_weight={'high': 7, 'low': 1.57928703})

In [46]:
rfc2trainypred = rfc2.predict(stimXtrain)
rfc2testypred = rfc2.predict(stimXtest)
f1_score(rfc2trainypred,stimYtrain,average='weighted'),f1_score(rfc2testypred,stimYtest,average='weighted')

(1.0, 0.89798854709217)

In [18]:
standardmodel9 = rfc.best_estimator_
standardmodel10 = rfc2.best_estimator_

pkl.dump(standardmodel9,open('StandardModel9.pkl','wb'))
pkl.dump(standardmodel10,open('StandardModel10.pkl','wb'))

voting classifier

In [None]:
vtc = VotingClassifier([('model1',standardmodel1),('model2',standardmodel2),('model3',standardmodel3)
                        ,('model5',standardmodel5),('model6',polymodel6)
                        ,('polymodel7',polymodel7),('polymodel8',polymodel8),('polymodel9',polymodel9)
                        ,('polymodel10',polymodel10)],voting='soft',verbose = True,n_jobs=-1)
vtc.fit(stimXtrain,stimYtrain)

In [24]:
vtcypred = vtc.predict(stimXtest)
vtcytrain = vtc.predict(stimXtrain)

print(f1_score(vtcytrain,stimYtrain,average='weighted'),f1_score(stimYtest,vtcypred,average='weighted'))


VotingClassifier(estimators=[('dtc2',
                              DecisionTreeClassifier(ccp_alpha=0.0052,
                                                     criterion='entropy',
                                                     max_depth=50)),
                             ('rfc2',
                              RandomForestClassifier(ccp_alpha=0.01,
                                                     class_weight='balanced',
                                                     criterion='entropy',
                                                     n_estimators=10000,
                                                     oob_score=True)),
                             ('knn', KNeighborsClassifier(n_neighbors=43)),
                             ('knn2', KNeighborsClassifier(n_neighbors=11)),
                             ('log',
                              LogisticRegres...
                              LogisticRegression(class_weight='balanced',
                                 

In [25]:
vtcypred = vtc.predict(stimXtest)
vtcytrain = vtc.predict(stimXtrain)

print(f1_score(vtcytrain,stimYtrain,average='weighted'),f1_score(stimYtest,vtcypred,average='weighted'))


0.6260190582728704 0.6084518184576705


In [26]:
pkl.dump(vtc,open('vtc.pkl','wb'))