In [None]:

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
import pandas as pd
import sklearn
from imblearn.combine import SMOTETomek
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score,classification_report
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
pd.set_option('display.max_columns', 500)
import pickle as pkl

In [45]:

df1 = pd.read_csv('prepeddata.csv',index_col='Unnamed: 0')
df1['stimulant'] = df1['stimulant'].apply(lambda x: 'low' if x <= 2 else 'high')

RandomForestClassifier()


RandomForestClassifier()

In [46]:
stimtestdata = df1.drop(['hallucinagen','depressant'],axis = 1)
stimx = df1.drop('stimulant',axis = 1)
stimy = df1.stimulant
stimx

sm = SMOTE()
tl = TomekLinks()
sampling = SMOTETomek(sampling_strategy='auto', random_state=None, smote=sm, tomek=tl, n_jobs=1)

stimx, stimy = sampling.fit_resample(stimx, stimy)

ValueError: When 'sampling_strategy' is a float, it should be in the range (0, 1]. Got 1.33 instead.

In [47]:
stimXtrain,stimXtest,stimYtrain,stimYtest = train_test_split(stimx,stimy)

In [48]:
stimscaler = StandardScaler()
stimXtrain = stimscaler.fit_transform(stimXtrain)
stimXtest = stimscaler.transform(stimXtest)
stimYtrain

44       low
744      low
661      low
1386     low
631      low
        ... 
163      low
686     high
1772     low
1113     low
1231    high
Name: stimulant, Length: 1413, dtype: object

Logistic regression classifiers

In [49]:
logregparam_grid = {'C': [0.001,0.1,1, 10],'max_iter' :[1000],'class_weight':['auto'],'warm_start' :[True,False], 'solver': ['liblinear', 'rbf','sag']}
mod = GridSearchCV(LogisticRegression(), logregparam_grid, n_jobs=-1, cv=5, verbose=1)
mod.fit(stimXtrain,stimYtrain)
stimtrianlogypred = mod.predict(stimXtrain)
stimtestlogypred = mod.predict(stimXtest)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    2.2s finished


In [50]:
standardmodel1 = mod.best_estimator_
standardmodel1

LogisticRegression(C=0.1, class_weight='auto', max_iter=1000, solver='sag',
                   warm_start=True)

In [51]:
f1_score(stimYtrain,stimtrianlogypred,average='weighted'),f1_score(stimYtest,stimtestlogypred,average='weighted')

(0.8858617391579917, 0.8583139207968512)

In [52]:
logregparam_grid2 = {'tol': [0,0.0001,0.001,0.1,1]
                     ,'C': [0,0.001,0.1,1]
                     ,'penalty': ['l1','l2','elasticnet']
                     ,'max_iter' :[1000],'class_weight':['auto']
                     ,'warm_start' :[True]
                     , 'solver': ['lbfgs','liblinear', 'rbf','sag','elasticnet','saga',]
                     ,'dual':[False,True]
                    }
mod2 = GridSearchCV(LogisticRegression(), logregparam_grid2, n_jobs=-1, cv=3, verbose=1)
mod2.fit(stimXtrain,stimYtrain)
stimtrianlog2ypred = mod2.predict(stimXtrain)
stimtestlog2ypred = mod2.predict(stimXtest)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 720 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1580 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:    1.4s finished


In [53]:
standardmodel2 = mod2.best_estimator_
standardmodel2

LogisticRegression(C=1, class_weight='auto', max_iter=1000, penalty='l1',
                   solver='saga', tol=0, warm_start=True)

In [54]:
f1_score(stimYtrain,stimtrianlog2ypred,average='weighted'),f1_score(stimYtest,stimtestlog2ypred,average='weighted')

(0.8869145335496375, 0.8627529639933966)

In [55]:
logregparam_grid3 = {'tol': [0,0.0001,0.001,0.1,1]
                     ,'C': [0,0.001,0.1,1]
                     ,'penalty': ['l1','l2','elasticnet']
                     ,'max_iter' :[5000],'class_weight':['auto']
                     
                     , 'solver': ['lbfgs','liblinear', 'rbf','sag','elasticnet','saga',]
                     ,'dual':[False,True],"l1_ratio":[None,0.01,0.1,0.2,0.3,0.4]
                    ,'fit_intercept':[True,False]}
mod3 = GridSearchCV(LogisticRegression(), logregparam_grid3, n_jobs=-1, cv=5, verbose=0)
mod3.fit(stimXtrain,stimYtrain)
stimtrianlog3ypred = mod3.predict(stimXtrain)
stimtestlog3ypred = mod3.predict(stimXtest)

KeyboardInterrupt: 

In [None]:
standardmodel3 = mod3.best_estimator_
standardmodel3

In [None]:
f1_score(stimYtrain,stimtrianlog3ypred,average='weighted'),f1_score(stimYtest,stimtestlog3ypred,average='weighted')


In [None]:
classification_report(stimYtrain,stimtrianlog3ypred)

In [None]:
pkl.dump(standardmodel1,open('StandardModel1.pkl','wb'))
pkl.dump(standardmodel2,open('StandardModel2.pkl','wb'))
pkl.dump(standardmodel3,open('StandardModel3.pkl','wb'))

Decision Trees

In [None]:
parameters={'min_samples_split' : range(10,500,45),'max_depth': range(1,20,1),'ccp_alpha':[x/300 for x in list(range(0,60,1))],'criterion':['gini']}
dtc = GridSearchCV(DecisionTreeClassifier(DecisionTreeClassifier()),param_grid=parameters,verbose=True,n_jobs=-1)
dtc.fit(stimXtrain,stimYtrain)
dtcytrainpred = dtc.predict(stimXtrain)
dtcytestpred = dtc.predict(stimXtest)

In [None]:
f1_score(stimYtrain,dtcytrainpred,average='weighted'),f1_score(stimYtest,dtcytestpred,average='weighted')

In [None]:
parameters={'min_samples_split' : range(10,500,45),'ccp_alpha':[x/60 for x in list(range(0,60,1))],'criterion':['entropy','gini']}
dtc2 = GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
dtc2.fit(stimXtrain,stimYtrain)
dtcytrainpred = dtc2.predict(stimXtrain)
dtcytestpred = dtc2.predict(stimXtest)

In [None]:
f1_score(stimYtrain,dtcytrainpred,average='weighted'),f1_score(stimYtest,dtcytestpred,average='weighted')


In [None]:
parameters={'min_impurity_decrease':[0+x/100 for x in range(1,30,1)],'ccp_alpha':[x/60 for x in list(range(0,60,1))],'criterion':['entropy','gini']}
dtc3 = GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
dtc3.fit(stimXtrain,stimYtrain)
dtcytrainpred = dtc3.predict(stimXtrain)
dtcytestpred = dtc3.predict(stimXtest)

In [None]:
f1_score(stimYtrain,dtcytrainpred,average='weighted'),f1_score(stimYtest,dtcytestpred,average='weighted')


In [None]:
standardmodel4 = dtc.best_estimator_
standardmodel5 = dtc2.best_estimator_
standardmodel6 = dtc3.best_estimator_

pkl.dump(standardmodel4,open('StandardModel4.pkl','wb'))
pkl.dump(standardmodel5,open('StandardModel5.pkl','wb'))
pkl.dump(standardmodel6,open('StandardModel6.pkl','wb'))




Knearest

In [None]:
# knn = KNeighborsClassifier(n_neighbors=43,weights='uniform')
# knn.fit(stimXtrain,stimYtrain)
# knntrainypred=knn.predict(stimXtrain)
# knntestypred=knn.predict(stimXtest)
# f1_score(stimYtrain,knntrainypred,average='weighted'),f1_score(stimYtest,knntestypred,average='weighted')
parameters={'n_neighbors':range(1,50)}
knn = GridSearchCV(KNeighborsClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
knn.fit(stimXtrain,stimYtrain)
knnytrainpred = knn.predict(stimXtrain)
knnytestpred = knn.predict(stimXtest)



In [None]:
parameters={'n_neighbors':range(50,1130)}
knn2 = GridSearchCV(KNeighborsClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
knn2.fit(stimXtrain,stimYtrain)
knn2ytrainpred = knn2.predict(stimXtrain)
knn2ytestpred = knn2.predict(stimXtest)


In [None]:
standardmodel7 = knn.best_estimator_
standardmodel8 = knn2.best_estimator_

pkl.dump(standardmodel7,open('StandardModel7.pkl','wb'))
pkl.dump(standardmodel8,open('StandardModel8.pkl','wb'))

Random Forest

In [None]:
# rfc = RandomForestClassifier(criterion='gini',ccp_alpha=0.005,n_estimators=1000,max_features=6,class_weight='balanced',max_depth=6,oob_score=True)
# rfc.fit(stimXtrain,stimYtrain)
parameters={'ccp_alpha':[x/70 for x in list(range(0,30,1))],'class_weight':['balanced_subsample'],'criterion':['entropy'],'n_estimators':[100,300,500,750]}
rfc = GridSearchCV(RandomForestClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
rfc.fit(stimXtrain,stimYtrain)
rfcytrainpred = rfc.predict(stimXtrain)
rfcytestpred = rfc.predict(stimXtest)



In [None]:
rfctrainypred = rfc.predict(stimXtrain)
rfctestypred = rfc.predict(stimXtest)
f1_score(rfctrainypred,stimYtrain,average='weighted'),f1_score(rfctestypred,stimYtest,average='weighted')

In [None]:
parameters={'ccp_alpha':[x/70 for x in list(range(0,10,1))]
                         ,'class_weight':['balanced_subsample'],'criterion':['gini']
                         ,'n_estimators':[100,300,500,750]}
rfc2 = GridSearchCV(RandomForestClassifier(),param_grid=parameters,verbose=True,n_jobs=-1)
rfc2.fit(stimXtrain,stimYtrain)
rfc2ytrainpred = rfc2.predict(stimXtrain)
rfc2ytestpred = rfc2.predict(stimXtest)

rfc2.best_estimator_

In [None]:
rfc2trainypred = rfc2.predict(stimXtrain)
rfc2testypred = rfc2.predict(stimXtest)
f1_score(rfc2trainypred,stimYtrain,average='weighted'),f1_score(rfc2testypred,stimYtest,average='weighted')

In [36]:
standardmodel9 = rfc.best_estimator_
standardmodel10 = rfc2.best_estimator_

pkl.dump(standardmodel9,open('StandardModel9.pkl','wb'))
pkl.dump(standardmodel10,open('StandardModel10.pkl','wb'))

voting classifier

In [59]:
vtc = VotingClassifier([('model1',standardmodel1),('model2',standardmodel2),('model3',standardmodel3)
                        ,('model5',standardmodel5),('model6',standardmodel6)
                        ,('model7',standardmodel7),('model8',standardmodel8),('model9',standardmodel9)
                        ,('model10',standardmodel10)],voting='soft',verbose = True,n_jobs=-1)
vtc.fit(stimXtrain,stimYtrain)

VotingClassifier(estimators=[('model1',
                              LogisticRegression(C=0.1, class_weight='auto',
                                                 max_iter=1000, solver='sag',
                                                 warm_start=True)),
                             ('model2',
                              LogisticRegression(C=1, class_weight='auto',
                                                 max_iter=1000, penalty='l1',
                                                 solver='saga', tol=0,
                                                 warm_start=True)),
                             ('model3',
                              LogisticRegression(C=0.1, class_weight='auto',
                                                 max_iter=5000, solver='saga',
                                                 tol=0.1)),
                             ('model5',...
                                                     min_impurity_decrease=0.01)),
                       

In [60]:
vtcypred = vtc.predict(stimXtest)
vtcytrain = vtc.predict(stimXtrain)

print(f1_score(vtcytrain,stimYtrain,average='weighted'),f1_score(stimYtest,vtcypred,average='weighted'))


0.9541241111481806 0.8667356015236592


In [61]:
pkl.dump(vtc,open('vtc.pkl','wb'))

In [40]:
print(classification_report(stimYtrain, vtcytrain))

              precision    recall  f1-score   support

        high       0.93      0.98      0.95      1220
         low       0.98      0.92      0.95      1218

    accuracy                           0.95      2438
   macro avg       0.95      0.95      0.95      2438
weighted avg       0.95      0.95      0.95      2438



In [41]:
print(classification_report(stimYtest, vtcypred))

              precision    recall  f1-score   support

        high       0.87      0.97      0.92       407
         low       0.97      0.85      0.91       406

    accuracy                           0.91       813
   macro avg       0.92      0.91      0.91       813
weighted avg       0.92      0.91      0.91       813



In [42]:
confusion_matrix(stimYtrain, vtcytrain)

array([[1193,   27],
       [  93, 1125]])

In [43]:
confusion_matrix(stimYtest,vtcypred)

array([[396,  11],
       [ 60, 346]])