In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from collections import Counter

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
import xgboost as xgb
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc

In [2]:
df = pd.read_csv('./PCOS.csv')

df.columns = ['time','diagnosed','result','age','overweight','weightgain','periods','conceiving','chinHair','cheeksHair',
              'upperLipHair','betweenBreastHair','armsHair','innerThighHair','acneOrskinTag','hairThinning','darkPatch',
              'tiredness','moodSwings','exercise','eatOutside','cannedFood','city']

data = pd.DataFrame(df[df.diagnosed == 'Yes'])
data.reset_index(drop = True, inplace = True)
data.drop('time', axis = 1, inplace = True)
data.weightgain.fillna('abc', inplace = True)

In [3]:
def g(s):
    if (s == 'Yes'):
        return 1
    if (s == 'No'):
        return 0
    if (s == 'YES'):
        return 1
    if (s == 'NO'):
        return 0
    if (s == 'abc'):
        return 2

In [4]:
data['Diagnosed'] = data.diagnosed.apply(g)
data['Overweight'] = data.overweight.apply(g)
data['Weightgain'] = data.weightgain.apply(g)
data['Periods'] = data.periods.apply(g)
data['Conceiving'] = data.conceiving.apply(g)
data['AcneOrskinTag'] = data.acneOrskinTag.apply(g)
data['HairThinning'] = data.hairThinning.apply(g)
data['DarkPatch'] = data.darkPatch.apply(g)
data['Tiredness'] = data.tiredness.apply(g)
data['MoodSwings'] = data.moodSwings.apply(g)
data['CannedFood'] = data.cannedFood.apply(g)
data['City'] = data.city.apply(g)

del data['diagnosed']
del data['overweight']
del data['weightgain']
del data['periods']
del data['conceiving']
del data['acneOrskinTag']
del data['hairThinning']
del data['darkPatch']
del data['tiredness']
del data['moodSwings']
del data['cannedFood']
del data['city']

In [5]:
data['result'] = data['result'].map({'Yes': 1, 'No': 0, 'Yes(Detected Positive)': 1, 'No(Detected Negative)': 0})

In [6]:
X = data.drop(labels = ["result"],axis = 1)
y = data.result

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [8]:
ada_clf = AdaBoostClassifier(random_state=42)
ada_clf.fit(X_train,y_train)
acc_ada_clf_train = round(ada_clf.score(X_train, y_train)*100,2) 
acc_ada_clf_test = round(ada_clf.score(X_test,y_test)*100,2)
print("Training Accuracy: % {}".format(acc_ada_clf_train))
print("Testing Accuracy: % {}".format(acc_ada_clf_test))

Training Accuracy: % 96.49
Testing Accuracy: % 85.71


In [9]:
gb_clf = GradientBoostingClassifier(learning_rate=0.01,random_state=42)
gb_clf.fit(X_train,y_train)
acc_gb_clf_train = round(gb_clf.score(X_train, y_train)*100,2) 
acc_gb_clf_test = round(gb_clf.score(X_test,y_test)*100,2)
print("Training Accuracy: % {}".format(acc_gb_clf_train))
print("Testing Accuracy: % {}".format(acc_gb_clf_test))

Training Accuracy: % 88.6
Testing Accuracy: % 79.59


In [10]:
xgb_clf = xgb.XGBClassifier(random_state=42,learning_rate=0.01)
xgb_clf.fit(X_train,y_train)
acc_xgb_clf_train = round(xgb_clf.score(X_train, y_train)*100,2) 
acc_xgb_clf_test = round(xgb_clf.score(X_test,y_test)*100,2)
print("Training Accuracy: % {}".format(acc_xgb_clf_train))
print("Testing Accuracy: % {}".format(acc_xgb_clf_test))

Training Accuracy: % 95.61
Testing Accuracy: % 85.71


In [11]:
cat_clf = CatBoostClassifier(learning_rate=0.01, l2_leaf_reg=5)
cat_clf.fit(X_train,y_train)
acc_cat_clf_train = round(cat_clf.score(X_train, y_train)*100,2) 
acc_cat_clf_test = round(cat_clf.score(X_test,y_test)*100,2)
print("Training Accuracy: % {}".format(acc_cat_clf_train))
print("Testing Accuracy: % {}".format(acc_cat_clf_test))

0:	learn: 0.6866574	total: 136ms	remaining: 2m 16s
1:	learn: 0.6808684	total: 138ms	remaining: 1m 8s
2:	learn: 0.6758406	total: 139ms	remaining: 46.1s
3:	learn: 0.6717942	total: 140ms	remaining: 34.8s
4:	learn: 0.6676064	total: 140ms	remaining: 27.9s
5:	learn: 0.6614378	total: 141ms	remaining: 23.4s
6:	learn: 0.6550329	total: 142ms	remaining: 20.2s
7:	learn: 0.6501065	total: 143ms	remaining: 17.7s
8:	learn: 0.6436460	total: 144ms	remaining: 15.9s
9:	learn: 0.6382613	total: 145ms	remaining: 14.4s
10:	learn: 0.6334181	total: 146ms	remaining: 13.1s
11:	learn: 0.6289416	total: 147ms	remaining: 12.1s
12:	learn: 0.6234268	total: 148ms	remaining: 11.3s
13:	learn: 0.6186504	total: 150ms	remaining: 10.5s
14:	learn: 0.6136100	total: 151ms	remaining: 9.89s
15:	learn: 0.6092543	total: 152ms	remaining: 9.33s
16:	learn: 0.6042086	total: 153ms	remaining: 8.83s
17:	learn: 0.5993197	total: 154ms	remaining: 8.39s
18:	learn: 0.5946578	total: 155ms	remaining: 8s
19:	learn: 0.5909176	total: 156ms	remaining

222:	learn: 0.2179777	total: 478ms	remaining: 1.67s
223:	learn: 0.2174103	total: 479ms	remaining: 1.66s
224:	learn: 0.2164128	total: 480ms	remaining: 1.65s
225:	learn: 0.2151739	total: 524ms	remaining: 1.8s
226:	learn: 0.2147760	total: 555ms	remaining: 1.89s
227:	learn: 0.2139847	total: 556ms	remaining: 1.88s
228:	learn: 0.2125060	total: 556ms	remaining: 1.87s
229:	learn: 0.2121202	total: 557ms	remaining: 1.86s
230:	learn: 0.2116418	total: 558ms	remaining: 1.86s
231:	learn: 0.2109396	total: 559ms	remaining: 1.85s
232:	learn: 0.2102188	total: 560ms	remaining: 1.84s
233:	learn: 0.2092872	total: 561ms	remaining: 1.83s
234:	learn: 0.2085923	total: 562ms	remaining: 1.83s
235:	learn: 0.2079368	total: 563ms	remaining: 1.82s
236:	learn: 0.2071013	total: 569ms	remaining: 1.83s
237:	learn: 0.2060326	total: 570ms	remaining: 1.82s
238:	learn: 0.2053004	total: 571ms	remaining: 1.82s
239:	learn: 0.2049356	total: 572ms	remaining: 1.81s
240:	learn: 0.2041794	total: 573ms	remaining: 1.8s
241:	learn: 0.

427:	learn: 0.1110498	total: 828ms	remaining: 1.11s
428:	learn: 0.1107965	total: 829ms	remaining: 1.1s
429:	learn: 0.1105226	total: 831ms	remaining: 1.1s
430:	learn: 0.1101735	total: 832ms	remaining: 1.1s
431:	learn: 0.1099408	total: 833ms	remaining: 1.09s
432:	learn: 0.1095624	total: 834ms	remaining: 1.09s
433:	learn: 0.1092798	total: 835ms	remaining: 1.09s
434:	learn: 0.1090191	total: 836ms	remaining: 1.09s
435:	learn: 0.1086876	total: 837ms	remaining: 1.08s
436:	learn: 0.1082052	total: 839ms	remaining: 1.08s
437:	learn: 0.1079191	total: 840ms	remaining: 1.08s
438:	learn: 0.1077559	total: 841ms	remaining: 1.07s
439:	learn: 0.1074883	total: 842ms	remaining: 1.07s
440:	learn: 0.1071733	total: 843ms	remaining: 1.07s
441:	learn: 0.1070042	total: 854ms	remaining: 1.08s
442:	learn: 0.1066329	total: 854ms	remaining: 1.07s
443:	learn: 0.1064039	total: 855ms	remaining: 1.07s
444:	learn: 0.1061461	total: 856ms	remaining: 1.07s
445:	learn: 0.1058863	total: 857ms	remaining: 1.06s
446:	learn: 0.1

687:	learn: 0.0567039	total: 1.16s	remaining: 528ms
688:	learn: 0.0565693	total: 1.16s	remaining: 526ms
689:	learn: 0.0564530	total: 1.17s	remaining: 524ms
690:	learn: 0.0563265	total: 1.17s	remaining: 522ms
691:	learn: 0.0562190	total: 1.17s	remaining: 520ms
692:	learn: 0.0561202	total: 1.17s	remaining: 518ms
693:	learn: 0.0560278	total: 1.17s	remaining: 516ms
694:	learn: 0.0558906	total: 1.17s	remaining: 514ms
695:	learn: 0.0557745	total: 1.17s	remaining: 512ms
696:	learn: 0.0556562	total: 1.17s	remaining: 510ms
697:	learn: 0.0555417	total: 1.17s	remaining: 508ms
698:	learn: 0.0554369	total: 1.17s	remaining: 506ms
699:	learn: 0.0553189	total: 1.18s	remaining: 504ms
700:	learn: 0.0552257	total: 1.18s	remaining: 502ms
701:	learn: 0.0551230	total: 1.18s	remaining: 500ms
702:	learn: 0.0550115	total: 1.18s	remaining: 498ms
703:	learn: 0.0548676	total: 1.18s	remaining: 496ms
704:	learn: 0.0547329	total: 1.18s	remaining: 494ms
705:	learn: 0.0546126	total: 1.18s	remaining: 492ms
706:	learn: 

910:	learn: 0.0372870	total: 1.52s	remaining: 148ms
911:	learn: 0.0372358	total: 1.52s	remaining: 147ms
912:	learn: 0.0371826	total: 1.52s	remaining: 145ms
913:	learn: 0.0371297	total: 1.52s	remaining: 143ms
914:	learn: 0.0370791	total: 1.52s	remaining: 141ms
915:	learn: 0.0370244	total: 1.52s	remaining: 140ms
916:	learn: 0.0369551	total: 1.52s	remaining: 138ms
917:	learn: 0.0368983	total: 1.52s	remaining: 136ms
918:	learn: 0.0368298	total: 1.53s	remaining: 135ms
919:	learn: 0.0367697	total: 1.53s	remaining: 133ms
920:	learn: 0.0367197	total: 1.53s	remaining: 131ms
921:	learn: 0.0366688	total: 1.53s	remaining: 129ms
922:	learn: 0.0365991	total: 1.53s	remaining: 128ms
923:	learn: 0.0365506	total: 1.53s	remaining: 126ms
924:	learn: 0.0364963	total: 1.53s	remaining: 124ms
925:	learn: 0.0364423	total: 1.53s	remaining: 123ms
926:	learn: 0.0363865	total: 1.53s	remaining: 121ms
927:	learn: 0.0363217	total: 1.53s	remaining: 119ms
928:	learn: 0.0362682	total: 1.54s	remaining: 117ms
929:	learn: 

In [12]:
from joblib import dump

In [14]:
dump(ada_clf, './savedModels/adaBoost.sav')

['./savedModels/adaBoost.sav']

In [15]:
dump(gb_clf, './savedModels/gradBoost.sav')

['./savedModels/gradBoost.sav']

In [16]:
dump(xgb_clf, './savedModels/xgBoost.sav')

['./savedModels/xgBoost.sav']

In [18]:
cat_clf.save_model('./savedModels/catBoost', format='cbm')

In [19]:
dump(cat_clf, './savedModels/catBoostJoblib.sav')

['./savedModels/catBoostJoblib.sav']