# Libraries

In [1]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy import stats
import warnings


warnings.filterwarnings('ignore')

# Loading Dataset

In [2]:
# Load the raw sentence/label spreadsheet from the working directory.
data = pd.read_excel(io="dataset.xlsx")

In [3]:
data.shape

(29, 2)

# EDA

In [4]:
data.head()

Unnamed: 0,Sentence,Outcome
0,What is the difference between depression and ...,C
1,What are other psychiatric conditions that can...,C
2,Why is depression more prevalent in women than...,C
3,What happens during menopause with regards to ...,C
4,What efforts are underway to improve treatment...,C


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  29 non-null     object
 1   Outcome   29 non-null     object
dtypes: object(2)
memory usage: 592.0+ bytes


In [7]:
# Keep the column index of the raw frame; it is reused below to rebuild `data`.
cols=data.columns
cols

Index(['Sentence', 'Outcome'], dtype='object')

In [8]:
# Re-wrap the frame with an explicit column selection taken from `cols`.
data = pd.DataFrame(data=data, columns=cols)

In [9]:
import copy

# Work on an independent copy so the raw frame is never modified.
df_cpy = copy.deepcopy(data)

# Names of the columns whose dtype is not `object`.
cols = np.array(data.columns[data.dtypes != object])

# Cast every remaining (object-typed) column to plain Python strings.
for col_name in [c for c in df_cpy.columns if c not in cols]:
    df_cpy[col_name] = df_cpy[col_name].map(str)

In [10]:
df_cpy.head(20)

Unnamed: 0,Sentence,Outcome
0,What is the difference between depression and ...,C
1,What are other psychiatric conditions that can...,C
2,Why is depression more prevalent in women than...,C
3,What happens during menopause with regards to ...,C
4,What efforts are underway to improve treatment...,C
5,Why have researchers so far failed to identify...,C
6,Could other health conditions be contributing ...,C
7,Is depression treatable?,I
8,How is depression diagnosed and treated?,I
9,What is the most promising recent development ...,I


In [11]:
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

# Columns that are already non-object; their raw values are restored after encoding.
cols = np.array(data.columns[data.dtypes != object])

# One LabelEncoder per column name, created lazily on first access.
d = defaultdict(LabelEncoder)


def _label_encode(column):
    """Fit the encoder registered under this column's name and return its integer codes."""
    return d[column.name].fit_transform(column)


# Every column goes through its encoder here; the non-object columns are then
# overwritten with their original values from `data`.
df_cpy = df_cpy.apply(_label_encode)
df_cpy[cols] = data[cols]

In [12]:
# Column index of the encoded frame; reused below to rebuild `df_cpy`.
cols=df_cpy.columns

In [13]:
# Rebuild the encoded frame with an explicit column selection taken from `cols`.
df_cpy = pd.DataFrame(data=df_cpy, columns=cols)

In [14]:
len(df_cpy.columns)

2

In [15]:
# Pairwise correlations of the encoded columns, rows ordered by their
# correlation with the encoded label (highest first).
corr = df_cpy.corr().sort_values(by=["Outcome"], ascending=False)
print(corr["Outcome"])

Outcome     1.000000
Sentence   -0.040044
Name: Outcome, dtype: float64


# Feature Vector and Target Variable

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from scipy.stats import randint

In [17]:
# Target vector: the encoded label column.
y = df_cpy['Outcome'].values
# Feature matrix: every column except the label.
X = df_cpy.drop('Outcome', axis=1).values

In [18]:
# Randomly set aside 25% of the rows as X_test / y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# NOTE(review): the training split is replaced by the complete data set, so
# every X_test row is also a training row and scores computed on X_test are
# not a true hold-out estimate. Kept as-is because the later cells (cv=7 grid
# search) were run against all rows -- TODO revisit together with the CV setup.
X_train, y_train = X, y
X_train.shape, X_test.shape

((29, 1), (8, 1))

In [19]:
# Base estimator handed to the grid search.
rf = RandomForestClassifier(n_jobs=-1)

# Hyper-parameter grid: 4 * 6 * 6 * 9 = 1296 combinations.
# NOTE(review): random_state is searched like a hyper-parameter -- confirm this is intended.
params = dict(
    max_depth=[3, 5, 10, 20],
    min_samples_leaf=[2, 3, 4, 5, 7, 10],
    n_estimators=[3, 5, 7, 9, 10, 15],
    random_state=[2, 3, 4, 5, 7, 10, 15, 33, 40],
)

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
# Exhaustive search over `params`, scored by accuracy with 7-fold cross-validation.
grid_search = GridSearchCV(
    rf,
    params,
    scoring="accuracy",
    cv=7,
    n_jobs=-1,
    verbose=1,
)

In [22]:
# Run the cross-validated search over every grid candidate on (X_train, y_train).
grid_search.fit(X_train, y_train)

Fitting 7 folds for each of 1296 candidates, totalling 9072 fits


In [23]:
grid_search.best_score_

0.4571428571428572

In [24]:
# Keep the estimator that the grid search selected as best and display it.
rf_best = grid_search.best_estimator_
rf_best

In [None]:
# Random forest with hand-written hyper-parameters, fitted on (X_train, y_train).
# NOTE(review): these values are hard-coded rather than read from
# grid_search.best_params_ -- presumably copied from an earlier run; confirm.
classifier_rf = RandomForestClassifier(
    n_estimators=3,
    max_depth=3,
    min_samples_leaf=5,
    random_state=5,
    n_jobs=-1,
)
classifier_rf.fit(X_train, y_train)

In [None]:
from sklearn.tree import plot_tree

# Draw the second tree (index 1) of the best forest on a wide 20x5 canvas.
fig, ax = plt.subplots(figsize=(20, 5))
plot_tree(
    rf_best.estimators_[1],
    feature_names=['Sentence'],
    class_names=['C', 'I', 'O', 'P'],
    filled=True,
    ax=ax,
);

In [None]:
# Draw the third tree (index 2) of the best forest on a 20x7 canvas.
fig, ax = plt.subplots(figsize=(20, 7))
plot_tree(
    rf_best.estimators_[2],
    feature_names=['Sentence'],
    class_names=['C', 'I', 'O', 'P'],
    filled=True,
    ax=ax,
);

In [None]:
# Draw the first tree (index 0) of the best forest on a 20x5 canvas.
fig, ax = plt.subplots(figsize=(20, 5))
plot_tree(
    rf_best.estimators_[0],
    feature_names=['Sentence'],
    class_names=['C', 'I', 'O', 'P'],
    filled=True,
    ax=ax,
);

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score

In [None]:
# Confusion matrix of the hand-configured forest on the X_test rows.
confusion_matrix(y_true=y_test, y_pred=classifier_rf.predict(X_test))

In [None]:
# Random-forest predictions on X_test and their plain accuracy, printed as a percentage.
Y_pred = classifier_rf.predict(X_test)
acc_sc = accuracy_score(y_true=y_test, y_pred=Y_pred)
print("{0:.1f}".format(100 * acc_sc))

In [None]:
# Balanced accuracy of the random-forest predictions, shown as a percentage.
bal_acc_sc = balanced_accuracy_score(y_true=y_test, y_pred=Y_pred)
100 * bal_acc_sc

In [None]:
# import pickle

# with open('PICO_Classifier_RF','wb') as f:
#     pickle.dump(classifier_rf,f)


In [None]:
from mlxtend.plotting import plot_decision_regions

def plotSVC(title, clf=None):
    """Plot the decision regions of a fitted classifier on the test rows.

    Parameters
    ----------
    title : str
        Text appended to ``'SVM '`` to form the figure title.
    clf : fitted classifier, optional
        Classifier whose decision regions are drawn. When omitted, the
        notebook-level variable ``svc`` is used, which keeps existing
        ``plotSVC(title)`` calls working.

    Raises
    ------
    NameError
        If ``clf`` is omitted and no global ``svc`` has been defined yet.
    """
    if clf is None:
        # Backward-compatible fallback: earlier callers relied on the global
        # `svc` assigned by the kernel loop before calling this function.
        clf = svc
    plot_decision_regions(X_test, y_test, clf=clf, legend=2)

    # Adding axes annotations
    plt.title(f'SVM {title}')
    plt.show()

In [None]:
from sklearn.svm import SVC

# Fit one SVC per kernel, plot its decision regions and report its accuracy.
kernels = ["linear", "rbf", "poly"]
for kernel in kernels:
    # `svc` stays a notebook-level name because plotSVC reads it.
    svc = SVC(C=0.05,kernel=kernel).fit(X_train, y_train)
    plotSVC("kernel=" + str(kernel))
    # Score this kernel's own predictions. The previous code scored `Y_pred`,
    # which holds the random-forest predictions, so every kernel printed the
    # same unrelated number.
    svc_pred = svc.predict(X_test)
    print("Accuracy:",accuracy_score(y_test,svc_pred))