# data handling

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df=pd.read_csv('Clinical_Data.csv')
df.shape

(50, 19)

In [3]:
df['state'].value_counts()

1    37
0    13
Name: state, dtype: int64

In [4]:
print(df['Sex'].unique())

['F' 'M']


In [5]:
df['Sex'].value_counts()

F    26
M    24
Name: Sex, dtype: int64

In [6]:
#encoding Sex to f and m to 0 and 1
from sklearn.preprocessing import OneHotEncoder
df['Sex']=df['Sex'].astype('category')
df['Gen_new']=df['Sex'].cat.codes

In [7]:
enc=OneHotEncoder()

In [8]:
enc_data=pd.DataFrame(enc.fit_transform(df[['Gen_new']]).toarray())

In [9]:
# F=0 , M=1 
New_df=df.join(enc_data)
print(New_df)

    No.       Subject Age range (year) Sex   BSA  \
0     1  Subject C 01            15-20   F  1.63   
1     2  Subject C 02            15-20   M  2.13   
2     3  Subject C 03            20-25   F  1.67   
3     4  Subject C 04            25-30   M  2.02   
4     5  Subject C 05            35-40   F  1.48   
5     6  Subject C 06            25-30   F  1.45   
6     7  Subject C 07            30-35   M  1.82   
7     8  Subject C 08            30-35   M  1.92   
8     9  Subject C 09            30-35   M  1.97   
9    10  Subject C 10            30-35   F  1.73   
10   11  Subject C 11            30-35   F  1.63   
11   12  Subject C 12            30-35   M  1.66   
12   13  Subject C 13            25-30   F  1.37   
13    1    Subject 01            20-25   F  1.49   
14    2    Subject 02            25-30   F  1.45   
15    3    Subject 03            15-20   F  1.33   
16    4    Subject 04            20-25   M  1.71   
17    5    Subject 05            25-30   F  1.37   
18    6    S

In [10]:
New_df.drop(New_df.columns[[0, 1, 2,3,20,21]], axis=1, inplace=True) #one hot encoding added extra columns we don't need so i droped them also

In [11]:
cols = New_df.columns.tolist()

In [12]:
#changing the new column position to keep the data frame order
cols = cols[-1:] + cols[:-1]
cols

['Gen_new',
 'BSA',
 'LAEDV\n(Left Atrial End-Diastolic Volume)',
 'LAESV\n(Left Atrial End-Sistolic Volume)',
 'LASV\n(Left Atrial Stroke Volume)',
 'LAEDV\nIndex',
 'LAESV\nIndex',
 'LASV\nIndex',
 'LAEF\n(Left Atrial Ejection-Fraction)',
 'LVEF\n(Left Ventricle Ejection-Fraction)',
 'T2*\n(Heart)',
 'LAEDV\nIndex2',
 'LAESV\nIndex2',
 'LASV\nIndex2',
 'LAEF',
 'state']

In [13]:
#Merging the new encoded column to the data frame
New_df = New_df[cols]  

In [14]:
print(New_df)

    Gen_new   BSA  LAEDV\n(Left Atrial End-Diastolic Volume)  \
0         0  1.63                                       41.0   
1         1  2.13                                       40.0   
2         0  1.67                                       42.0   
3         1  2.02                                       37.0   
4         0  1.48                                       48.8   
5         0  1.45                                       35.0   
6         1  1.82                                       59.0   
7         1  1.92                                       51.6   
8         1  1.97                                       52.0   
9         0  1.73                                       29.0   
10        0  1.63                                       50.0   
11        1  1.66                                       57.0   
12        0  1.37                                       34.0   
13        0  1.49                                       40.0   
14        0  1.45                       

# splitting the data into train and test

In [15]:
X = New_df.iloc[:, :-1].values #features
y = New_df.iloc[:, -1].values #state column 

In [25]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1])

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# SVC

In [66]:
from sklearn import svm
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='poly', C=2,degree=2,probability=True)

In [67]:
#cross-validating the data
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X, y, cv=2)
scores    

array([0.72, 0.72])

In [68]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))


0.72 accuracy with a standard deviation of 0.00


In [72]:
clf.fit(X,y)
y_pred_train0=clf.predict(X_train)
y_pred_test0=clf.predict(X_test)

from sklearn.metrics import accuracy_score
train_accuracy0=accuracy_score(y_train,y_pred_train0)
test_accuracy0=accuracy_score(y_test,y_pred_test0)
print('train accuracy',train_accuracy0)
print('test accuracy',test_accuracy0)

train accuracy 0.875
test accuracy 0.8


In [76]:
X_test[0]

array([ 1.  ,  1.28, 44.  , 21.6 , 22.4 , 34.38, 16.88, 17.5 , 50.91,
       58.  ,  4.  , 44.5 , 23.13, 21.37, 48.02])

In [74]:
clf.predict_proba(X_test[2].reshape(1, -1))

array([[0.46219496, 0.53780504]])

In [75]:
import pickle

# Save all the trained models in a list
with open('svcfinal.pkl', 'wb') as f:
  pickle.dump(clf, f)

# KNN

In [69]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, y_train)

KNeighborsClassifier()

In [70]:
y_pred_train=neigh.predict(X_train)
y_pred_test=neigh.predict(X_test)

from sklearn.metrics import accuracy_score
train_accuracy=accuracy_score(y_train,y_pred_train)
test_accuracy=accuracy_score(y_test,y_pred_test)
print('train accuracy',train_accuracy)
print('test accuracy',test_accuracy)

train accuracy 0.825
test accuracy 0.7


# RandomForestClassifier

In [371]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=8,max_depth=2, random_state=8)
rf.fit(X,y)

RandomForestClassifier(max_depth=2, n_estimators=8, random_state=8)

In [372]:
y_pred_train1=rf.predict(X_train)
y_pred_test1=rf.predict(X_test)

from sklearn.metrics import accuracy_score
train_accuracy1=accuracy_score(y_train,y_pred_train1)
test_accuracy1=accuracy_score(y_test,y_pred_test1)
print('train accuracy',train_accuracy1)
print('test accuracy',test_accuracy1)

train accuracy 0.925
test accuracy 0.9


# DecisionTreeClassifier

In [403]:
from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='gini',random_state=5,max_depth=2,splitter='random',max_leaf_nodes=5)
dt = dt.fit(X, y)

In [404]:
from sklearn.metrics import accuracy_score
y_pred_train2=dt.predict(X_train)
y_pred_test2=dt.predict(X_test)
train_accuracy2=accuracy_score(y_train,y_pred_train2)
test_accuracy2=accuracy_score(y_test,y_pred_test2)
print('train accuracy',train_accuracy2)
print('test accuracy',test_accuracy2)

train accuracy 0.825
test accuracy 0.8
