In [1]:
#opening the fixed table
import pandas as pd
df = pd.read_csv('fixed_table', low_memory=False)
df.drop(columns=df.columns[0], axis=1,inplace=True)
print(df.dtypes)

initial_weight        int64
age_at_diagnosis      int64
vital_status         object
age_at_index          int64
days_to_birth         int64
                      ...  
ENSG00000288669.1     int64
ENSG00000288670.1     int64
ENSG00000288671.1     int64
ENSG00000288674.1     int64
ENSG00000288675.1     int64
Length: 60670, dtype: object


In [2]:
#removing outliers (these patients have 1 and 0 days left until death, possibly skewing the results)
df = df.drop([15, 17])

In [3]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

#encoding data to be read as floats rather than objects
df['demographic_id'] = le.fit_transform(df['demographic_id'])
df['vital_status'] = le.fit_transform(df['vital_status'])
df['paper_clinical_stage'] = le.fit_transform(df['paper_clinical_stage'])
#E.g. vital_status has been encoded to be 0 for alive and 1 for dead
deadPatients = df[df['vital_status'] != 0]
alivePatients = df[df['vital_status'] == 0]
print(df['vital_status'].unique())

[0 1]


In [4]:
#preparing the data in X and y sets
X = df.drop(columns = ['initial_weight','vital_status', 'year_of_death','days_to_death'])
y = df['vital_status']
X2 = deadPatients.drop(columns = ['initial_weight','vital_status', 'year_of_death','days_to_death'])
y2 = deadPatients['days_to_death']

print(X.head(3))
print(y.head(3))

   age_at_diagnosis  age_at_index  days_to_birth  year_of_birth  \
0             25041            68         -25041           1940   
1             21901            59         -21901           1952   
2             30643            83         -30643           1926   

   demographic_id  paper_clinical_stage  ENSG00000000003.15  \
0              43                     0                4511   
1              16                     0                6035   
2              11                     2                5583   

   ENSG00000000005.6  ENSG00000000419.13  ENSG00000000457.14  ...  \
0                402                1433                 201  ...   
1                 23                1886                 775  ...   
2                  8                2542                 946  ...   

   ENSG00000288661.1  ENSG00000288662.1  ENSG00000288663.1  ENSG00000288665.1  \
0                  0                  0                  4                  0   
1                  0                  0

In [5]:
#dividing data into sets to be trained and tested at a 20% test size
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size = 0.2)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2.values, y2, test_size = 0.2)

In [6]:
#Scale the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)
X2_train = sc.fit_transform(X2_train)
X2_test = sc.fit_transform(X2_test)

In [7]:
#Preparing and training the models
#Creating a function to train all models at once

def models(X_train, y_train):
    #0 = Decision Tree Classifier
    from sklearn.tree import DecisionTreeClassifier
    dtc = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    dtc.fit(X_train,y_train)
    
    #1 = Random Forest Classifier
    from sklearn.ensemble import RandomForestClassifier
    rfc = RandomForestClassifier(n_estimators=10, criterion = 'entropy', random_state = 0)
    rfc.fit(X_train,y_train)

    #2 = SVC (linear k)
    from sklearn.svm import SVC
    svc_ln = SVC(kernel='linear', random_state = 0)
    svc_ln.fit(X_train,y_train)
    
    #3 = SVC (RBF)
    from sklearn.svm import SVC
    svc_rbf = SVC(kernel='rbf', random_state = 0)
    svc_rbf.fit(X_train,y_train)

    #4 = Logistic Regression
    from sklearn.linear_model import LogisticRegression
    lreg = LogisticRegression(random_state = 0)
    lreg.fit(X_train,y_train)

    #5 = Gaussian NB
    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    gnb.fit(X_train,y_train)

    #6 = K-Neighbours
    from sklearn.neighbors import KNeighborsClassifier
    knc = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2)
    knc.fit(X_train,y_train)
    
    #print the accuracy of each model
    print('0 - Decision Tree Classifier', dtc.score(X_train, y_train))
    print('1 - Random Forest Classifier', rfc.score(X_train, y_train))
    print('2 - SVC (linear k)', svc_ln.score(X_train, y_train))
    print('3 - SVC (RBF)', svc_rbf.score(X_train, y_train))
    print('4 - Logistic Regression', lreg.score(X_train, y_train))
    print('5 - Gaussian NB', gnb.score(X_train, y_train))
    print('6 - K-Neighbours', knc.score(X_train, y_train))
    
    return dtc, rfc, svc_ln, svc_rbf, lreg, gnb, knc

model = models(X_train, y_train)


0 - Decision Tree Classifier 1.0
1 - Random Forest Classifier 1.0
2 - SVC (linear k) 1.0
3 - SVC (RBF) 0.9772727272727273
4 - Logistic Regression 1.0
5 - Gaussian NB 1.0
6 - K-Neighbours 0.75


In [8]:
from sklearn.metrics import confusion_matrix

for i in range(len(model)):
    cm = confusion_matrix(y_test, model[i].predict(X_test))
    
    TN, FP, FN, TP = confusion_matrix(y_test, model[i].predict(X_test)).ravel()
    
    score = (TP+TN)/(TP+TN+FN+FP)
    print(cm)
    print('Model[{}] Testing Accuracy = "{}"'.format(i, score),"\n")

[[4 2]
 [3 2]]
Model[0] Testing Accuracy = "0.5454545454545454" 

[[3 3]
 [2 3]]
Model[1] Testing Accuracy = "0.5454545454545454" 

[[1 5]
 [1 4]]
Model[2] Testing Accuracy = "0.45454545454545453" 

[[0 6]
 [0 5]]
Model[3] Testing Accuracy = "0.45454545454545453" 

[[1 5]
 [1 4]]
Model[4] Testing Accuracy = "0.45454545454545453" 

[[0 6]
 [0 5]]
Model[5] Testing Accuracy = "0.45454545454545453" 

[[1 5]
 [0 5]]
Model[6] Testing Accuracy = "0.5454545454545454" 



In [9]:
#0 = Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
daysToDeathModel = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
daysToDeathModel.fit(X2_train,y2_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [10]:
daysPrediction = daysToDeathModel.predict(X_test)
daysPrediction

array([3115.,  481.,  148.,  541., 2680., 3115.,  911.,  522.,  541.,
        418., 2043.])

In [11]:
for i in range(len(X_test)):
    if model[1].predict([X_test[i]]) == 0:
        print("Patient predicted to be Alive")
        if(y_test.iloc[i]==0):
            print("\t correct prediction")
        else:
            print("\t incorrect prediction - Patient is dead.")
        
    else:
        print("Patient predicted to be Dead")
        if(y_test.iloc[i]==1):
            print("\t correct prediction, prediction of days left for this patient: {}".format(int(daysPrediction[i])))
        else:
            print("\t incorrect prediction - Patient is alive")
print(y_test)

Patient predicted to be Dead
	 correct prediction, prediction of days left for this patient: 3115
Patient predicted to be Dead
	 correct prediction, prediction of days left for this patient: 481
Patient predicted to be Alive
	 incorrect prediction - Patient is dead.
Patient predicted to be Dead
	 incorrect prediction - Patient is alive
Patient predicted to be Alive
	 correct prediction
Patient predicted to be Alive
	 correct prediction
Patient predicted to be Dead
	 correct prediction, prediction of days left for this patient: 911
Patient predicted to be Dead
	 incorrect prediction - Patient is alive
Patient predicted to be Alive
	 incorrect prediction - Patient is dead.
Patient predicted to be Dead
	 incorrect prediction - Patient is alive
Patient predicted to be Alive
	 correct prediction
28    1
8     1
53    1
5     0
24    0
56    0
34    1
45    0
19    1
39    0
50    0
Name: vital_status, dtype: int32
