In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits import mplot3d
from seaborn import set_style
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# Use seaborn's "whitegrid" style for the figures drawn in this notebook.
set_style(style="whitegrid")


In [None]:
# Read the trials table; the relative path is resolved against the kernel's working directory.
clTrials = pd.read_csv(filepath_or_buffer="Cleaned_and_Modified_Data.csv")

In [None]:
# Preview the first rows only instead of rendering the whole table.
clTrials.head()

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Completed_Status keeps the
# class proportions alike in both parts; the fixed seed makes the split repeatable.
clTrials_train, clTrials_test = train_test_split(
    clTrials.copy(),
    test_size=0.2,
    stratify=clTrials["Completed_Status"],
    random_state=314,
    shuffle=True,
)

In [None]:
# Partition the full table by whether Status is exactly "Completed".
clTrials_completed = clTrials.loc[clTrials["Status"].eq("Completed")]
clTrials_not_completed = clTrials.loc[clTrials["Status"].ne("Completed")]

In [None]:
# Restrict both splits to rows whose Enrollment is at most 1500.
clTrials_train_modified = clTrials_train.loc[clTrials_train["Enrollment"].le(1500)]
clTrials_test_modified = clTrials_test.loc[clTrials_test["Enrollment"].le(1500)]

## Use k-nearest neighbor model.

In [None]:
# True Completed_Status labels of the enrollment-filtered test rows, as a plain list.
TEST = list(clTrials_test_modified.Completed_Status)

# Fit k-nearest neighbours on three feature columns and predict the test rows.
def knn_for_three_features_pred(k, p, q, r):
    """Return KNN predictions of Completed_Status for clTrials_test_modified.

    k is the number of neighbours; p, q and r are the names of the three
    feature columns. A new classifier is fitted on clTrials_train_modified
    on every call.
    """
    feature_cols = [p, q, r]
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(clTrials_train_modified[feature_cols],
              clTrials_train_modified["Completed_Status"])
    return model.predict(clTrials_test_modified[feature_cols])
def knn_for_three_features_err(k, p, q, r):
    """Return the test accuracy of the three-feature KNN model.

    Despite the "err" in the name, the returned value is the fraction of
    test rows whose prediction matches the true Completed_Status (higher is
    better), not an error rate.

    k is the number of neighbours; p, q and r are the feature column names.
    """
    # Fit and predict once; refitting inside a per-row loop is needlessly slow.
    preds = knn_for_three_features_pred(k, p, q, r)
    # Take the labels from the same frame the predictions were made on, so the
    # result does not depend on whatever the global TEST currently holds.
    truth = clTrials_test_modified["Completed_Status"].to_numpy()
    return int(np.count_nonzero(preds == truth)) / len(preds)

In [None]:
# Neighbour counts tried for KNN: the odd numbers 1, 3, ..., 35.
inp = list(range(1, 36, 2))

def graph(p, q, r):
    """Plot three-feature KNN test accuracy against the neighbour counts in inp.

    p, q and r are the feature column names passed on to
    knn_for_three_features_err, whose return value is an accuracy
    (fraction of correct predictions).
    """
    out = [knn_for_three_features_err(k, p, q, r) for k in inp]
    plt.plot(inp, out)
    # Label the figure so it can be read without the surrounding code.
    plt.xlabel("Number of neighbours (k)")
    plt.ylabel("Test accuracy")
    plt.title(f"KNN on {p}, {q}, {r}")

In [None]:
# KNN accuracy curve with "Intervention Model" as the third feature.
graph(p="Enrollment", q="Study Duration (months)", r="Intervention Model")

In [None]:
# KNN accuracy curve with "Minimum Age" as the third feature.
graph(p="Enrollment", q="Study Duration (months)", r="Minimum Age")

In [None]:
# KNN accuracy curve with "Hospital" as the third feature.
graph(p="Enrollment", q="Study Duration (months)", r="Hospital")

### So far, it seems that only two features matter: Enrollment and Study Duration. We therefore now apply KNN to just these two features.

In [None]:
# Fit k-nearest neighbours on two feature columns and predict the test rows.
def knn_for_two_features_pred(k, p, q):
    """Return KNN predictions of Completed_Status for clTrials_test_modified.

    k is the number of neighbours; p and q are the names of the two feature
    columns. A new classifier is fitted on clTrials_train_modified on every
    call.
    """
    feature_cols = [p, q]
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(clTrials_train_modified[feature_cols],
              clTrials_train_modified["Completed_Status"])
    return model.predict(clTrials_test_modified[feature_cols])
def knn_for_two_features_err(k, p, q):
    """Return the test accuracy of the two-feature KNN model.

    Despite the "err" in the name, the returned value is the fraction of
    test rows whose prediction matches the true Completed_Status (higher is
    better), not an error rate.

    k is the number of neighbours; p and q are the feature column names.
    """
    # Fit and predict once; refitting inside a per-row loop is needlessly slow.
    preds = knn_for_two_features_pred(k, p, q)
    # Take the labels from the same frame the predictions were made on, so the
    # result does not depend on whatever the global TEST currently holds.
    truth = clTrials_test_modified["Completed_Status"].to_numpy()
    return int(np.count_nonzero(preds == truth)) / len(preds)

In [None]:
# Two-feature KNN test accuracy for every neighbour count in inp.
plt.plot(inp, [knn_for_two_features_err(k, "Enrollment", "Study Duration (months)") for k in inp])
plt.xlabel("Number of neighbours (k)")
plt.ylabel("Test accuracy")
plt.title("KNN on Enrollment and Study Duration (months)")
plt.show()

### With the k-nearest-neighbours approach, k = 5 gives 80% accuracy; k = 27 gives ~83%.

In [None]:
# Confusion matrix for the two-feature KNN model on the enrollment-filtered test set.
# Rows are the true classes, columns are the predicted classes.
# k = 27 is the neighbour count quoted in the note above this cell; change it here
# to inspect a different model.
y_true = clTrials_test_modified["Completed_Status"]
y_pred = knn_for_two_features_pred(27, "Enrollment", "Study Duration (months)")
confusion_matrix(y_true, y_pred)

## Now use Decision Tree Classifier

In [None]:
# Accuracy of a depth-limited decision tree on the (unfiltered) test split.
# NOTE: this rebinds the global TEST, which was first built from
# clTrials_test_modified; the assignment is kept because other cells may read it.
TEST = list(clTrials_test['Completed_Status'])
def decision_tree(depth, features):
    """Return the test accuracy of a decision tree with the given maximum depth.

    depth is passed to DecisionTreeClassifier as max_depth; features is a
    list of column names used as predictors. The tree is fitted on
    clTrials_train and scored on clTrials_test as the fraction of rows whose
    predicted Completed_Status equals the true one.
    """
    feature_cols = list(features)
    # Select the columns in one step instead of rebuilding a Python list of
    # each column for every single cell of the matrix.
    X = clTrials_train[feature_cols].to_numpy()
    y = clTrials_train["Completed_Status"].to_numpy()
    X_test = clTrials_test[feature_cols].to_numpy()
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(X, y)
    preds = clf.predict(X_test)
    # Score against the labels of the frame that was predicted on, rather than
    # the mutable global TEST.
    truth = clTrials_test["Completed_Status"].to_numpy()
    return int(np.count_nonzero(preds == truth)) / len(preds)
    

In [None]:
# Test accuracy of a depth-5 tree on three features.
decision_tree(depth=5, features=["Enrollment", "Minimum Age", "Study Duration (months)"])

In [None]:
# To show overfitting as the depth grows, also measure accuracy on the training set.
# TRAIN is kept as a module-level name because other cells may read it.
TRAIN = list(clTrials_train['Completed_Status'])
def decision_tree_on_training_set(depth, features):
    """Return the training accuracy of a decision tree with the given maximum depth.

    depth is passed to DecisionTreeClassifier as max_depth; features is a
    list of column names used as predictors. The tree is fitted on
    clTrials_train and scored on the same rows, as the fraction whose
    predicted Completed_Status equals the true one.
    """
    # Select the columns in one step instead of rebuilding a Python list of
    # each column for every single cell of the matrix.
    X = clTrials_train[list(features)].to_numpy()
    y = clTrials_train["Completed_Status"].to_numpy()
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(X, y)
    preds = clf.predict(X)
    # y already holds the true training labels, so the global TRAIN is not needed.
    return int(np.count_nonzero(preds == y)) / len(preds)

In [None]:
# Training accuracy of a depth-5 tree on the same three features.
decision_tree_on_training_set(depth=5, features=["Enrollment", "Minimum Age", "Study Duration (months)"])

In [None]:
# Test accuracy of the two-feature decision tree for maximum depths 1 through 15.
Features = ["Enrollment", "Study Duration (months)"]
b = [decision_tree(max_depth, Features) for max_depth in range(1, 16)]
b

In [None]:
# Test accuracy against maximum tree depth; b holds one value per depth, starting at 1.
plt.plot(range(1, len(b) + 1), b)
plt.xlabel("Maximum tree depth")
plt.ylabel("Test accuracy")
plt.show()

In [None]:
# Training accuracy of the two-feature decision tree for maximum depths 1 through 15.
c = [decision_tree_on_training_set(max_depth, Features) for max_depth in range(1, 16)]
c

In [None]:
# Training accuracy against maximum tree depth; c holds one value per depth, starting at 1.
plt.plot(range(1, len(c) + 1), c)
plt.xlabel("Maximum tree depth")
plt.ylabel("Training accuracy")
plt.show()

In [None]:
# Overlay test accuracy (b) and training accuracy (c) against maximum tree depth,
# to show where the two curves diverge as the tree gets deeper.
plt.figure(figsize=(10, 8))
plt.plot(range(1, len(b) + 1), b, color='blue', label="Accuracy of Prediction")
plt.plot(range(1, len(c) + 1), c, color='green', label="Accuracy of Training set")
plt.xlabel("Maximum tree depth")
plt.ylabel("Accuracy")
plt.legend(loc="upper left")
plt.show()

In [None]:
# For maximum depths 1-10, fit a decision tree on two features and draw its
# predictions at the test points on top of the training data.
# Column 0 is "Study Duration (months)", column 1 is "Enrollment".
X = clTrials_train[["Study Duration (months)", "Enrollment"]].to_numpy()
y = clTrials_train["Completed_Status"].to_numpy()

# Predictions are made at the test rows themselves, not on a dense grid.
X_pred = clTrials_test[["Study Duration (months)", "Enrollment"]].to_numpy()


for max_depth in range(1, 11):
    # Named tree_clf so the `tree` module imported from sklearn is not overwritten.
    tree_clf = DecisionTreeClassifier(max_depth=max_depth)

    tree_clf.fit(X, y)

    preds = tree_clf.predict(X_pred)

    plt.figure(figsize=(10, 8))

    # Faint markers: test points coloured by the class the tree predicts for them.
    plt.scatter(X_pred[preds == 0, 0],
                X_pred[preds == 0, 1],
                alpha=.1,
                c='lightblue',
                s=100)
    plt.scatter(X_pred[preds == 1, 0],
                X_pred[preds == 1, 1],
                alpha=.1,
                c='orange',
                s=100)

    # Solid markers: training points coloured by their true class.
    plt.scatter(X[y == 0, 0],
                X[y == 0, 1],
                label='Training 0',
                c='darkblue',
                edgecolor='black',
                s=100)
    plt.scatter(X[y == 1, 0],
                X[y == 1, 1],
                label='Training 1',
                c='darkorange',
                marker='v',
                edgecolor='black',
                s=100)

    plt.title("Maximum Depth of " + str(max_depth), fontsize=20)
    # Name the axes after the plotted columns instead of generic x_1 / x_2.
    plt.xlabel("Study Duration (months)", fontsize=18)
    plt.ylabel("Enrollment", fontsize=18)
    plt.legend(fontsize=14)

    plt.show()


### We do not even see the staircase shape of the decision tree, because the two classes are mixed with each other. This suggests that decision trees might not be the best approach for this data set.