In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn import svm
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix

In [2]:
#Load data
data= pd.read_excel("C:/Users/HP/Desktop/Data Science Projects-Anusha/Pancreatic Cancer Data Set/Pancreatic Cancer detection .xlsx")
data

Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,S1,Cohort1,BPTB,33,F,0,,,11.7,1.83222,0.893219,52.948840,654.282174,1262.000
1,S10,Cohort1,BPTB,81,F,0,,,,0.97266,2.037585,94.467030,209.488250,228.407
2,S100,Cohort2,BPTB,51,M,0,,,7.0,0.78039,0.145589,102.366000,461.141000,
3,S101,Cohort2,BPTB,61,M,0,,,8.0,0.70122,0.002805,60.579000,142.950000,
4,S102,Cohort2,BPTB,62,M,0,,,9.0,0.21489,0.000860,65.540000,41.088000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,S549,Cohort2,BPTB,68,M,1,IV,,,0.52026,7.058209,156.241000,525.178000,
586,S558,Cohort2,BPTB,71,F,1,IV,,,0.85956,8.341207,16.915000,245.947000,
587,S560,Cohort2,BPTB,63,M,1,IV,,,1.36851,7.674707,289.701000,537.286000,
588,S583,Cohort2,BPTB,75,F,1,IV,,,1.33458,8.206777,205.930000,722.523000,


In [3]:
#Removing unnecessary columns
data=data.drop(['sample_id','patient_cohort','sample_origin','stage','benign_sample_diagnosis','plasma_CA19_9','REG1A'],axis=1)
data

Unnamed: 0,age,sex,diagnosis,creatinine,LYVE1,REG1B,TFF1
0,33,F,0,1.83222,0.893219,52.948840,654.282174
1,81,F,0,0.97266,2.037585,94.467030,209.488250
2,51,M,0,0.78039,0.145589,102.366000,461.141000
3,61,M,0,0.70122,0.002805,60.579000,142.950000
4,62,M,0,0.21489,0.000860,65.540000,41.088000
...,...,...,...,...,...,...,...
585,68,M,1,0.52026,7.058209,156.241000,525.178000
586,71,F,1,0.85956,8.341207,16.915000,245.947000
587,63,M,1,1.36851,7.674707,289.701000,537.286000
588,75,F,1,1.33458,8.206777,205.930000,722.523000


In [4]:
#Create dummy values for column 'sex' since it has categorical data
sex_dummy= pd.get_dummies(data['sex'])
data= pd.concat([data,sex_dummy],axis=1)
data.drop(['sex'],inplace=True,axis=1)
data

Unnamed: 0,age,diagnosis,creatinine,LYVE1,REG1B,TFF1,F,M
0,33,0,1.83222,0.893219,52.948840,654.282174,1,0
1,81,0,0.97266,2.037585,94.467030,209.488250,1,0
2,51,0,0.78039,0.145589,102.366000,461.141000,0,1
3,61,0,0.70122,0.002805,60.579000,142.950000,0,1
4,62,0,0.21489,0.000860,65.540000,41.088000,0,1
...,...,...,...,...,...,...,...,...
585,68,1,0.52026,7.058209,156.241000,525.178000,0,1
586,71,1,0.85956,8.341207,16.915000,245.947000,1,0
587,63,1,1.36851,7.674707,289.701000,537.286000,0,1
588,75,1,1.33458,8.206777,205.930000,722.523000,1,0


In [5]:
#Moving the predictor label to the last column
newcols=['age','creatinine','LYVE1','REG1B','TFF1','F','M','diagnosis']
data= data.reindex(columns=newcols)
data

Unnamed: 0,age,creatinine,LYVE1,REG1B,TFF1,F,M,diagnosis
0,33,1.83222,0.893219,52.948840,654.282174,1,0,0
1,81,0.97266,2.037585,94.467030,209.488250,1,0,0
2,51,0.78039,0.145589,102.366000,461.141000,0,1,0
3,61,0.70122,0.002805,60.579000,142.950000,0,1,0
4,62,0.21489,0.000860,65.540000,41.088000,0,1,0
...,...,...,...,...,...,...,...,...
585,68,0.52026,7.058209,156.241000,525.178000,0,1,1
586,71,0.85956,8.341207,16.915000,245.947000,1,0,1
587,63,1.36851,7.674707,289.701000,537.286000,0,1,1
588,75,1.33458,8.206777,205.930000,722.523000,1,0,1


In [6]:
#Checking for missing values
np.any(np.isnan(data))

False

In [7]:
#Normalize the data (only data with continuous values)
scaler= MinMaxScaler()
normxcols= ['age','creatinine','LYVE1','REG1B','TFF1']
normdata= pd.DataFrame(scaler.fit_transform(data.iloc[:,:-3]),columns= normxcols,index= data.index)
normdata.rename(columns={'age':'Age','creatinine':'Creatinine','LYVE1':'Lyve1','REG1B':'Reg1b','TFF1':'Tff1'},inplace=True)
normdata.round(3)

Unnamed: 0,Age,Creatinine,Lyve1,Reg1b,Tff1
0,0.111,0.437,0.037,0.038,0.049
1,0.873,0.226,0.085,0.067,0.016
2,0.397,0.178,0.006,0.073,0.035
3,0.556,0.159,0.000,0.043,0.011
4,0.571,0.039,0.000,0.047,0.003
...,...,...,...,...,...
585,0.667,0.114,0.295,0.111,0.039
586,0.714,0.198,0.349,0.012,0.018
587,0.587,0.323,0.321,0.206,0.040
588,0.778,0.315,0.344,0.147,0.054


In [8]:
normdata1= pd.concat([normdata, data], axis=1, join= "inner")
normdata2= normdata1.drop(columns= normxcols)
normdata2
np.any(np.isnan(normdata2))

False

In [9]:
#Data Visualization
%matplotlib notebook

In [10]:
normdata2.columns

Index(['Age', 'Creatinine', 'Lyve1', 'Reg1b', 'Tff1', 'F', 'M', 'diagnosis'], dtype='object')

In [None]:
plt.figure(figsize= (10,10))
features= ['Age', 'Creatinine', 'Lyve1', 'Reg1b', 'Tff1', 'F', 'M', 'diagnosis']
for i in enumerate(features):
    plt.subplot(4,2,i[0]+1)
    sns.countplot(i[1], hue= 'diagnosis', data= normdata2)
    

In [11]:
#Cross Validation to test between the methods: Logistic Regression, SVM and DecisionTreeClassifier
X = normdata2.iloc[:,:-1]
y = normdata2.iloc[:,-1]

In [12]:
k= 10
kf = KFold(n_splits=k, random_state=None)
model = LogisticRegression(solver= 'liblinear', C= 5)
model2 = DecisionTreeClassifier(criterion="entropy", max_depth = 5)
model3 = svm.SVC(kernel='linear')

acc_score1 = []
acc_score2 = []
acc_score3 = []
for train_index , test_index in kf.split(X):
    X_train , X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train , y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)
    model2.fit(X_train, y_train)
    model3.fit(X_train, y_train)
    
    pred_values = model.predict(X_test)
    pred_values2= model2.predict(X_test)
    pred_values2= model3.predict(X_test)
     
    acc = accuracy_score(pred_values , y_test)
    acc_score1.append(acc)
    acc2 = accuracy_score(pred_values2 , y_test)
    acc_score2.append(acc2)
    acc3 = accuracy_score(pred_values2 , y_test)
    acc_score3.append(acc3)
     
avg_acc_score1= sum(acc_score1)/k
avg_acc_score2= sum(acc_score2)/k
avg_acc_score3= sum(acc_score3)/k
 
print('accuracy of each fold - {}'.format(acc_score1))
print('Avg accuracy : {}'.format(avg_acc_score1))
print('accuracy of each fold - {}'.format(acc_score2))
print('Avg accuracy : {}'.format(avg_acc_score2))
print('accuracy of each fold - {}'.format(acc_score3))
print('Avg accuracy : {}'.format(avg_acc_score3))



accuracy of each fold - [0.9830508474576272, 0.9830508474576272, 0.9322033898305084, 0.9491525423728814, 0.7796610169491526, 0.7457627118644068, 0.7627118644067796, 0.4915254237288136, 0.5423728813559322, 0.559322033898305]
Avg accuracy : 0.7728813559322034
accuracy of each fold - [0.9661016949152542, 0.9830508474576272, 0.9322033898305084, 0.9491525423728814, 0.847457627118644, 0.7627118644067796, 0.7457627118644068, 0.4067796610169492, 0.4745762711864407, 0.4915254237288136]
Avg accuracy : 0.7559322033898305
accuracy of each fold - [0.9661016949152542, 0.9830508474576272, 0.9322033898305084, 0.9491525423728814, 0.847457627118644, 0.7627118644067796, 0.7457627118644068, 0.4067796610169492, 0.4745762711864407, 0.4915254237288136]
Avg accuracy : 0.7559322033898305


In [126]:
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.4,random_state=None)
X_train, X_test, y_train, y_test

(          Age  Creatinine     Lyve1     Reg1b      Tff1  F  M
 178  0.634921    0.069638  0.000051  0.006621  0.002496  1  0
 484  0.428571    0.292479  0.241622  0.534625  0.181086  1  0
 189  0.412698    0.078552  0.155350  0.153557  0.049219  0  1
 265  0.206349    0.030641  0.185862  0.021111  0.032329  1  0
 527  0.904762    0.267409  0.448343  0.160314  0.094882  0  1
 ..        ...         ...       ...       ...       ... .. ..
 338  0.539683    0.373259  0.285340  0.047777  0.101330  0  1
 399  0.476190    0.222841  0.341874  0.381282  0.082396  0  1
 102  0.396825    0.033426  0.023903  0.002207  0.002977  0  1
 400  0.650794    0.108635  0.074637  0.080025  0.003585  0  1
 522  0.698413    0.370474  0.266142  0.561356  0.240596  0  1
 
 [354 rows x 7 columns],
           Age  Creatinine     Lyve1     Reg1b      Tff1  F  M
 223  0.777778    0.100279  0.061274  0.008342  0.000242  0  1
 156  0.031746    0.066852  0.000049  0.005356  0.001439  1  0
 29   0.063492    0.278552  

In [153]:
reg= LogisticRegression(C=25)
reg.fit(X_train, y_train)

LogisticRegression(C=25)

In [154]:
y_pred= reg.predict(X_test)
accuracyscore= accuracy_score(y_test,y_pred)
accuracyscore

0.8220338983050848

In [155]:
conf_mat= confusion_matrix(y_test,y_pred)
conf_mat

array([[143,  14],
       [ 28,  51]], dtype=int64)

In [156]:
rocauc= roc_auc_score(y_test,y_pred)
rocauc

0.7781988228654357