In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.tree import plot_tree

In [None]:
# Load the raw workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the workbook before running on another machine.
DATA_FILE = "C:/Users/HP/Desktop/Data Science Projects-Anusha/Pancreatic Cancer Data Set/Pancreatic Cancer detection .xlsx"
data = pd.read_excel(DATA_FILE)
data

In [None]:
# Drop the columns that are not used anywhere in the rest of the analysis.
unused_columns = [
    'sample_id',
    'patient_cohort',
    'sample_origin',
    'stage',
    'benign_sample_diagnosis',
    'plasma_CA19_9',
    'REG1A',
]
data = data.drop(columns=unused_columns)
data

In [None]:
# One-hot encode the categorical 'sex' column: the indicator columns are appended
# on the right of the frame and the original 'sex' column is then removed.
sex_dummy = pd.get_dummies(data['sex'])
data = pd.concat([data, sex_dummy], axis=1).drop(columns=['sex'])
data

In [None]:
# Reorder the columns so that the target label ('diagnosis') is the last one.
# reindex() does not raise on a name that is absent from the frame; it creates
# that column filled with NaN, which the missing-value check below would reveal.
newcols = [
    'age', 'creatinine', 'LYVE1', 'REG1B', 'TFF1',  # continuous measurements
    'F', 'M',                                        # sex dummies (assumes 'sex' holds F/M - confirm)
    'diagnosis',                                     # target label
]
data = data.reindex(newcols, axis='columns')
data

In [None]:
# Missing-value check: build the element-wise NaN mask, then reduce it to a
# single "is anything missing?" answer.
nan_mask = np.isnan(data)
np.any(nan_mask)

In [None]:
# Min-max scale the continuous columns only; the last three columns of `data`
# (F, M, diagnosis) are left out of the scaler.
# NOTE(review): the scaler is fit on every row before any train/test split is
# made, so held-out rows influence the min/max used for scaling.
normxcols = ['age', 'creatinine', 'LYVE1', 'REG1B', 'TFF1']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data.iloc[:, :-3])
# Scaled columns get capitalised names so they cannot be confused with the raw ones.
normdata = pd.DataFrame(scaled_values, columns=normxcols, index=data.index).rename(
    columns={
        'age': 'Age',
        'creatinine': 'Creatinine',
        'LYVE1': 'Lyve1',
        'REG1B': 'Reg1b',
        'TFF1': 'Tff1',
    }
)
normdata.round(3)

In [None]:
# Put the scaled features side by side with the original frame (aligned on the
# index), then drop the raw, unscaled versions of the continuous columns.
normdata1 = pd.concat((normdata, data), join="inner", axis="columns")
normdata2 = normdata1.drop(normxcols, axis="columns")
# Re-check that combining the frames introduced no NaN values.
np.any(np.isnan(normdata2))

In [None]:
#Data Visualization
# Select matplotlib's "notebook" backend for the figures drawn in the cells below.
# NOTE(review): backend availability depends on the Jupyter front end in use - confirm.
%matplotlib notebook

In [None]:
# Show the column labels of normdata2 (used to pick the features plotted below).
normdata2.columns

In [None]:
# Grid of count plots: one panel per column of normdata2, bars split by diagnosis.
features = ['Age', 'Creatinine', 'Lyve1', 'Reg1b', 'Tff1', 'F', 'M', 'diagnosis']
fig, axes = plt.subplots(4, 2, figsize=(10, 10))
for ax, feature in zip(axes.ravel(), features):
    # The column must be passed as the keyword `x`: recent seaborn releases treat
    # the first positional argument as `data`, which clashes with data=normdata2.
    sns.countplot(x=feature, hue='diagnosis', data=normdata2, ax=ax)
# Keep the panel titles/tick labels of neighbouring subplots from overlapping.
fig.tight_layout()
    

In [None]:
#Model Comparison using K-Fold Cross validation
# Compares logistic regression, a decision tree and a linear SVM by their
# per-fold and average accuracy.

# Derive features/target here so the cell runs on a fresh kernel; previously X and
# y were only defined in a later cell.
X = normdata2.iloc[:, :-1]   # every column except the last
y = normdata2.iloc[:, -1]    # last column (diagnosis)

k = 10
# NOTE(review): folds are contiguous row blocks (no shuffling). If the rows are
# ordered by diagnosis, consider KFold(n_splits=k, shuffle=True, random_state=<seed>).
kf = KFold(n_splits=k, random_state=None)
model = LogisticRegression(solver='liblinear', C=20)
model2 = DecisionTreeClassifier()
model3 = svm.SVC(kernel='linear')

acc_score1 = []   # logistic regression
acc_score2 = []   # decision tree
acc_score3 = []   # linear SVM
for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, so both X and y must be sliced with .iloc.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    model2.fit(X_train, y_train)
    model3.fit(X_train, y_train)

    # One prediction vector per model (the tree's predictions used to be
    # overwritten by the SVM's, so the tree was never actually scored).
    pred_values = model.predict(X_test)
    pred_values2 = model2.predict(X_test)
    pred_values3 = model3.predict(X_test)

    # accuracy_score takes (y_true, y_pred).
    acc = accuracy_score(y_test, pred_values)
    acc_score1.append(acc)
    acc2 = accuracy_score(y_test, pred_values2)
    acc_score2.append(acc2)
    acc3 = accuracy_score(y_test, pred_values3)
    acc_score3.append(acc3)

avg_acc_score1 = sum(acc_score1) / k
avg_acc_score2 = sum(acc_score2) / k
avg_acc_score3 = sum(acc_score3) / k

# Output order: logistic regression, decision tree, linear SVM.
print('accuracy of each fold - {}'.format(acc_score1))
print('Avg accuracy : {}'.format(avg_acc_score1))
print('accuracy of each fold - {}'.format(acc_score2))
print('Avg accuracy : {}'.format(avg_acc_score2))
print('accuracy of each fold - {}'.format(acc_score3))
print('Avg accuracy : {}'.format(avg_acc_score3))

In [None]:
# Separate the frame into the feature matrix and the target vector; 'diagnosis'
# is the last column of normdata2, everything before it is a feature.
X = normdata2.drop(columns='diagnosis')
y = normdata2['diagnosis']

In [None]:
# Hold out 40% of the rows for testing. The seed is fixed so the split - and every
# metric computed from it - is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
# Show the sizes of the four pieces instead of dumping their full contents.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Train a logistic-regression classifier (C=20) on the training split.
# NOTE(review): the cross-validated model above uses solver='liblinear'; this one
# uses the library's default solver - confirm which is intended.
reg = LogisticRegression(C=20).fit(X_train, y_train)
reg

In [None]:
#Accuracy= 80% (figure recorded by the author; recompute with reg.score(X_test, y_test))
# Confusion matrix of the fitted model on the held-out split. The tick labels are
# left to the default (the class values found in the data): the previous
# hard-coded pair "Does not have HD"/"Has HD" did not describe this dataset's
# diagnosis classes and assumed exactly two of them.
plot_confusion_matrix(reg, X_test, y_test)