In [1]:
##### Standard Libraries #####
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import log, dot, e
%matplotlib inline

##### For Preprocessing #####
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

##### For Building the Model #####
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline

##### For Validation of the Model #####
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

### Loading and Preprocessing Data

In [2]:
### Cancer dataset
# Read the CSV and discard its first column in one chained expression
# (presumably a saved row index -- TODO confirm against the raw file).
df = (
    pd.read_csv("lung_cancer_data.csv")
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)
print("Size of data:", df.shape)
df.head()

Size of data: (1000, 25)


Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,Low
1,P10,17,1,3,1,5,3,4,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,P100,35,1,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,High
3,P1000,37,1,7,7,7,7,6,7,7,...,4,2,3,1,4,5,6,7,5,High
4,P101,46,1,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,High


In [3]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
predictor_df = df.copy(deep=True)

In [4]:
# Remove the three columns that are excluded from the predictors.
# The result is rebuilt from `df` instead of mutating `predictor_df` in place:
# an in-place drop raises KeyError when this cell is run a second time because
# the columns are already gone, whereas this form can be re-run safely.
predictor_df = df.drop(columns=['Frequent Cold', 'Dry Cough', 'Snoring'])
predictor_df

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Passive Smoker,Chest Pain,Coughing of Blood,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Level
0,P1,33,1,2,4,5,4,3,2,2,...,2,2,4,3,4,2,2,3,1,Low
1,P10,17,1,3,1,5,3,4,2,2,...,4,2,3,1,3,7,8,6,2,Medium
2,P100,35,1,4,5,6,5,5,4,6,...,3,4,8,8,7,9,2,1,4,High
3,P1000,37,1,7,7,7,7,6,7,7,...,7,7,8,4,2,3,1,4,5,High
4,P101,46,1,6,8,7,7,7,6,7,...,7,7,9,3,2,4,1,4,2,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,P995,44,1,6,7,7,7,7,6,7,...,8,7,7,5,3,2,7,8,2,High
996,P996,37,2,6,8,7,7,7,6,7,...,8,7,7,9,6,5,7,2,4,High
997,P997,25,2,4,5,6,5,5,4,6,...,3,4,8,8,7,9,2,1,4,High
998,P998,18,2,6,8,7,7,7,6,7,...,7,7,9,3,2,4,1,4,2,High


In [5]:
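# Left disabled: the models below are fitted directly on the string labels in
# 'Level', so this numeric encoding (Low=1, Medium=2, High=3) is not required.
#predictor_df = predictor_df.replace({'Level' : {'Low': 1, 'Medium': 2, 'High': 3}})
#predictor_df.head(10)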

In [6]:
# Target: the 'Level' label of every row.
y = predictor_df["Level"].values
# Features: everything except the patient identifier and the target column.
x_data = predictor_df.drop(columns=["Patient Id", "Level"])
x_data

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,Smoking,Passive Smoker,Chest Pain,Coughing of Blood,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails
0,33,1,2,4,5,4,3,2,2,4,3,2,2,4,3,4,2,2,3,1
1,17,1,3,1,5,3,4,2,2,2,2,4,2,3,1,3,7,8,6,2
2,35,1,4,5,6,5,5,4,6,7,2,3,4,8,8,7,9,2,1,4
3,37,1,7,7,7,7,6,7,7,7,7,7,7,8,4,2,3,1,4,5
4,46,1,6,8,7,7,7,6,7,7,8,7,7,9,3,2,4,1,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,44,1,6,7,7,7,7,6,7,7,7,8,7,7,5,3,2,7,8,2
996,37,2,6,8,7,7,7,6,7,7,7,8,7,7,9,6,5,7,2,4
997,25,2,4,5,6,5,5,4,6,7,2,3,4,8,8,7,9,2,1,4
998,18,2,6,8,7,7,7,6,7,7,8,7,7,9,3,2,4,1,4,2


In [7]:
### Min-max normalization (rescales each feature column to the [0, 1] range;
### this is not standardization, which would use the mean and standard deviation)
# NOTE(review): the minima/maxima are computed over every row of x_data, so any
# subset of `x` that is later held out for evaluation has already contributed
# to the scaling. Fit the scaling on training rows only when a leakage-free
# estimate is needed.
x_min = x_data.min(axis=0)
x_max = x_data.max(axis=0)
# A constant column has max == min; dividing by 1 instead of 0 maps it to 0.0
# rather than filling it with NaN (which would break model fitting later).
x = (x_data - x_min) / (x_max - x_min).replace(0, 1)
x

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,Smoking,Passive Smoker,Chest Pain,Coughing of Blood,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails
0,0.322034,0.0,0.142857,0.428571,0.571429,0.428571,0.333333,0.166667,0.166667,0.500000,0.285714,0.142857,0.125,0.375,0.250,0.428571,0.125,0.142857,0.285714,0.000
1,0.050847,0.0,0.285714,0.000000,0.571429,0.285714,0.500000,0.166667,0.166667,0.166667,0.142857,0.428571,0.125,0.250,0.000,0.285714,0.750,1.000000,0.714286,0.125
2,0.355932,0.0,0.428571,0.571429,0.714286,0.571429,0.666667,0.500000,0.833333,1.000000,0.142857,0.285714,0.375,0.875,0.875,0.857143,1.000,0.142857,0.000000,0.375
3,0.389831,0.0,0.857143,0.857143,0.857143,0.857143,0.833333,1.000000,1.000000,1.000000,0.857143,0.857143,0.750,0.875,0.375,0.142857,0.250,0.000000,0.428571,0.500
4,0.542373,0.0,0.714286,1.000000,0.857143,0.857143,1.000000,0.833333,1.000000,1.000000,1.000000,0.857143,0.750,1.000,0.250,0.142857,0.375,0.000000,0.428571,0.125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.508475,0.0,0.714286,0.857143,0.857143,0.857143,1.000000,0.833333,1.000000,1.000000,0.857143,1.000000,0.750,0.750,0.500,0.285714,0.125,0.857143,1.000000,0.125
996,0.389831,1.0,0.714286,1.000000,0.857143,0.857143,1.000000,0.833333,1.000000,1.000000,0.857143,1.000000,0.750,0.750,1.000,0.714286,0.500,0.857143,0.142857,0.375
997,0.186441,1.0,0.428571,0.571429,0.714286,0.571429,0.666667,0.500000,0.833333,1.000000,0.142857,0.285714,0.375,0.875,0.875,0.857143,1.000,0.142857,0.000000,0.375
998,0.067797,1.0,0.714286,1.000000,0.857143,0.857143,1.000000,0.833333,1.000000,1.000000,1.000000,0.857143,0.750,1.000,0.250,0.142857,0.375,0.000000,0.428571,0.125


In [8]:
### Split Dataset
# Split the *unscaled* features first and derive the min-max scaling from the
# training rows only. Scaling with minima/maxima taken over all rows before the
# split lets information about the test rows leak into preprocessing.
# random_state is fixed so the split is reproducible.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_data, y, test_size=0.25, random_state=42
)

train_min = x_train_raw.min(axis=0)
# Guard constant training columns (max == min) against division by zero.
train_range = (x_train_raw.max(axis=0) - train_min).replace(0, 1)

x_train = (x_train_raw - train_min) / train_range
# Test rows reuse the training statistics, so values may fall outside [0, 1].
x_test = (x_test_raw - train_min) / train_range

print("x_train size:", x_train.shape)
print("y_train size:", y_train.shape)
print("\nx_test size:", x_test.shape)
print("y_test size:", y_test.shape)

x_train size: (750, 20)
y_train size: (750,)

x_test size: (250, 20)
y_test size: (250,)


### Setting Up Models

#### Logistic Regression

In [9]:
# Fit a logistic-regression classifier (library defaults) on the training split.
log_reg = (
    LogisticRegression()
    .fit(X=x_train, y=y_train)
)

#### Naive Bayes

In [10]:
# Fit a Gaussian naive Bayes classifier (library defaults) on the training split.
gnb = (
    GaussianNB()
    .fit(X=x_train, y=y_train)
)

### Using models on dataset

#### Logistic Regression

In [12]:
# Logistic regression: accuracy on the rows the model was trained on.
log_y_train_pred = log_reg.predict(X=x_train)
accuracy_score(y_true=y_train, y_pred=log_y_train_pred)

0.992

In [13]:
# Logistic regression: accuracy on the held-out test split.
log_y_test_pred = log_reg.predict(X=x_test)
accuracy_score(y_true=y_test, y_pred=log_y_test_pred)

0.984

#### Naive Bayes

In [14]:
# Gaussian naive Bayes: accuracy on the rows the model was trained on.
gnb_y_train_pred = gnb.predict(X=x_train)
accuracy_score(y_true=y_train, y_pred=gnb_y_train_pred)

0.8493333333333334

In [15]:
# Gaussian naive Bayes: accuracy on the held-out test split.
gnb_y_test_pred = gnb.predict(X=x_test)
accuracy_score(y_true=y_test, y_pred=gnb_y_test_pred)

0.848

### See Weights

In [18]:
### Logistic Regression
# The original `print(log_reg.)` was an incomplete attribute access (a syntax
# error). Printing the class labels is assumed to be the intent -- TODO confirm:
# row i of coef_ holds the per-feature weights for classes_[i], so the labels
# are needed to read the weight matrix below.
print(log_reg.classes_)
log_reg.coef_

array([[-0.43109729, -0.0158753 ,  1.96043546,  1.54054023, -0.19093447,
        -0.02937905,  0.3386353 ,  0.84340818,  1.40899758,  1.84860302,
         0.61294049,  2.54147184, -0.07467239,  2.01067658,  3.17950641,
         1.59390585,  1.62744026,  0.32068648,  1.93745497,  1.21069739],
       [-0.62286929,  0.37065642, -0.73817849, -1.14844705, -0.82292255,
         0.10411514, -0.9212164 , -0.43767659, -0.33025062, -2.67474808,
         0.02151208, -1.71629518,  0.45434474, -1.51250244, -3.41448383,
        -0.28803272, -1.09634539, -2.69607003, -2.55756277, -2.85894484],
       [ 1.05396658, -0.35478112, -1.22225697, -0.39209318,  1.01385703,
        -0.07473609,  0.5825811 , -0.40573159, -1.07874696,  0.82614507,
        -0.63445258, -0.82517666, -0.37967235, -0.49817414,  0.23497743,
        -1.30587314, -0.53109487,  2.37538355,  0.6201078 ,  1.64824745]])