# Installing and Importing Libraries

In [61]:
import pandas as pd
import numpy as np

In [62]:
from sklearn.model_selection import train_test_split

# Random Forest Classifier Model
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import accuracy_score, classification_report

In [63]:
# Visualization
import matplotlib.pyplot as plt

# Elbow Method
from kneed import KneeLocator

# Clustering Model
from sklearn.cluster import KMeans

# Metrics
from sklearn.metrics import silhouette_score

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Loading and Viewing Dataset

In [64]:
# Reading the patient records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="Datasets/lung_cancer_patient.csv")

In [65]:
# Previewing the first 10 rows of the raw dataset
df.head(n=10)

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High
5,5,P102,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
6,6,P103,52,2,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
7,7,P104,28,2,3,1,4,3,2,3,...,3,2,2,4,2,2,3,4,3,Low
8,8,P105,35,2,4,5,6,5,6,5,...,1,4,3,2,4,6,2,4,1,Medium
9,9,P106,46,1,2,3,4,2,4,3,...,1,2,4,6,5,4,2,1,5,Medium


In [66]:
# Column labels of the raw dataset
df.columns

Index(['index', 'Patient Id', 'Age', 'Gender', 'Air Pollution', 'Alcohol use',
       'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk',
       'chronic Lung Disease', 'Balanced Diet', 'Obesity', 'Smoking',
       'Passive Smoker', 'Chest Pain', 'Coughing of Blood', 'Fatigue',
       'Weight Loss', 'Shortness of Breath', 'Wheezing',
       'Swallowing Difficulty', 'Clubbing of Finger Nails', 'Frequent Cold',
       'Dry Cough', 'Snoring', 'Level'],
      dtype='object')

In [67]:
# Checking the datatype of each column
# (the output shows `Patient Id` and `Level` as object, every other column as int64)
df.dtypes

index                        int64
Patient Id                  object
Age                          int64
Gender                       int64
Air Pollution                int64
Alcohol use                  int64
Dust Allergy                 int64
OccuPational Hazards         int64
Genetic Risk                 int64
chronic Lung Disease         int64
Balanced Diet                int64
Obesity                      int64
Smoking                      int64
Passive Smoker               int64
Chest Pain                   int64
Coughing of Blood            int64
Fatigue                      int64
Weight Loss                  int64
Shortness of Breath          int64
Wheezing                     int64
Swallowing Difficulty        int64
Clubbing of Finger Nails     int64
Frequent Cold                int64
Dry Cough                    int64
Snoring                      int64
Level                       object
dtype: object

In [68]:
# Unique values found in each column, collected into one Series keyed by column name
pd.Series({col: df[col].unique() for col in df.columns})

index                       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
Patient Id                  [P1, P10, P100, P1000, P101, P102, P103, P104,...
Age                         [33, 17, 35, 37, 46, 52, 28, 44, 64, 39, 34, 2...
Gender                                                                 [1, 2]
Air Pollution                                        [2, 3, 4, 7, 6, 5, 1, 8]
Alcohol use                                          [4, 1, 5, 7, 8, 3, 6, 2]
Dust Allergy                                         [5, 6, 7, 4, 2, 8, 1, 3]
OccuPational Hazards                                 [4, 3, 5, 7, 2, 6, 8, 1]
Genetic Risk                                            [3, 4, 5, 6, 7, 2, 1]
chronic Lung Disease                                    [2, 4, 7, 6, 3, 5, 1]
Balanced Diet                                           [2, 6, 7, 4, 5, 3, 1]
Obesity                                                 [4, 2, 7, 3, 5, 6, 1]
Smoking                                              [3, 2, 7, 8

In [69]:
# Number of rows and columns, as a (rows, columns) tuple
df.shape

(1000, 26)

In [70]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,index,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Coughing of Blood,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,37.174,1.402,3.84,4.563,5.165,4.84,4.58,4.38,4.491,...,4.859,3.856,3.855,4.24,3.777,3.746,3.923,3.536,3.853,2.926
std,288.819436,12.005493,0.490547,2.0304,2.620477,1.980833,2.107805,2.126999,1.848518,2.135528,...,2.427965,2.244616,2.206546,2.285087,2.041921,2.270383,2.388048,1.832502,2.039007,1.474686
min,0.0,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,249.75,27.75,1.0,2.0,2.0,4.0,3.0,2.0,3.0,2.0,...,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
50%,499.5,36.0,1.0,3.0,5.0,6.0,5.0,5.0,4.0,4.0,...,4.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0
75%,749.25,45.0,2.0,6.0,7.0,7.0,7.0,7.0,6.0,7.0,...,7.0,5.0,6.0,6.0,5.0,5.0,5.0,5.0,6.0,4.0
max,999.0,73.0,2.0,8.0,8.0,8.0,8.0,7.0,7.0,7.0,...,9.0,9.0,8.0,9.0,8.0,8.0,9.0,7.0,7.0,7.0


## Notes
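- All columns are already numerically encoded except `Patient Id` and `Level` (both `object`)
- The risk-factor and symptom scores shown above share a similar small integer range (1 to 9); `Age` (14 to 73) is on a larger scale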

# Data Preprocessing

In [71]:
# Selecting every row that contains at least one missing value
# (the result is empty, i.e. no null values were found)
df.loc[df.isnull().any(axis="columns")]

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level


In [72]:
# Removing Unnecessary Columns
# `index` and `Patient Id` are dropped before the data is used as model input.
# Rebinding `df` (instead of `inplace = True`) and ignoring columns that are
# already gone keeps this cell safe to re-run: the in-place version raised a
# KeyError the second time it was executed.
df = df.drop(columns=["index", "Patient Id"], errors="ignore")

In [73]:
# Previewing the dataframe after the identifier columns were removed
df.head(n=10)

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High
5,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
6,52,2,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
7,28,2,3,1,4,3,2,3,4,3,...,3,2,2,4,2,2,3,4,3,Low
8,35,2,4,5,6,5,6,5,5,5,...,1,4,3,2,4,6,2,4,1,Medium
9,46,1,2,3,4,2,4,3,3,3,...,1,2,4,6,5,4,2,1,5,Medium


In [74]:
# Encoding Categorical Values Into Numeric Values
# An explicit lookup replaces the previous catch-all `else 1`, which silently
# encoded any unexpected or missing value as 1 ("Low").
level_codes = {"Low": 1, "Medium": 2, "High": 3}

# Skip the encoding when the column already holds only the numeric codes, so
# re-running this cell is a no-op (the lambda version turned every label into 1
# on a second run because the integers no longer matched "High" / "Medium").
if not df["Level"].isin(list(level_codes.values())).all():
    encoded_level = df["Level"].map(level_codes)
    if encoded_level.isna().any():
        unexpected_levels = df.loc[encoded_level.isna(), "Level"].unique().tolist()
        raise ValueError(f"Unexpected values in 'Level' column: {unexpected_levels}")
    df["Level"] = encoded_level.astype("int64")

In [75]:
# Previewing the dataframe with `Level` encoded as integers
df.head(n=10)

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,1
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,2
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,3
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,3
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,3
5,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,3
6,52,2,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,1
7,28,2,3,1,4,3,2,3,4,3,...,3,2,2,4,2,2,3,4,3,1
8,35,2,4,5,6,5,6,5,5,5,...,1,4,3,2,4,6,2,4,1,2
9,46,1,2,3,4,2,4,3,3,3,...,1,2,4,6,5,4,2,1,5,2


In [76]:
# Separating the feature columns (x) from the target label (y)
x = df.drop(columns="Level")
y = df["Level"]

In [77]:
# Previewing the feature dataframe (every column except `Level`)
x.head(n=10)

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Coughing of Blood,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring
0,33,1,2,4,5,4,3,2,2,4,...,4,3,4,2,2,3,1,2,3,4
1,17,1,3,1,5,3,4,2,2,2,...,3,1,3,7,8,6,2,1,7,2
2,35,1,4,5,6,5,5,4,6,7,...,8,8,7,9,2,1,4,6,7,2
3,37,1,7,7,7,7,6,7,7,7,...,8,4,2,3,1,4,5,6,7,5
4,46,1,6,8,7,7,7,6,7,7,...,9,3,2,4,1,4,2,4,2,3
5,35,1,4,5,6,5,5,4,6,7,...,8,8,7,9,2,1,4,6,7,2
6,52,2,2,4,5,4,3,2,2,4,...,4,3,4,2,2,3,1,2,3,4
7,28,2,3,1,4,3,2,3,4,3,...,1,3,2,2,4,2,2,3,4,3
8,35,2,4,5,6,5,6,5,5,5,...,5,1,4,3,2,4,6,2,4,1
9,46,1,2,3,4,2,4,3,3,3,...,4,1,2,4,6,5,4,2,1,5


In [78]:
# Previewing the target labels (the encoded `Level` column)
y.head(n=10)

0    1
1    2
2    3
3    3
4    3
5    3
6    1
7    1
8    2
9    2
Name: Level, dtype: int64

# Random Forest Classifier

In [79]:
# Splitting into training (80%) and testing (20%) sets with a fixed seed
# NOTE(review): rows 2 and 5 of the dataframe preview look identical in the
# displayed columns. If the file contains repeated records, the same record can
# land on both sides of this split and inflate the test scores.
# TODO confirm with `x.duplicated().sum()`.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [80]:
# Building a random forest of 100 trees with a fixed seed for reproducibility
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [81]:
# Fitting the classifier on the training split
rf_classifier.fit(X=x_train, y=y_train)

In [82]:
# Predicting the `Level` code of every row in the test split
y_pred = rf_classifier.predict(X=x_test)
y_pred

array([2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 3, 1, 3, 3, 3, 3, 2,
       1, 2, 3, 1, 2, 3, 1, 1, 1, 3, 2, 1, 1, 3, 3, 2, 3, 1, 3, 3, 1, 2,
       2, 1, 1, 2, 3, 1, 2, 3, 1, 3, 3, 3, 1, 3, 3, 1, 2, 1, 3, 3, 1, 2,
       3, 3, 2, 3, 3, 1, 2, 2, 2, 2, 3, 1, 1, 2, 2, 2, 1, 2, 3, 3, 3, 1,
       3, 3, 1, 1, 2, 3, 3, 3, 3, 3, 2, 1, 1, 3, 1, 1, 2, 3, 3, 2, 2, 2,
       2, 3, 3, 1, 2, 3, 2, 3, 2, 2, 3, 3, 1, 1, 2, 3, 3, 3, 3, 3, 1, 2,
       1, 3, 2, 1, 2, 3, 2, 2, 1, 2, 1, 1, 3, 2, 1, 3, 1, 3, 3, 1, 1, 3,
       1, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 2, 3, 3, 1, 3,
       2, 1, 2, 3, 1, 3, 1, 2, 3, 2, 3, 2, 1, 2, 1, 1, 2, 3, 3, 2, 3, 3,
       1, 2], dtype=int64)

In [83]:
# Scoring the predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)

In [84]:
# Fraction of test rows whose predicted label matches the true label
print(accuracy)

1.0


In [85]:
# Per-class precision, recall, F1-score and support on the test split
print(classification_rep)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        55
           2       1.00      1.00      1.00        63
           3       1.00      1.00      1.00        82

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



# Clustering Model

In [86]:
# Standardizing the features with StandardScaler (zero mean, unit variance per column)
scaler = StandardScaler()
x_scaled = scaler.fit(x).transform(x)

In [88]:
# Viewing the scaled feature values (printed as a 2-D array, one row per record)
print(x_scaled)

[[-0.34784816 -0.81990292 -0.90667901 ... -0.83861787 -0.41855027
   0.72865507]
 [-1.68123833 -0.81990292 -0.41391868 ... -1.38459305  1.54417079
  -0.6282445 ]
 [-0.18117439 -0.81990292  0.07884165 ...  1.34528283  1.54417079
  -0.6282445 ]
 ...
 [-1.01454325  1.21965659  0.07884165 ...  1.34528283  1.54417079
  -0.6282445 ]
 [-1.59790145  1.21965659  1.06436231 ...  0.25333248 -0.90923053
   0.05020528]
 [ 0.81886824 -0.81990292  1.06436231 ...  1.34528283  1.54417079
  -0.6282445 ]]


In [89]:
# Configuring the K-Means clustering model
# n_clusters=3 is presumably chosen to mirror the three `Level` classes (TODO confirm)
kmeans = KMeans(
    n_clusters=3,
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,  # fixed seed so the clustering is reproducible
)

In [90]:
# Fitting K-Means on the scaled features
kmeans.fit(X=x_scaled)

In [91]:
# Assigning every scaled row to a cluster using the fitted K-Means model
y_kmeans = kmeans.predict(X=x_scaled)

In [92]:
# Cluster index (0, 1 or 2) assigned to each row by K-Means
print(y_kmeans)

[1 1 0 2 2 0 1 1 2 1 2 2 2 2 1 2 1 2 2 2 2 0 2 2 2 0 1 1 2 0 2 0 2 1 1 2 1
 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 2 2 0 1 0 1 1
 0 1 1 2 0 2 1 0 0 1 1 1 0 2 2 2 0 2 0 2 2 2 2 2 2 0 2 1 0 2 2 2 2 2 2 0 2
 0 1 1 2 2 0 1 1 2 1 2 2 2 2 1 2 1 2 2 2 2 0 2 2 0 0 1 1 2 0 2 0 2 1 1 2 1
 2 2 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 0 1 1 2 1 0 1 0 1 1
 0 1 1 2 0 1 1 0 0 1 1 1 0 2 2 2 1 2 0 2 2 2 2 2 2 0 2 2 0 2 2 2 2 2 2 0 2
 0 2 2 2 2 0 1 1 2 1 2 2 2 1 1 2 1 2 2 2 2 0 2 2 1 0 1 1 2 0 2 0 2 1 1 1 1
 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 2 1 0 1 0 1 1
 0 1 1 2 0 1 1 0 0 1 1 1 0 2 2 2 1 2 0 2 2 2 2 2 2 0 2 1 0 2 2 2 2 2 2 0 2
 0 1 1 2 2 0 1 1 2 1 2 2 2 1 1 2 1 2 2 2 2 0 2 2 1 0 1 1 2 0 2 0 2 1 1 1 1
 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 2 1 0 1 0 1 1
 0 1 1 2 0 1 1 0 0 1 1 1 0 2 2 2 1 2 0 2 2 2 2 2 2 0 2 1 0 2 2 2 2 2 2 0 2
 0 2 1 2 2 0 1 1 2 1 2 2 2 1 1 2 1 2 2 2 2 0 2 2 1 0 1 1 2 0 2 0 2 1 1 1 1
 2 2 1 1 1 1 1 1 1 0 1 1 

In [95]:
# Results With Original Dataset Shown
# `assign` returns a new DataFrame, so the cluster labels are attached to a copy
# and the feature matrix `x` stays unchanged. The previous `df_clustering_results = x`
# only created a second name for `x`, so adding the column also added it to the
# features; re-running the scaling or model cells would then have treated the
# cluster labels as an input feature.
df_clustering_results = x.assign(**{"Clustering Labels": y_kmeans})
df_clustering_results

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Clustering Labels
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,1
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,1
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,0
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,2
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,44,1,6,7,7,7,7,6,7,7,...,5,3,2,7,8,2,4,5,3,2
996,37,2,6,8,7,7,7,6,7,7,...,9,6,5,7,2,4,3,1,4,2
997,25,2,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,0
998,18,2,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,2


In [99]:
# Counting how many rows were assigned to each cluster label
label_counts = df_clustering_results.loc[:, "Clustering Labels"].value_counts()
label_counts

Clustering Labels
1    445
2    375
0    180
Name: count, dtype: int64