# **CLASSIFICATION USING RANDOM FOREST**

**The data used in the classification: https://www.kaggle.com/datasets/mysarahmadbhat/lung-cancer/**

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('lung-cancer-dataset.csv')
df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [3]:
df.shape

(309, 16)

In [4]:
df.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')

Renaming columns for consistency

In [5]:
df.rename(columns = {'CHRONIC DISEASE':'CHRONIC_DISEASE'}, inplace = True)
df.rename(columns = {'FATIGUE ':'FATIGUE'}, inplace = True)
df.rename(columns = {'ALLERGY ':'ALLERGY'}, inplace = True)
df.rename(columns = {'ALCOHOL CONSUMING':'ALCOHOL_CONSUMING'}, inplace = True)
df.rename(columns = {'SHORTNESS OF BREATH':'SHORTNESS_OF_BREATH'}, inplace = True)
df.rename(columns = {'SWALLOWING DIFFICULTY':'SWALLOWING_DIFFICULTY'}, inplace = True)
df.rename(columns = {'CHEST PAIN':'CHEST_PAIN'}, inplace = True)
df.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC_DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
       'ALCOHOL_CONSUMING', 'COUGHING', 'SHORTNESS_OF_BREATH',
       'SWALLOWING_DIFFICULTY', 'CHEST_PAIN', 'LUNG_CANCER'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC_DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL_CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS_OF_BREATH    309 non-null    int64 
 13  SWALLOWING_DIFFICULTY  309 non-null    int64 
 14  CHEST_PAIN             309 non-null    int64 
 15  LUNG_CANCER            

**Tidy up the data**

Replace F to 1 and M to 2

In [7]:
df['GENDER'].replace('F', 1, inplace = True)
df['GENDER'].replace('M', 2, inplace = True)

gender_na_count = df['GENDER'].isna().sum()
gender_unique = df['GENDER'].unique()
print(f'N/A values in Gender: {gender_na_count}')
print(f'Gender values: {gender_unique}')

N/A values in Gender: 0
Gender values: [2 1]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['GENDER'].replace('F', 1, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['GENDER'].replace('M', 2, inplace = True)
  df['GENDER'].replace('M', 2, inplace = True)


Replace YES to 2 and NO to 1 (based on the values in the dataset)

In [8]:
df['LUNG_CANCER'].replace('NO', 1, inplace = True)
df['LUNG_CANCER'].replace('YES', 2, inplace = True)

lungcancer_na_count = df['LUNG_CANCER'].isna().sum()
lungcancer_unique = df['LUNG_CANCER'].unique()
print(f'N/A values in Lung Cancer: {lungcancer_na_count}')
print(f'Lung Cancer values: {lungcancer_unique}')

N/A values in Lung Cancer: 0
Lung Cancer values: [2 1]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['LUNG_CANCER'].replace('NO', 1, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['LUNG_CANCER'].replace('YES', 2, inplace = True)
  df['LUNG_CANCER'].replace('YES', 2, inplace = True)


Check for missing values or other unique values in each column

In [9]:
for (columnName, columnData) in df.items():
    na_count = columnData.isna().sum()
    unique_count = columnData.unique()
    print(f'N/A values in {columnName}: {na_count}')
    print(f'{columnName} values: {unique_count}\n')

N/A values in GENDER: 0
GENDER values: [2 1]

N/A values in AGE: 0
AGE values: [69 74 59 63 75 52 51 68 53 61 72 60 58 48 57 44 64 21 65 55 62 56 67 77
 70 54 49 73 47 71 66 76 78 81 79 38 39 87 46]

N/A values in SMOKING: 0
SMOKING values: [1 2]

N/A values in YELLOW_FINGERS: 0
YELLOW_FINGERS values: [2 1]

N/A values in ANXIETY: 0
ANXIETY values: [2 1]

N/A values in PEER_PRESSURE: 0
PEER_PRESSURE values: [1 2]

N/A values in CHRONIC_DISEASE: 0
CHRONIC_DISEASE values: [1 2]

N/A values in FATIGUE: 0
FATIGUE values: [2 1]

N/A values in ALLERGY: 0
ALLERGY values: [1 2]

N/A values in WHEEZING: 0
WHEEZING values: [2 1]

N/A values in ALCOHOL_CONSUMING: 0
ALCOHOL_CONSUMING values: [2 1]

N/A values in COUGHING: 0
COUGHING values: [2 1]

N/A values in SHORTNESS_OF_BREATH: 0
SHORTNESS_OF_BREATH values: [2 1]

N/A values in SWALLOWING_DIFFICULTY: 0
SWALLOWING_DIFFICULTY values: [2 1]

N/A values in CHEST_PAIN: 0
CHEST_PAIN values: [2 1]

N/A values in LUNG_CANCER: 0
LUNG_CANCER values: [2 

All of the values above are binary (except age).

**Split data into train and test data**

In [10]:
df_input = df.drop('LUNG_CANCER', axis = 1)
df_output = df['LUNG_CANCER']

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_input, df_output, test_size = 0.2, random_state = 0)

In [12]:
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

rf_class = RandomForestClassifier()
rf_class.fit(x_train, y_train)
y_predict=rf_class.predict(x_test)

In [13]:
from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_test, y_predict, target_names = ['1', '2']))


Classification Report

              precision    recall  f1-score   support

           1       0.88      0.70      0.78        10
           2       0.94      0.98      0.96        52

    accuracy                           0.94        62
   macro avg       0.91      0.84      0.87        62
weighted avg       0.93      0.94      0.93        62



Hyperparameter Tuning (FROM TEMPLATE)

In [14]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'criterion':['gini', 'entropy', 'log_loss'],
    'max_depth':[2,4,6,8], 
}

In [15]:
RF_class2 = RandomForestClassifier()
RF_class2= GridSearchCV(RF_class2 ,
                            param_grid = parameters,   # hyperparameters
                            scoring='accuracy',        # metric for scoring
                            cv=5)  

In [16]:
RF_class2.fit(x_train,y_train)
print("Tuned Hyperparameters :", RF_class2.best_params_)
print("Accuracy :",RF_class2.best_score_)

Tuned Hyperparameters : {'criterion': 'gini', 'max_depth': 6}
Accuracy : 0.8947755102040815


In [17]:
#Train the data using based parameter
RF_class_best = RandomForestClassifier(criterion= 'entropy',max_depth=6)

In [18]:
RF_class_best.fit(x_train,y_train)

In [19]:
y_predict_best=RF_class_best.predict(x_test)

In [21]:
print('\nClassification Report\n')
print(classification_report(y_test, y_predict_best, target_names=['1','2']))


Classification Report

              precision    recall  f1-score   support

           1       0.83      0.50      0.62        10
           2       0.91      0.98      0.94        52

    accuracy                           0.90        62
   macro avg       0.87      0.74      0.78        62
weighted avg       0.90      0.90      0.89        62

