<a href="https://colab.research.google.com/github/0alfajar/MachineLearningProject/blob/main/Lung_Cancer_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Dependencies

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

Data Collection and Processing

In [6]:
# load the dataset
lung_cancer_data = pd.read_csv('/content/dataseter.csv')

In [7]:
# show a 5 row of data
lung_cancer_data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,M,65,Yes,Yes,Yes,No,No,Yes,No,No,No,No,No,No,Yes,NO
1,F,55,Yes,No,No,Yes,Yes,No,No,No,Yes,Yes,Yes,No,No,NO
2,F,78,No,No,Yes,Yes,Yes,No,Yes,No,Yes,Yes,No,Yes,Yes,YES
3,M,60,No,Yes,Yes,Yes,No,Yes,No,Yes,Yes,No,Yes,No,No,YES
4,F,80,Yes,Yes,No,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,No,NO


In [8]:
# number of row and columns
lung_cancer_data.shape

(3000, 16)

In [9]:
lung_cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 3000 non-null   object
 1   AGE                    3000 non-null   int64 
 2   SMOKING                3000 non-null   object
 3   YELLOW_FINGERS         3000 non-null   object
 4   ANXIETY                3000 non-null   object
 5   PEER_PRESSURE          3000 non-null   object
 6   CHRONIC_DISEASE        3000 non-null   object
 7   FATIGUE                3000 non-null   object
 8   ALLERGY                3000 non-null   object
 9   WHEEZING               3000 non-null   object
 10  ALCOHOL_CONSUMING      3000 non-null   object
 11  COUGHING               3000 non-null   object
 12  SHORTNESS_OF_BREATH    3000 non-null   object
 13  SWALLOWING_DIFFICULTY  3000 non-null   object
 14  CHEST_PAIN             3000 non-null   object
 15  LUNG_CANCER          

In [10]:
# checking missing values
lung_cancer_data.isnull().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC_DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL_CONSUMING        0
COUGHING                 0
SHORTNESS_OF_BREATH      0
SWALLOWING_DIFFICULTY    0
CHEST_PAIN               0
LUNG_CANCER              0
dtype: int64

In [11]:
# check distribution of target
lung_cancer_data['LUNG_CANCER'].value_counts()

LUNG_CANCER
YES    1518
NO     1482
Name: count, dtype: int64

Data Encoding

In [12]:
encoder = LabelEncoder()

column_to_encode = lung_cancer_data.columns.difference(['Age'])

for col in column_to_encode:
  lung_cancer_data[col] = encoder.fit_transform(lung_cancer_data[col])

In [13]:
# after encode
lung_cancer_data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,1,35,1,1,1,0,0,1,0,0,0,0,0,0,1,0
1,0,25,1,0,0,1,1,0,0,0,1,1,1,0,0,0
2,0,48,0,0,1,1,1,0,1,0,1,1,0,1,1,1
3,1,30,0,1,1,1,0,1,0,1,1,0,1,0,0,1
4,0,50,1,1,0,1,1,0,1,0,1,1,1,1,0,0


0 --> NO


1 --> YES

Data Standardization

In [14]:
min_max_scaler = MinMaxScaler()

column_to_scale = lung_cancer_data.columns.difference(['LUNG_CANCER'])

for col in column_to_scale:
  lung_cancer_data[col] = min_max_scaler.fit_transform(lung_cancer_data[[col]])

In [15]:
lung_cancer_data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,1.0,0.7,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,0.0,0.5,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0
2,0.0,0.96,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1
3,1.0,0.6,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1
4,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0


Separate Feature and Column

In [16]:
X = lung_cancer_data.drop(['LUNG_CANCER'], axis=1)
y = lung_cancer_data['LUNG_CANCER']

In [17]:
print(X)

      GENDER   AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0        1.0  0.70      1.0             1.0      1.0            0.0   
1        0.0  0.50      1.0             0.0      0.0            1.0   
2        0.0  0.96      0.0             0.0      1.0            1.0   
3        1.0  0.60      0.0             1.0      1.0            1.0   
4        0.0  1.00      1.0             1.0      0.0            1.0   
...      ...   ...      ...             ...      ...            ...   
2995     0.0  0.82      0.0             1.0      1.0            0.0   
2996     0.0  0.90      1.0             0.0      1.0            1.0   
2997     0.0  0.64      0.0             0.0      0.0            1.0   
2998     1.0  0.00      1.0             1.0      0.0            0.0   
2999     1.0  0.20      1.0             0.0      0.0            1.0   

      CHRONIC_DISEASE  FATIGUE  ALLERGY  WHEEZING  ALCOHOL_CONSUMING  \
0                 0.0      1.0      0.0       0.0                0.0   
1  

In [18]:
print(y)

0       0
1       0
2       1
3       1
4       0
       ..
2995    0
2996    0
2997    1
2998    1
2999    1
Name: LUNG_CANCER, Length: 3000, dtype: int64


SPlitting data into training and test

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
print(X.shape, X_train.shape, X_test.shape)

(3000, 15) (2400, 15) (600, 15)


Model Selection

In [21]:
models = [LogisticRegression(max_iter = 1000), SVC(), RandomForestClassifier(), KNeighborsClassifier()]

In [22]:
model_hyperparameters = {
    'logistic_regression' : {
        'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    },
    'svc' : {
        'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'kernel' : ['linear', 'rbf']
    },
    'random_forest' : {
        'n_estimators' : [10, 100, 200],
        'criterion' : ['gini', 'entropy']
    },
    'knn' : {
        'n_neighbors' : [3, 5],
        'weights' : ['uniform', 'distance']
    }
}

In [23]:
model_key = list(model_hyperparameters.keys())

In [24]:
def model_selection(models, model_hyperparameters):
  result = []
  i = 0
  for model in models:
    key = model_key[i]
    params = model_hyperparameters[key]

    i += 1

    print('Model : ', model)
    print('Hyperparameters : ', params)
    print('-' * 50)

    classifier = GridSearchCV(model, params, cv=5)
    classifier.fit(X_train, y_train)

    result.append({
        'model' : model,
        'best_params' : classifier.best_params_,
        'best_score' : classifier.best_score_
    })

  result_dataframe = pd.DataFrame(result)
  return result_dataframe

In [26]:
model_selection(models, model_hyperparameters)

Model :  LogisticRegression(max_iter=1000)
Hyperparameters :  {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
--------------------------------------------------
Model :  SVC()
Hyperparameters :  {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel': ['linear', 'rbf']}
--------------------------------------------------
Model :  RandomForestClassifier()
Hyperparameters :  {'n_estimators': [10, 100, 200], 'criterion': ['gini', 'entropy']}
--------------------------------------------------
Model :  KNeighborsClassifier()
Hyperparameters :  {'n_neighbors': [3, 5], 'weights': ['uniform', 'distance']}
--------------------------------------------------


Unnamed: 0,model,best_params,best_score
0,LogisticRegression(max_iter=1000),{'C': 0.1},0.514583
1,SVC(),"{'C': 100, 'kernel': 'rbf'}",0.526667
2,RandomForestClassifier(),"{'criterion': 'entropy', 'n_estimators': 200}",0.52125
3,KNeighborsClassifier(),"{'n_neighbors': 5, 'weights': 'uniform'}",0.518333
