In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the lung-cancer survey responses into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
db = pd.read_csv(filepath_or_buffer='survey lung cancer.csv')

In [3]:
# Preview the first four raw rows.
# In the raw file the yes/no feature columns are coded 1 = NO, 2 = YES.
db.head(4)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [5]:
# Encode categorical variables.
# Every column from SMOKING through the last column (LUNG_CANCER) is replaced
# by integer codes: raw 1/2 become 0/1 and NO/YES become 0/1.
le = LabelEncoder()
n1 = db.loc[:, "SMOKING":].columns  # label-based slice, inclusive of SMOKING
for column in n1:
    # The single encoder is refit for each column, so after the loop it only
    # remembers the classes of the last column it saw.
    db[column] = le.fit_transform(db[column])

In [6]:
# Check the result of the label encoding: columns from SMOKING onwards now hold 0/1.
db.head(4)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,0,1,1,0,0,1,0,1,1,1,1,1,1,1
1,M,74,1,0,0,0,1,1,1,0,0,0,1,1,1,1
2,F,59,0,0,0,1,0,1,0,1,0,1,1,0,1,0
3,M,63,1,1,1,0,0,0,0,0,1,0,0,1,1,0


In [7]:
# Encode GENDER as F = 0 / M = 1.
#
# A plain `.map(dict)` turns every value that is not a key of the dict into
# NaN, so re-running this cell on the already-encoded column (or meeting an
# unexpected label) would silently wipe the column out. Falling back to the
# existing value keeps the cell idempotent, and the explicit check below fails
# loudly instead of letting bad values flow into the model.
gender_codes = {"F": 0, "M": 1}
db["GENDER"] = db["GENDER"].map(lambda value: gender_codes.get(value, value))

unexpected_gender = db.loc[~db["GENDER"].isin([0, 1]), "GENDER"].unique()
if len(unexpected_gender) > 0:
    raise ValueError(f"Unexpected GENDER values: {unexpected_gender.tolist()}")

db.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,0,1,1,0,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,0,0,1,1,1,0,0,0,1,1,1,1
2,0,59,0,0,0,1,0,1,0,1,0,1,1,0,1,0
3,1,63,1,1,1,0,0,0,0,0,1,0,0,1,1,0
4,0,63,0,1,0,0,0,0,0,1,0,1,1,0,0,0


In [8]:
# Sanity-check the encoded frame: GENDER is now F = 0 / M = 1 and
# LUNG_CANCER is NO = 0 / YES = 1.
print(db.head(2))

   GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0       1   69        0               1        1              0   
1       1   74        1               0        0              0   

   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \
0                0         1         0         1                  1         1   
1                1         1         1         0                  0         0   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN  LUNG_CANCER  
0                    1                      1           1            1  
1                    1                      1           1            1  


In [9]:
# Dimensions of the frame as (rows, columns).
db.shape

(309, 16)

In [10]:
# Preview the fully encoded frame (all columns numeric at this point).
db.head(4)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,0,1,1,0,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,0,0,1,1,1,0,0,0,1,1,1,1
2,0,59,0,0,0,1,0,1,0,1,0,1,1,0,1,0
3,1,63,1,1,1,0,0,0,0,0,1,0,0,1,1,0


In [11]:
# Count missing values per column; every count should be 0 before modelling.
db.isna().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

In [12]:
# Class distribution of the target. The classes are heavily imbalanced
# (far more 1 = YES than 0 = NO), which matters for the split and the model.
db['LUNG_CANCER'].value_counts()

LUNG_CANCER
1    270
0     39
Name: count, dtype: int64

In [13]:
# Separate the label from the predictors:
# y is the LUNG_CANCER column, x is every other column.
y = db["LUNG_CANCER"]
x = db.drop(columns=["LUNG_CANCER"])

In [14]:
# Show the target vector (after encoding: 1 = YES, 0 = NO).
print(y)

0      1
1      1
2      0
3      0
4      0
      ..
304    1
305    1
306    1
307    1
308    1
Name: LUNG_CANCER, Length: 309, dtype: int64


In [15]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [16]:
# Feature-matrix shapes: full data, training split, test split.
print(x.shape,x_train.shape,x_test.shape)

(309, 15) (247, 15) (62, 15)


In [17]:
# Target-vector shapes: full data, training split, test split.
print(y.shape,y_train.shape,y_test.shape)

(309,) (247,) (62,)


In [18]:
# Print the test-split features for inspection (pandas abbreviates the middle rows).
print(x_test)

     GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
68        0   76        1               1        1              1   
274       1   56        1               0        0              0   
63        0   68        0               0        1              0   
283       1   60        0               1        1              0   
286       0   63        0               0        0              0   
..      ...  ...      ...             ...      ...            ...   
58        1   47        0               1        0              1   
111       1   61        1               1        1              0   
98        1   64        0               1        1              1   
302       0   65        1               1        1              1   
81        0   54        1               1        1              1   

     CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  \
68                 0         1         1         0                  0   
274                1     

In [19]:
# Train the model: logistic regression fitted on the training split.
# class_weight='balanced' reweights samples inversely to class frequency to
# offset the label imbalance; max_iter=1000 raises the solver's iteration cap.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
)
model.fit(x_train, y_train)

In [20]:
# Evaluation on the held-out test split (not the training data): predictions
# are made from x_test and scored against y_test.
y_pred = model.predict(x_test)
# Rows of the confusion matrix are true classes, columns are predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[ 8  0]
 [ 9 45]]

Classification Report:
               precision    recall  f1-score   support

           0       0.47      1.00      0.64         8
           1       1.00      0.83      0.91        54

    accuracy                           0.85        62
   macro avg       0.74      0.92      0.77        62
weighted avg       0.93      0.85      0.87        62

Accuracy: 0.8548387096774194


In [21]:
# Accuracy on the held-out test split.
x_test_prediction = model.predict(x_test)
test_data_acc = accuracy_score(y_true=y_test, y_pred=x_test_prediction)
print(test_data_acc)

0.8548387096774194


In [22]:
# Per-class precision, recall and F1 for the test-split predictions.
print(classification_report(y_test, x_test_prediction))

              precision    recall  f1-score   support

           0       0.47      1.00      0.64         8
           1       1.00      0.83      0.91        54

    accuracy                           0.85        62
   macro avg       0.74      0.92      0.77        62
weighted avg       0.93      0.85      0.87        62



In [23]:
# Inspect the last ten rows of the encoded frame.
db.tail(10)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
299,1,65,1,1,1,1,0,1,1,0,0,0,1,1,0,1
300,0,63,1,1,1,1,1,1,1,1,0,1,1,1,1,1
301,1,64,0,1,1,1,0,0,1,0,1,0,0,1,1,1
302,0,65,1,1,1,1,0,1,0,1,0,1,1,1,0,1
303,1,51,0,1,0,0,1,1,1,1,1,1,1,0,1,1
304,0,56,0,0,0,1,1,1,0,0,1,1,1,1,0,1
305,1,70,1,0,0,0,0,1,1,1,1,1,1,0,1,1
306,1,58,1,0,0,0,0,0,1,1,1,1,0,0,1,1
307,1,67,1,0,1,0,0,1,1,0,1,1,1,0,1,1
308,1,62,0,0,0,1,0,1,1,1,1,0,0,1,0,1


In [24]:
# Inspect the first five rows of the encoded frame.
db.head(5)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,0,1,1,0,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,0,0,1,1,1,0,0,0,1,1,1,1
2,0,59,0,0,0,1,0,1,0,1,0,1,1,0,1,0
3,1,63,1,1,1,0,0,0,0,0,1,0,0,1,1,0
4,0,63,0,1,0,0,0,0,0,1,0,1,1,0,0,0


In [25]:
# Building a predictive system: classify a single hand-entered record.

# One value per feature column, in the same order as x.columns
# (GENDER, AGE, then the 0/1-encoded columns).
sample_input = [1, 69, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1]
input_df = pd.DataFrame(data=[sample_input], columns=x.columns)

# predict() returns one label per row; there is exactly one row here.
prediction = model.predict(input_df)[0]
if prediction == 1:
    verdict = "Lung Cancer: YES"
else:
    verdict = "Lung Cancer: NO"
print("Prediction Result:", verdict)

Prediction Result: Lung Cancer: YES


In [26]:
import pickle

# Persist the trained model to disk.
# The `with` block guarantees the file handle is closed (and its buffer
# flushed) even if pickling fails; the bare open() used before was never closed.
with open('model_lung_update.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)