In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('lung_cancer_dataset.csv')
df.head()

Unnamed: 0,age,gender,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis
0,64.0,Male,Yes,Passive Smoker,29.4,199,0,0,1
1,50.0,Female,Yes,Passive Smoker,41.2,280,1,1,0
2,65.0,Female,Yes,Former Smoker,44.0,268,1,1,0
3,51.0,Female,No,Passive Smoker,43.0,241,1,1,0
4,37.0,Male,No,Passive Smoker,19.7,178,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890000 entries, 0 to 889999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   age                890000 non-null  float64
 1   gender             890000 non-null  object 
 2   family_history     890000 non-null  object 
 3   smoking_status     890000 non-null  object 
 4   bmi                890000 non-null  float64
 5   cholesterol_level  890000 non-null  int64  
 6   hypertension       890000 non-null  int64  
 7   asthma             890000 non-null  int64  
 8   cirrhosis          890000 non-null  int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 61.1+ MB


In [5]:
df.smoking_status.value_counts()

smoking_status
Passive Smoker    223170
Never Smoked      222751
Former Smoker     222181
Current Smoker    221898
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

categorical_cols = ['gender', 'family_history', 'smoking_status']
numeric_cols = ['age', 'bmi', 'cholesterol_level', 'hypertension', 'asthma']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ]
)

X = df.drop('cirrhosis', axis=1)
y = df['cirrhosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

model.fit(X_train, y_train)

print("Test Accuracy:", model.score(X_test, y_test))


Test Accuracy: 0.7730224719101123


In [7]:
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.neural_network import MLPClassifier

# # Example: Random Forest
# model_rf = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', RandomForestClassifier(random_state=42))
# ])
# # 
# # Example: SVM
# model_svm = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', SVC(kernel='rbf', probability=True, random_state=42))
# ])

# # Example: KNN
# model_knn = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', KNeighborsClassifier(n_neighbors=5))
# ])

# # Example: Naive Bayes
# model_nb = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', GaussianNB())
# ])

# # Example: Neural Network
# model_mlp = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42))
# ])


In [8]:
# for name, clf in [('Logistic Regression', model),
#                   ('Random Forest', model_rf),
#                   ('SVM', model_svm),
#                   ('KNN', model_knn),
#                   ('Naive Bayes', model_nb),
#                   ('Neural Network', model_mlp)]:
#     clf.fit(X_train, y_train)
#     score = clf.score(X_test, y_test)
#     print(f"{name} Accuracy: {score:.4f}")


In [10]:
def predict_user(model):
    user_data = {
        'age': float(input("Enter age: ")),
        'gender': input("Enter gender (Male/Female): "),
        'family_history': input("Family history (Yes/No): "),
        'smoking_status': input("Smoking status (Passive Smoker/Former Smoker/Never Smoked/Current Smoker): "),
        'bmi': float(input("Enter BMI: ")),
        'cholesterol_level': int(input("Enter cholesterol level: ")),
        'hypertension': int(input("Hypertension? (1=yes, 0=no): ")),
        'asthma': int(input("Asthma? (1=yes, 0=no): "))
    }

    user_df = pd.DataFrame([user_data])
    prediction = model.predict(user_df)
    proba = model.predict_proba(user_df)

    print("Prediction:", "Cirrhosis (1 - Cancer)" if prediction[0] == 1 else "No Cirrhosis (0 - No Cancer)")
    print(f"Probability: {proba[0][1]*100:.2f}% chance of cirrhosis")


# Example use:
predict_user(model)


Enter age:  23
Enter gender (Male/Female):  Male
Family history (Yes/No):  Yes
Smoking status (Passive Smoker/Former Smoker/Never Smoked/Current Smoker):  Passive Smoker
Enter BMI:  23.3
Enter cholesterol level:  180
Hypertension? (1=yes, 0=no):  1
Asthma? (1=yes, 0=no):  0


Prediction: No Cirrhosis (0 - No Cancer)
Probability: 23.19% chance of cirrhosis


In [11]:
import joblib

joblib.dump(model, 'lung_cancer_model.pkl')

['lung_cancer_model.pkl']

In [12]:
joblib.dump(preprocessor, 'lung_scaler.joblib')

['lung_scaler.joblib']