In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the lung-cancer survey; the CSV is read from the current working
# directory.
data = pd.read_csv("survey_lung_cancer.csv")

# Quick look at the raw rows and headers before any cleaning is applied.
print(data.head())
print("Columns in dataset:", data.columns)

# Some headers in this CSV appear to carry stray padding (the head() preview
# shows e.g. "FATIGUE " and "ALLERGY "). Strip it so the exact-name lookups
# below cannot miss a column because of invisible whitespace.
data.columns = data.columns.str.strip()

# Discard every row that has at least one missing value.
data = data.dropna()

if 'GENDER' in data.columns:
    # Replace the textual gender codes with integers so that the scaler
    # further down receives purely numeric features.
    label_encoder = LabelEncoder()
    data['GENDER'] = label_encoder.fit_transform(data['GENDER'])
else:
    print("The 'GENDER' column is not found in the dataset.")

# Features are every column except the target; the target stays as labels.
X = data.drop('LUNG_CANCER', axis=1)
y = data['LUNG_CANCER']

# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions identical in both splits, so the reported accuracies do not
# hinge on how many rows of each class the seed happened to place in the
# test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows, so no information from the test set leaks into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Three candidate classifiers are trained on the same scaled training split
# and scored on the same scaled test split.

# --- Construct and train (fit() returns the fitted estimator itself) ---
# Model 1: Logistic Regression
lr_model = LogisticRegression(max_iter=200).fit(X_train, y_train)
# Model 2: Support Vector Machine (SVM) with a linear kernel
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
# Model 3: K-Nearest Neighbors (KNN) voting over 5 neighbours
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# --- Predict labels for the held-out rows ---
y_pred_lr = lr_model.predict(X_test)
y_pred_svm = svm_model.predict(X_test)
y_pred_knn = knn_model.predict(X_test)

# --- Score: fraction of held-out rows whose label was predicted correctly ---
accuracy_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)

# Print the three test-set accuracies side by side.
print(f"Model Comparison:\n Logistic Regression Accuracy: {accuracy_lr}\n Support Vector Machine Accuracy: {accuracy_svm}\n K-Nearest Neighbors Accuracy: {accuracy_knn}")

# Display name -> accuracy, in the same order the models are reported above.
model_accuracies = {
    "Logistic Regression": accuracy_lr,
    "Support Vector Machine": accuracy_svm,
    "K-Nearest Neighbors": accuracy_knn,
}

best_accuracy = max(model_accuracies.values())

# Collect every model that reaches the top score. All accuracies are computed
# over the same test rows, so tied models produce exactly equal values and a
# plain equality comparison is sufficient.
best_models = [
    name for name, accuracy in model_accuracies.items()
    if accuracy == best_accuracy
]

if len(best_models) == 1:
    print(f"The {best_models[0]} model has the highest accuracy and performs the best on this dataset.")
else:
    # A tie must be reported as a tie; crediting only the first model checked
    # would claim a winner the numbers do not support.
    print(f"The {', '.join(best_models)} models are tied for the highest accuracy ({best_accuracy}) on this dataset.")



  GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0      M   69        1               2        2              1   
1      M   74        2               1        1              1   
2      F   59        1               1        1              2   
3      M   63        2               2        2              1   
4      F   63        1               2        1              1   

   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \
0                1         2         1         2                  2         2   
1                2         2         2         1                  1         1   
2                1         2         1         2                  1         2   
3                1         1         1         1                  2         1   
4                1         1         1         2                  1         2   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN LUNG_CANCER  
0                    2                      