In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# ---------------------------------------------------------------------------
# 1. Load the survey data
# ---------------------------------------------------------------------------
# NOTE: the space inside the file name is part of the path as written here.
file_path = '/mnt/data/survey _lung_cancer.csv'
data = pd.read_csv(file_path)

# ---------------------------------------------------------------------------
# 2. Make the categorical columns numeric
# ---------------------------------------------------------------------------
# Target: 'YES' -> 1, 'NO' -> 0. Series.map turns any other value into NaN.
data['LUNG_CANCER'] = data['LUNG_CANCER'].map({'YES': 1, 'NO': 0})

# GENDER: replaced by the integer codes produced by LabelEncoder. The fitted
# encoder is kept so the codes can be mapped back to the original labels.
label_encoder = LabelEncoder()
data['GENDER'] = label_encoder.fit_transform(data['GENDER'])

# ---------------------------------------------------------------------------
# 3. Features / target
# ---------------------------------------------------------------------------
# Every column except the target is used as a predictor.
y = data['LUNG_CANCER']
X = data.drop(columns='LUNG_CANCER')

# ---------------------------------------------------------------------------
# 4. Hold-out evaluation (68% train / 32% test, fixed seed for repeatability)
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.32,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
logistic_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# ---------------------------------------------------------------------------
# 5. 30-fold cross-validation on the full dataset
# ---------------------------------------------------------------------------
# This estimator is also fitted on all rows and left available afterwards.
logistic_kfold_model = LogisticRegression(max_iter=1000).fit(X, y)
fold_scores = cross_val_score(logistic_kfold_model, X, y, cv=30)

# ---------------------------------------------------------------------------
# 6. Report
# ---------------------------------------------------------------------------
# sep='\n' reproduces the line breaks of printing each item separately.
print(
    f'Test Split Accuracy Score: {accuracy}',
    '\nTest Split Confusion Matrix:',
    conf_matrix,
    sep='\n',
)
print(
    f'\n\nk-Fold Accuracy Scores: {fold_scores}',
    f'\nk-Fold Mean Accuracy: {fold_scores.mean()}',
    sep='\n',
)


Test Split Accuracy Score: 0.9797979797979798

Test Split Confusion Matrix:
[[ 5  2]
 [ 0 92]]


k-Fold Accuracy Scores: [0.81818182 1.         0.81818182 0.90909091 0.81818182 0.90909091
 0.90909091 0.90909091 1.         0.9        0.9        1.
 1.         0.9        1.         0.9        1.         1.
 0.6        0.9        1.         1.         0.9        1.
 0.9        0.9        0.9        0.9        1.         1.        ]

k-Fold Mean Accuracy: 0.923030303030303
