Classify if Demented, Nondemented

In [None]:
#set up a ColumnTransformer with StandardScaler for numerical features and OneHotEncoder for categorical features.
#set up and training a LinearRegression model using scikit-learn, including data preprocessing steps within a Pipeline.
#implement polynomial regression
#perform hyperparameter tuning for a polynomial regression model
#evaluate the performance of a regression model on test data
#use OneHotEncoder with handle_unknown='ignore' within a preprocessing pipeline to handle unseen categories during model training and evaluation
#set up and execute cross_val_score or GridSearchCV to perform cross-validation

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

data = pd.read_csv('/content/dementia.csv')

data = data[data['Group'].isin(['Demented', 'Nondemented'])]
data['Group'] = data['Group'].map({'Nondemented': 0, 'Demented': 1})
data = data.drop(columns=['Subject ID', 'MRI ID'])
X = data.drop('Group', axis=1)
y = data['Group']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('classifier', LogisticRegression(max_iter=1000))
])

param_grid = {
    'poly__degree': [1, 2, 3],
    'classifier__C': [0.01, 0.1, 1, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best score (cross-val):", grid_search.best_score_)

y_pred = grid_search.predict(X_test)
print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='accuracy')
print("Cross-validated scores:", scores)
print("Mean CV accuracy:", scores.mean())


Best params: {'classifier__C': 1, 'poly__degree': 1}
Best score (cross-val): 0.9925509779442363
Accuracy on test set: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      1.00      1.00        30

    accuracy                           1.00        68
   macro avg       1.00      1.00      1.00        68
weighted avg       1.00      1.00      1.00        68

Cross-validated scores: [0.97058824 1.         1.         1.         1.        ]
Mean CV accuracy: 0.9941176470588236
