In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv("/content/drive/My Drive/Aiml project new/archive/New folder/preprocessed_balanced_ham10000_500.csv")

In [5]:
X = df[['age', 'sex', 'localization', 'dx_type']]
y = df['dx']

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Define preprocessing pipeline
numerical_features = ['age']
categorical_features = ['sex', 'localization', 'dx_type']

preprocessor_mean = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

preprocessor_median = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Function to train and evaluate a model
def train_evaluate(model, preprocessor, X_train, y_train, X_test, y_test, variety_name):
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    prec = precision_score(y_test, y_pred, average='macro')
    rec = recall_score(y_test, y_pred, average='macro')
    cm = confusion_matrix(y_test, y_pred)

    print(f"\n{variety_name} Results:")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score (macro): {f1:.4f}")
    print(f"Precision (macro): {prec:.4f}")
    print(f"Recall (macro): {rec:.4f}")
    print(f"Confusion Matrix:\n{cm}")
    return pipeline, f1

In [6]:
lr_basic = LogisticRegression(max_iter=1000, random_state=42)
_, f1_v1 = train_evaluate(lr_basic, preprocessor_mean, X_train, y_train, X_test, y_test, "Variety 1 (Basic)")


Variety 1 (Basic) Results:
Accuracy: 0.4857
F1 Score (macro): 0.4856
Precision (macro): 0.5403
Recall (macro): 0.4954
Confusion Matrix:
[[48 24  0 12 11  0  2]
 [24 44  0  8 13  0  0]
 [29 29 30 13 14  0 11]
 [ 4  5  0 73  9  0  4]
 [23 23  1 15 32  3  4]
 [ 3  4  3  6 10 73  6]
 [ 7 10  4 16  2  8 40]]


In [7]:
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__solver': ['lbfgs', 'saga']
}
lr_tuned = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
grid_search = GridSearchCV(Pipeline(steps=[('preprocessor', preprocessor_mean), ('classifier', lr_tuned)]),
                           param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)
best_model_v2 = grid_search.best_estimator_
y_pred_v2 = best_model_v2.predict(X_test)
f1_v2 = f1_score(y_test, y_pred_v2, average='macro')
print(f"\nVariety 2 (Tuned with Mean Imputation) Best Params: {grid_search.best_params_}")
print(f"F1 Score (macro): {f1_v2:.4f}")


Variety 2 (Tuned with Mean Imputation) Best Params: {'classifier__C': 1, 'classifier__solver': 'lbfgs'}
F1 Score (macro): 0.4916


In [8]:
grid_search_median = GridSearchCV(Pipeline(steps=[('preprocessor', preprocessor_median), ('classifier', lr_tuned)]),
                                  param_grid, cv=5, scoring='f1_macro')
grid_search_median.fit(X_train, y_train)
best_model_v3 = grid_search_median.best_estimator_
y_pred_v3 = best_model_v3.predict(X_test)
f1_v3 = f1_score(y_test, y_pred_v3, average='macro')
print(f"\nVariety 3 (Tuned with Median Imputation) Best Params: {grid_search_median.best_params_}")
print(f"F1 Score (macro): {f1_v3:.4f}")


Variety 3 (Tuned with Median Imputation) Best Params: {'classifier__C': 1, 'classifier__solver': 'lbfgs'}
F1 Score (macro): 0.4916


In [9]:
print("\nComparison:")
print(f"Variety 1 F1: {f1_v1:.4f}")
print(f"Variety 2 F1: {f1_v2:.4f}")
print(f"Variety 3 F1: {f1_v3:.4f}")
best_variety = max([(f1_v1, 1), (f1_v2, 2), (f1_v3, 3)], key=lambda x: x[0])[1]
print(f"Best Variety: {best_variety}")


Comparison:
Variety 1 F1: 0.4856
Variety 2 F1: 0.4916
Variety 3 F1: 0.4916
Best Variety: 2
