In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import StackingClassifier
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.svm._classes")



In [65]:
# Read the training and test splits from their CSV files (paths are relative
# to the notebook's working directory)
train_path, test_path = 'Hepatitis-Train.csv', 'Hepatitis-Test.csv'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [67]:
# Split each frame into its feature matrix (every column except TARGET)
# and its label column (TARGET)
target_column = 'TARGET'
X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]
X_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]

In [69]:
# Encode categorical variables
# Every object-dtype column gets its own LabelEncoder, fitted on the training
# split only and then applied to both splits. The fitted encoders are kept in
# `label_encoders` (keyed by column name) so the integer codes can later be
# mapped back with `inverse_transform` or reused on new data; a single shared
# encoder would only retain the mapping of the last column processed.
categorical_columns = X_train.select_dtypes(include='object').columns
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder().fit(X_train[col])
    X_train[col] = encoder.transform(X_train[col])
    # NOTE: transform raises ValueError if the test split contains a category
    # that never appeared in the training split.
    X_test[col] = encoder.transform(X_test[col])
    label_encoders[col] = encoder

In [71]:
# Standardize numerical features
# The scaler is fitted on the training split only; that same fitted scaler is
# then applied to both splits.
# NOTE(review): the columns are selected after the encoding step, so any
# encoded column that ended up as int64 is scaled as well — confirm intended.
numerical_columns = X_train.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler().fit(X_train[numerical_columns])
X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [73]:
# Baseline classifiers, keyed by their class name.
# Every estimator that takes a seed is created with random_state=42.
classifiers = {
    type(estimator).__name__: estimator
    for estimator in (
        LinearSVC(max_iter=10000, random_state=42),
        DecisionTreeClassifier(random_state=42),
        RandomForestClassifier(random_state=42),
        KNeighborsClassifier(),
    )
}

In [75]:
#Train and evaluate default classifiers
def evaluate_model(model, X_test, y_test, average='weighted'):
    """Score an already-fitted classifier on a held-out set.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    X_test : array-like
        Feature matrix passed to ``model.predict``.
    y_test : array-like
        True labels compared against the predictions.
    average : str, default 'weighted'
        Averaging strategy forwarded to ``precision_score``, ``recall_score``
        and ``f1_score``. The default keeps the behaviour of existing callers.

    Returns
    -------
    dict
        Mapping with the keys "Accuracy", "Precision", "Recall" and
        "F1-Score", each holding the corresponding score.
    """
    y_pred = model.predict(X_test)
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average=average),
        "Recall": recall_score(y_test, y_pred, average=average),
        "F1-Score": f1_score(y_test, y_pred, average=average)
    }
    return metrics

# Fit each baseline classifier on the training split and collect its
# test-set metrics under the classifier's name
default_results = {
    label: evaluate_model(estimator.fit(X_train, y_train), X_test, y_test)
    for label, estimator in classifiers.items()
}

In [77]:
# Print every baseline classifier's metrics as percentages, one block per model
for clf_name, scores in default_results.items():
    print(f"Classifier: {clf_name}")
    for metric_name in ("Accuracy", "Precision", "Recall", "F1-Score"):
        print(f"{metric_name}: {scores[metric_name]:.2%}")
    print("-" * 50)

Classifier: LinearSVC
Accuracy: 72.73%
Precision: 73.05%
Recall: 72.73%
F1-Score: 72.26%
--------------------------------------------------
Classifier: DecisionTreeClassifier
Accuracy: 81.82%
Precision: 81.82%
Recall: 81.82%
F1-Score: 81.82%
--------------------------------------------------
Classifier: RandomForestClassifier
Accuracy: 72.73%
Precision: 73.94%
Recall: 72.73%
F1-Score: 72.73%
--------------------------------------------------
Classifier: KNeighborsClassifier
Accuracy: 72.73%
Precision: 73.94%
Recall: 72.73%
F1-Score: 72.73%
--------------------------------------------------


In [79]:
# Randomized hyperparameter search for the Random Forest:
# 20 sampled configurations, 3-fold cross-validation, ranked by accuracy.
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)
best_params = random_search.best_params_

In [80]:
# Build a Random Forest from the hyperparameters selected by the search, fit it
# on the training split, and score it on the test split
best_rf = RandomForestClassifier(random_state=42, **best_params).fit(X_train, y_train)
best_rf_metrics = evaluate_model(best_rf, X_test, y_test)

In [81]:
# Print the tuned Random Forest's test-set metrics as percentages
print("RandomForestClassifier (Tuned):")
for metric_name in ("Accuracy", "Precision", "Recall", "F1-Score"):
    print(f"{metric_name}: {best_rf_metrics[metric_name]:.2%}")
print("-" * 50)

RandomForestClassifier (Tuned):
Accuracy: 63.64%
Precision: 66.88%
Recall: 63.64%
F1-Score: 63.03%
--------------------------------------------------


In [82]:
# Side-by-side comparison of the default and tuned Random Forest, metric by
# metric; "Improvement" is tuned minus default, so a negative value means the
# tuned model scored lower.
print("Comparison of RandomForestClassifier (Default) vs Tuned:")
default_rf_metrics = default_results['RandomForestClassifier']
for metric in ("Accuracy", "Precision", "Recall", "F1-Score"):
    baseline, tuned = default_rf_metrics[metric], best_rf_metrics[metric]
    print(f"{metric}: Default = {baseline:.2%}, Tuned = {tuned:.2%}")
    print(f"Improvement: {tuned - baseline:.2%}")
print("-" * 50)

Comparison of RandomForestClassifier (Default) vs Tuned:
Accuracy: Default = 72.73%, Tuned = 63.64%
Improvement: -9.09%
Precision: Default = 73.94%, Tuned = 66.88%
Improvement: -7.06%
Recall: Default = 72.73%, Tuned = 63.64%
Improvement: -9.09%
F1-Score: Default = 72.73%, Tuned = 63.03%
Improvement: -9.70%
--------------------------------------------------


In [87]:
# Rank the tuned Random Forest's feature importances (labelled by training
# column name) from highest to lowest and keep the first five
feature_importances = pd.Series(best_rf.feature_importances_, index=X_train.columns)
top_features = feature_importances.sort_values(ascending=False).iloc[:5]

# Ensemble predictions (stacking): the four baseline classifiers act as base
# estimators and an MLPClassifier is the final estimator
base_learners = [
    (alias, classifiers[key])
    for alias, key in (
        ('svc', 'LinearSVC'),
        ('tree', 'DecisionTreeClassifier'),
        ('rf', 'RandomForestClassifier'),
        ('knn', 'KNeighborsClassifier'),
    )
]
meta_learner = MLPClassifier(random_state=42, max_iter=1000)
stacking_classifier = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)
stacking_classifier.fit(X_train, y_train)
stacking_metrics = evaluate_model(stacking_classifier, X_test, y_test)

In [89]:
# Assemble the summary table (one row per reported model, one column per
# metric) and export it as CSV.
# NOTE(review): KNeighborsClassifier is evaluated in default_results but is not
# listed here — confirm the omission is intentional.
reported_models = [
    ("LinearSVC", default_results['LinearSVC']),
    ("DecisionTreeClassifier", default_results['DecisionTreeClassifier']),
    ("RandomForestClassifier (Default)", default_results['RandomForestClassifier']),
    ("RandomForestClassifier (Tuned)", best_rf_metrics),
    ("StackingClassifier", stacking_metrics),
]
final_results = {"Classifier": [label for label, scores in reported_models]}
for metric_name in ("Accuracy", "Precision", "Recall", "F1-Score"):
    final_results[metric_name] = [scores[metric_name] for label, scores in reported_models]
results_df = pd.DataFrame(final_results)
results_df.to_csv('Classification_Results.csv', index=False)

In [91]:
# Save top features
# top_features is an unnamed Series, so writing it with header=True would emit
# the meaningless header ",0". Explicit labels make the CSV self-describing
# ("Feature,Importance") without renaming the in-memory Series.
top_features.to_csv('Top_Features.csv', header=['Importance'], index_label='Feature')

# Display the summary table and the top-feature ranking as the cell output
results_df, top_features

(                         Classifier  Accuracy  Precision    Recall  F1-Score
 0                         LinearSVC  0.727273   0.730519  0.727273  0.722611
 1            DecisionTreeClassifier  0.818182   0.818182  0.818182  0.818182
 2  RandomForestClassifier (Default)  0.727273   0.739394  0.727273  0.727273
 3    RandomForestClassifier (Tuned)  0.636364   0.668831  0.636364  0.630303
 4                StackingClassifier  0.909091   0.924242  0.909091  0.909091,
 Albumin      0.216566
 Bilirubin    0.165291
 PROTIME      0.133971
 Ascites      0.088482
 Varices      0.074124
 dtype: float64)