In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [17]:
df = pd.read_csv("/home/ichigo/Desktop/Medical diagnosis uisng AI/Diseases_dataset/Liver_disease_data.csv")
# Simplified DataFrame with essential features
df = df.drop(columns=["GeneticRisk", "LiverFunctionTest", "PhysicalActivity"])

In [18]:
print(df.head())

   Age  Gender        BMI  AlcoholConsumption  Smoking  Diabetes  \
0   58       0  35.857584           17.272828        0         0   
1   71       1  30.732470            2.201266        0         1   
2   48       0  19.971407           18.500944        0         0   
3   34       1  16.615417           12.632870        0         0   
4   62       1  16.065830            1.087815        0         1   

   Hypertension  Diagnosis  
0             0          1  
1             0          1  
2             0          0  
3             0          1  
4             0          1  


In [4]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,thalach,exang,oldpeak,ca,thal,target
0,63,1,3,145,233,150,0,2.3,0,1,1
1,37,1,2,130,250,187,0,3.5,0,2,1
2,41,0,1,130,204,172,0,1.4,0,2,1
3,56,1,1,120,236,178,0,0.8,0,2,1
4,57,0,0,120,354,163,1,0.6,0,2,1


In [19]:
# Splitting features and target
X = df.drop(columns=['Diagnosis'])  # Adjust target column name as needed
y = df['Diagnosis']

In [20]:
# Handling class imbalance with SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [21]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [22]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [23]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Random Forest": RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', class_weight='balanced'),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1])
}


In [24]:
# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {
        "Accuracy": accuracy,
        "Classification Report": classification_report(y_test, y_pred)
    }
    print(f"\n---- {name} ----")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))


---- Logistic Regression ----
Accuracy: 0.7413333333333333
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.76      0.75       196
           1       0.73      0.73      0.73       179

    accuracy                           0.74       375
   macro avg       0.74      0.74      0.74       375
weighted avg       0.74      0.74      0.74       375


---- Random Forest ----
Accuracy: 0.7493333333333333
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.75      0.76       196
           1       0.73      0.75      0.74       179

    accuracy                           0.75       375
   macro avg       0.75      0.75      0.75       375
weighted avg       0.75      0.75      0.75       375


---- SVM ----
Accuracy: 0.7493333333333333
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.77      0.76       196
       

Parameters: { "use_label_encoder" } are not used.



In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
df = pd.read_csv("/home/ichigo/Desktop/Medical diagnosis uisng AI/Diseases_dataset/Liver_disease_data.csv")
# Simplified DataFrame with essential features
df = df.drop(columns=["GeneticRisk", "LiverFunctionTest", "PhysicalActivity"])
# Separate features (X) and target (y)
X = df.drop('Diagnosis', axis=1)
y = df['Diagnosis']

# Scale the features (important for Logistic Regression and SVM)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Dictionary to store models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Support Vector Machine': SVC(random_state=42)
}

# Function to evaluate models and return accuracy
def evaluate_models(models, X_train, X_test, y_train, y_test):
    results = {}
    
    for name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on test set
        y_pred = model.predict(X_test)
        
        # Calculate accuracy on test set
        test_accuracy = accuracy_score(y_test, y_pred)
        
        # Perform 5-fold cross-validation to get a more robust estimate
        cv_scores = cross_val_score(model, X_scaled, y, cv=5)
        cv_mean_accuracy = cv_scores.mean()
        cv_std_accuracy = cv_scores.std()
        
        # Store results
        results[name] = {
            'Test Accuracy': test_accuracy,
            'Cross-Validation Mean Accuracy': cv_mean_accuracy,
            'Cross-Validation Std': cv_std_accuracy
        }
    
    return results

# Evaluate all models
results = evaluate_models(models, X_train, X_test, y_train, y_test)

# Print results in a formatted way
print("Model Comparison Based on Accuracy:\n")
print(f"{'Model':<25} {'Test Accuracy':<15} {'CV Mean Accuracy':<20} {'CV Std':<15}")
print("-" * 75)
for name, metrics in results.items():
    print(f"{name:<25} {metrics['Test Accuracy']:<15.4f} {metrics['Cross-Validation Mean Accuracy']:<20.4f} {metrics['CV Std']:<15.4f}")

# Find the best model based on cross-validation mean accuracy
best_model_name = max(results, key=lambda x: results[x]['Cross-Validation Mean Accuracy'])
best_model_accuracy = results[best_model_name]['Cross-Validation Mean Accuracy']
print(f"\nBest Model: {best_model_name} with Cross-Validation Mean Accuracy: {best_model_accuracy:.4f}")

# Optional: Train and save the best model for future use
best_model = models[best_model_name]
best_model.fit(X_scaled, y)
print(f"Best model trained and ready to use. Use 'best_model' to make predictions on new data.")

Model Comparison Based on Accuracy:

Model                     Test Accuracy   CV Mean Accuracy     CV Std         
---------------------------------------------------------------------------


KeyError: 'CV Std'