<a href="https://colab.research.google.com/github/CODERdeeps/MLCDAC/blob/main/EM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

# Load the dataset
data = pd.read_csv('/content/Glass.csv')

# Preprocess the data
X = data.drop('Type', axis=1)
y = data['Type']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(rf_classifier, param_grid, scoring='neg_log_loss', cv=5)
grid_search.fit(X_train, y_train)

# Get the best model and its predictions
best_rf = grid_search.best_estimator_
y_pred_proba = best_rf.predict_proba(X_test)

# Calculate log loss
logloss = log_loss(y_test, y_pred_proba)
print("Log Loss:", logloss)

# Print the best parameters found during grid search
print("Best Parameters:", grid_search.best_params_)


Log Loss: 0.48105252821736166
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

# Load the dataset
data = pd.read_csv('/content/HepatitisCdata.csv')

# Binarize the "Category" variable
data['Category'] = data['Category'].apply(lambda x: 0 if x == 'Blood Donor' else 1)
data['Sex'] = data['Sex'].apply(lambda x: 0 if x == 'm' else 1)
# Separate features and target variable
X = data.drop('Category', axis=1)
y = data['Category']
z = data['Sex']

# Impute missing values (You can choose any method of your choice)
X.fillna(X.mean(), inplace=True)  # Example: Filling missing values with mean

# Encode categorical variables
cat_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=cat_cols)

# Balance the data with SMOTE (You can choose SMOTE or ADASYN)
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, z)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42)

# Randomized Search CV with Random Forest
rf_classifier = RandomForestClassifier()
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
random_search_rf = RandomizedSearchCV(rf_classifier, param_grid_rf, scoring='roc_auc', cv=5)
random_search_rf.fit(X_train, y_train)
best_rf = random_search_rf.best_estimator_

# Randomized Search CV with XGBoost
xgb_classifier = XGBClassifier()
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}
random_search_xgb = RandomizedSearchCV(xgb_classifier, param_grid_xgb, scoring='roc_auc', cv=5)
random_search_xgb.fit(X_train, y_train)
best_xgb = random_search_xgb.best_estimator_

# Randomized Search CV with SVC-RBF
svc_classifier = SVC(kernel='rbf', probability=True)
param_grid_svc = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}
random_search_svc = RandomizedSearchCV(svc_classifier, param_grid_svc, scoring='roc_auc', cv=5)
random_search_svc.fit(X_train, y_train)
best_svc = random_search_svc.best_estimator_

# Predict probabilities on the test set and calculate roc_auc
y_pred_proba_rf = best_rf.predict_proba(X_test)
roc_auc_rf = roc_auc_score(y_test, y_pred_proba_rf[:, 1])

y_pred_proba_xgb = best_xgb.predict_proba(X_test)
roc_auc_xgb = roc_auc_score(y_test, y_pred_proba_xgb[:, 1])

y_pred_proba_svc = best_svc.predict_proba(X_test)
roc_auc_svc = roc_auc_score(y_test, y_pred_proba_svc[:, 1])

print("Random Forest ROC AUC:", roc_auc_rf)
print("XGBoost ROC AUC:", roc_auc_xgb)
print("SVC-RBF ROC AUC:", roc_auc_svc)




Random Forest ROC AUC: 1.0
XGBoost ROC AUC: 1.0
SVC-RBF ROC AUC: 0.9944559944559944
