In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
from google.colab import drive

# Mount Google Drive so the dataset CSVs under /content/drive are readable.
drive.mount('/content/drive')

# Read the training inputs and the training targets.
X_train = pd.read_csv('/content/drive/MyDrive/Dataset/X_Train_Data_Input.csv')
Y_train = pd.read_csv('/content/drive/MyDrive/Dataset/Y_Train_Data_Target.csv')

# Remove the ID column from both frames.
X_train = X_train.drop(columns=['ID'])
Y_train = Y_train.drop(columns=['ID'])

# Keep a random 10% of the rows (the other 90% is discarded) to shorten
# training time. From here on Y_train is the 'target' Series, not a DataFrame.
X_train, _, Y_train, _ = train_test_split(
    X_train,
    Y_train['target'],
    test_size=0.9,  # Retaining only 10%
    random_state=42,
)

# Column names grouped by dtype: int64/float64 are treated as numeric,
# object columns as categorical.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit the preprocessor on the sampled training data and transform it.
# NOTE(review): the preprocessor and the feature ranking below are fitted
# before the train/validation split, so validation rows influence both --
# confirm this is acceptable for the reported validation scores.
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Rank the transformed columns by random-forest importance and keep the 10
# highest-ranked ones (argsort is ascending, so the last 10 are the largest).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_preprocessed, Y_train)
importance_order = np.argsort(rf.feature_importances_)
top_feature_indices = importance_order[-10:]  # Select top 10 features
X_train_selected = X_train_preprocessed[:, top_feature_indices]

# Hold out 20% of the selected-feature matrix for validation.
X_train_split, X_val_split, Y_train_split, Y_val_split = train_test_split(
    X_train_selected,
    Y_train,
    test_size=0.2,
    random_state=42,
)

# Define models to evaluate
models = {
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Classifier': SVC(probability=True),
    'Decision Tree': DecisionTreeClassifier()
}

# Hyperparameter search spaces, keyed by model name. Models without an entry
# are fitted once with the settings given in `models`.
search_spaces = {
    'Random Forest': {
        'n_estimators': randint(100, 300),
        'max_depth': randint(3, 8)
    },
    'Gradient Boosting': {
        'n_estimators': randint(100, 300),
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': randint(3, 8)
    }
}

# Store results for comparison
results = {}

# One RandomizedSearchCV per tuned model, keyed by model name, so the test
# stage below re-runs the search that belongs to each model instead of
# whichever search object happened to be created last.
searches = {}

# Iterate through each model and evaluate performance on the validation split
for name, model in models.items():
    print(f"\nEvaluating {name}...")

    # Empty dict means "no tuning for this model"
    param_distributions = search_spaces.get(name, {})

    # Use RandomizedSearchCV for hyperparameter tuning if applicable
    if param_distributions:
        random_search = RandomizedSearchCV(model, param_distributions, n_iter=30, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
        random_search.fit(X_train_split, Y_train_split)
        searches[name] = random_search
        best_model = random_search.best_estimator_
    else:
        model.fit(X_train_split, Y_train_split)
        best_model = model

    # Make predictions
    Y_pred = best_model.predict(X_val_split)

    # Store evaluation metrics
    results[name] = {
        'accuracy': accuracy_score(Y_val_split, Y_pred),
        'report': classification_report(Y_val_split, Y_pred, output_dict=True),
        'confusion_matrix': confusion_matrix(Y_val_split, Y_pred)
    }

    # Print evaluation metrics (the stored report is a dict, so the text
    # version is generated separately for display)
    print(f"Validation Accuracy for {name}: {results[name]['accuracy']}")
    print("Validation Classification Report:\n", classification_report(Y_val_split, Y_pred))
    print("Validation Confusion Matrix:\n", results[name]['confusion_matrix'])

# Load the test data
Dtest = pd.read_csv('/content/drive/MyDrive/Dataset/X_Test_Data_Input.csv')
# NOTE(review): assumes Y_test rows are in the same order as Dtest rows;
# the labels are compared positionally below -- confirm against the dataset.
Y_test = pd.read_csv('/content/drive/MyDrive/Dataset/Y_Test_Data_Target.csv')  # Assuming you have the true labels
Dtest_id = Dtest['ID']
Dtest.drop(columns=['ID'], inplace=True)

# Apply the same (already fitted) transformations and feature selection to
# the test data
Dtest_preprocessed = preprocessor.transform(Dtest)
Dtest_selected = Dtest_preprocessed[:, top_feature_indices]

# Evaluate models on the test data, refitting on the whole training sample
# (training + validation rows)
for name, model in models.items():
    print(f"\nEvaluating {name} on test data...")

    if name in searches:
        # Re-run this model's own hyperparameter search on the full sample
        random_search = searches[name]
        random_search.fit(X_train_selected, Y_train)
        best_model = random_search.best_estimator_
    else:
        model.fit(X_train_selected, Y_train)
        best_model = model

    # Make predictions on the test set
    Y_pred_test = best_model.predict(Dtest_selected)

    # Print evaluation metrics for test data
    print(f"Test Accuracy for {name}: {accuracy_score(Y_test['target'], Y_pred_test)}")
    print("Test Classification Report:\n", classification_report(Y_test['target'], Y_pred_test))
    print("Test Confusion Matrix:\n", confusion_matrix(Y_test['target'], Y_pred_test))

    # Prepare the submission file
    submission = pd.DataFrame({'ID': Dtest_id, 'target': Y_pred_test})
    submission.to_csv(f'submission_{name}.csv', index=False)  # Save submission files with model names


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Evaluating Random Forest...
Validation Accuracy for Random Forest: 0.9764376233840667
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99     14214
           1       0.83      0.95      0.88      1489

    accuracy                           0.98     15703
   macro avg       0.91      0.96      0.94     15703
weighted avg       0.98      0.98      0.98     15703

Validation Confusion Matrix:
 [[13923   291]
 [   79  1410]]

Evaluating Gradient Boosting...
Validation Accuracy for Gradient Boosting: 0.9765649875819907
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99     14214
           1       0.83      0.94      0.88      1489

    accuracy                           0.98     15703
   macro avg 

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import randint

# Read the training inputs and the training targets.
X_train = pd.read_csv('/content/drive/MyDrive/Dataset/X_Train_Data_Input.csv')
Y_train = pd.read_csv('/content/drive/MyDrive/Dataset/Y_Train_Data_Target.csv')

# Remove the ID column from both frames; Y_train stays a DataFrame whose
# 'target' column is used as the label below.
X_train = X_train.drop(columns=['ID'])
Y_train = Y_train.drop(columns=['ID'])

# Column names grouped by dtype: int64/float64 are treated as numeric,
# object columns as categorical.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit the preprocessor on the full training data and transform it.
# NOTE(review): the preprocessor and the feature ranking below are fitted
# before the train/validation split, so validation rows influence both --
# confirm this is acceptable for the reported validation scores.
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Rank the transformed columns by random-forest importance and keep the 10
# highest-ranked ones (argsort is ascending, so the last 10 are the largest).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_preprocessed, Y_train['target'])
importances = rf.feature_importances_
top_feature_indices = np.argsort(importances)[-10:]  # Select top 10 features

X_train_selected = X_train_preprocessed[:, top_feature_indices]

# Hold out 20% of the selected-feature matrix for validation.
X_train_split, X_val_split, Y_train_split, Y_val_split = train_test_split(
    X_train_selected,
    Y_train['target'],
    test_size=0.2,
    random_state=42,
)

# Estimator to tune: a gradient-boosted tree classifier with default settings.
model = GradientBoostingClassifier()

# Search space sampled by the randomized search: integer ranges for the
# number of trees and tree depth, a fixed list for the learning rate.
param_distributions = dict(
    n_estimators=randint(100, 300),
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=randint(3, 8),
)

# Try 30 sampled configurations with 3-fold cross-validation, scored by
# accuracy, using all available cores.
random_search = RandomizedSearchCV(
    model,
    param_distributions,
    n_iter=30,
    scoring='accuracy',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train_split, Y_train_split)

# Estimator refitted with the best configuration found by the search.
best_model = random_search.best_estimator_

# Score the tuned model on the held-out validation rows.
Y_pred_val = best_model.predict(X_val_split)

# Report the chosen hyperparameters and the validation metrics.
print("Best Hyperparameters:", random_search.best_params_)
print("Validation Accuracy:", accuracy_score(Y_val_split, Y_pred_val))
print("Validation Classification Report:\n", classification_report(Y_val_split, Y_pred_val))
print("Validation Confusion Matrix:\n", confusion_matrix(Y_val_split, Y_pred_val))

# Load the test data
Dtest = pd.read_csv('/content/drive/MyDrive/Dataset/X_Test_Data_Input.csv')
# NOTE(review): assumes Y_test rows are in the same order as Dtest rows;
# the labels are compared positionally below -- confirm against the dataset.
Y_test = pd.read_csv('/content/drive/MyDrive/Dataset/Y_Test_Data_Target.csv')  # Assuming you have the true labels

# Keep the IDs for the submission file before dropping the column, so this
# cell does not depend on a variable left over from another cell.
Dtest_id = Dtest['ID']
Dtest = Dtest.drop(columns=['ID'])

# Apply the same preprocessing and feature selection to the test data.
# Missing values are imputed inside `preprocessor` with the statistics learned
# from the training data; no separate imputer is fitted on the test set, which
# would use test-set means and would fail on non-numeric columns.
Dtest_preprocessed = preprocessor.transform(Dtest)
Dtest_selected = Dtest_preprocessed[:, top_feature_indices]

# Predict on the test set
Y_pred_test = best_model.predict(Dtest_selected)

# Print evaluation metrics for test set
print("Test Accuracy:", accuracy_score(Y_test['target'], Y_pred_test))
print("Test Classification Report:\n", classification_report(Y_test['target'], Y_pred_test))
print("Test Confusion Matrix:\n", confusion_matrix(Y_test['target'], Y_pred_test))

# Prepare the submission file
submission = pd.DataFrame({'ID': Dtest_id, 'target': Y_pred_test})
submission.to_csv('submission.csv', index=False)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 188}
Validation Accuracy: 0.9766728014927369
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99    142275
           1       0.84      0.93      0.88     14752

    accuracy                           0.98    157027
   macro avg       0.92      0.95      0.93    157027
weighted avg       0.98      0.98      0.98    157027

Validation Confusion Matrix:
 [[139702   2573]
 [  1090  13662]]
Test Accuracy: 0.9766384422571376
Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99    237034
           1       0.84      0.93      0.88     24678

    accuracy                           0.98    261712
   macro avg       0.92      0.96      0.93    261712
weighted avg       0.98      0.98      0.98    261712

Test Confusion Matrix:
 [[232585   4449]
 [  1665  23013]]
