# Model Selection and Training Notebook

## Objectives:
- Train multiple machine learning models on the dataset.
- Perform hyperparameter tuning to find the best model.
- Save trained models for further evaluation.

1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import joblib

In [14]:
import pandas as pd
import os

# Paths where the processed files should be saved
output_dir = 'outputs/datasets/featured'
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Assuming TrainSet and TestSet are your processed datasets
# Save the datasets to the directory
TrainSet_encoded.to_csv(f'{output_dir}/TrainSet_Featured.csv', index=False)
TestSet_encoded.to_csv(f'{output_dir}/TestSet_Featured.csv', index=False)

print("Files saved successfully!")


NameError: name 'TrainSet_encoded' is not defined

2. Check column names in the dataset

In [13]:
import pandas as pd
import os

# Adjust the file path if necessary
file_path_train = 'outputs/datasets/featured/TrainSet_Featured.csv'
file_path_test = 'outputs/datasets/featured/TestSet_Featured.csv'

# Create the directory if it doesn't exist
os.makedirs('outputs/datasets/featured', exist_ok=True)

try:
    # Load a small portion of the data to inspect columns
    train_data = pd.read_csv(file_path_train, nrows=5)
    print(train_data.columns)

    test_data = pd.read_csv(file_path_test, nrows=5)
    print(test_data.columns)

except FileNotFoundError as e:
    print(f"File not found: {e}")

# List files in the featured directory
print(os.listdir('outputs/datasets/featured'))


File not found: [Errno 2] No such file or directory: 'outputs/datasets/featured/TrainSet_Featured.csv'
[]


2. Load Data

In [2]:
# Load preprocessed data
TrainSet_encoded = pd.read_csv('outputs/datasets/featured/TrainSet_Featured.csv')
TestSet_encoded = pd.read_csv('outputs/datasets/featured/TestSet_Featured.csv')

# Separating features and target variable
X_train = TrainSet_encoded.drop('TARGET_COLUMN', axis=1)  # Replace 'TARGET_COLUMN' with actual column name
y_train = TrainSet_encoded['TARGET_COLUMN']
X_test = TestSet_encoded.drop('TARGET_COLUMN', axis=1)  # Replace 'TARGET_COLUMN' with actual column name
y_test = TestSet_encoded['TARGET_COLUMN']

FileNotFoundError: [Errno 2] No such file or directory: 'outputs/datasets/featured/TrainSet_Featured.csv'

3. Model Selection
Weâ€™ll define a list of models that we want to evaluate.

In [None]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

4. Training Models
For each model, train it on the training data and store the results.

In [None]:
# Dictionary to store model performance
model_performance = {}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Calculate accuracy and F1-score
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    train_f1 = f1_score(y_train, y_pred_train)
    test_f1 = f1_score(y_test, y_pred_test)
    
    # Store the performance
    model_performance[model_name] = {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'train_f1': train_f1,
        'test_f1': test_f1
    }
    
    print(f"{model_name} -> Test Accuracy: {test_accuracy:.4f}, Test F1: {test_f1:.4f}")

5. Hyperparameter Tuning (Optional)
Use GridSearchCV for hyperparameter tuning on the selected model.

In [None]:
# Example: Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)

# Save the best model
best_rf_model = grid_search.best_estimator_
joblib.dump(best_rf_model, 'best_random_forest_model.pkl')

6. Save the Best Model
Save the trained models for further evaluation.

In [None]:
# Save each model
for model_name, model in models.items():
    joblib.dump(model, f'{model_name}_model.pkl')
    print(f"{model_name} saved as {model_name}_model.pkl")

7. Summary of Results
Summarize the performance of the trained models.

In [None]:
import pandas as pd
results_df = pd.DataFrame(model_performance).T
print(results_df)

# Save results to a CSV
results_df.to_csv('model_performance_summary.csv')

8. Next Steps
Provide a brief note on what to do next.

## Next Steps
- Perform Model Evaluation using cross-validation or other methods.
- Choose the best performing model based on metrics.
- Fine-tune the model further, if needed, before deployment.