# Model Selection and Training Notebook

## Objectives:
- Train multiple machine learning models on the dataset.
- Perform hyperparameter tuning to find the best model.
- Save trained models for further evaluation.

1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib
import os

2. Load Data

In [19]:
# Display current working directory
print("Current working directory:", os.getcwd())

# Load preprocessed data
train_data_path = '/workspace/bicycle_thefts_berlin/outputs/datasets/featured/TrainSet_Featured.csv'
test_data_path = '/workspace/bicycle_thefts_berlin/outputs/datasets/featured/TestSet_Featured.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Inspect the target column and others
print(train_data['VERSUCH'].unique())
print(train_data['ERFASSUNGSGRUND'].unique())
print(train_data['DELIKT_Keller- und Bodeneinbruch'].unique())

# Define target column
target_column = 'VERSUCH'

# Separate features and target variable
X_train = train_data.drop(columns=[target_column])
y_train = train_data[target_column]
X_test = test_data.drop(columns=[target_column])
y_test = test_data[target_column]

print("Data has been loaded and split into features and target.")

Current working directory: /workspace/bicycle_thefts_berlin/jupyter_notebooks
['0' '1' 'Unbekannt' 0 1]
['Sonstiger schwerer Diebstahl von FahrrÃ¤dern'
 'Einfacher Diebstahl von FahrrÃ¤dern'
 'Sonstiger schwerer Diebstahl in/aus Keller/Boden von FahrrÃ¤dern'
 'Einfacher Diebstahl aus Keller/Boden von FahrrÃ¤dern']
[False  True]
Data has been loaded and split into features and target.


  train_data = pd.read_csv(train_data_path)


3. Preprocessing Data and Validating Columns

In [21]:
# Validate Target Variable
def clean_and_validate_target(target):
    # Identify non-numeric values
    print("Unique target values before cleaning:", target.unique())

    # Convert valid numeric-like strings to integers
    target = pd.to_numeric(target, errors='coerce')

    # Remove invalid entries (e.g., NaN values resulting from 'Unbekannt' or other non-numeric data)
    target = target.dropna().astype(int)

    print("Validated and cleaned target column values:", target.unique())
    return target

# Apply the function to y_train and y_test
y_train = clean_and_validate_target(y_train)
y_test = clean_and_validate_target(y_test)

# Preprocess Features
def preprocess_data(df):
    # Convert date columns to numeric timestamp if applicable
    if 'ANGELEGT_AM' in df.columns:
        df['ANGELEGT_AM'] = pd.to_datetime(df['ANGELEGT_AM'], errors='coerce').astype('int64') // 10**9

    # Convert all columns to numeric, applying one-hot encoding for categorical data
    df = pd.get_dummies(df, drop_first=True)
    df = df.apply(pd.to_numeric, errors='coerce').fillna(0)  # Handle missing values
    return df

X_train = preprocess_data(X_train)
X_test = preprocess_data(X_test)

# Ensure column alignment between train and test sets
X_train, X_test = X_train.align(X_test, join='inner', axis=1)

print("Data preprocessing completed.")

Unique target values before cleaning: [0 1]
Validated and cleaned target column values: [0 1]
Unique target values before cleaning: [0 1]
Validated and cleaned target column values: [0 1]
Data preprocessing completed.


4. Model Selection
We’ll define a list of models that we want to evaluate.

In [22]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

5. Training Models
For each model, train it on the training data and store the results.

In [23]:
# Initialize model performance dictionary
model_performance = {}

# Function to align and clean data
def clean_and_align_data(X, y):
    """
    Aligns X and y by ensuring their indices match and removing NaN values.
    """
    # Drop NaN values from the target variable
    y = y.dropna()

    # Align X and y based on their indices
    X = X.loc[y.index]

    # Fill any remaining missing values in X
    X = X.fillna(0)

    return X, y

# Align and clean the training and test data
X_train, y_train = clean_and_align_data(X_train, y_train)
X_test, y_test = clean_and_align_data(X_test, y_test)

# Debug: Check the shape of the aligned data
print(f"Aligned X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"Aligned X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Step 1: Ensure X and y have the same length
assert len(X_train) == len(y_train), "X_train and y_train have mismatched lengths."
assert len(X_test) == len(y_test), "X_test and y_test have mismatched lengths."

# Step 2: Train models and calculate performance metrics
for model_name, model in models.items():
    try:
        print(f"Training {model_name}...")

        # Train the model
        model.fit(X_train, y_train)

        # Save the trained model
        model_path = f'{model_name}_model.pkl'
        joblib.dump(model, model_path)
        print(f"{model_name} saved as {model_path}")

        # Predictions on train and test sets
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        # Calculate accuracy and F1-score
        train_accuracy = accuracy_score(y_train, y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Store performance metrics
        model_performance[model_name] = {
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1
        }

        print(f"{model_name} -> Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}, Test F1: {test_f1:.4f}")

    except ValueError as ve:
        print(f"ValueError during {model_name} training: {ve}")
    except NotFittedError as nfe:
        print(f"NotFittedError during {model_name} training: {nfe}")
    except Exception as e:
        print(f"Unexpected error during {model_name} training: {e}")

# Save feature names for alignment during evaluation
joblib.dump(X_train.columns.tolist(), 'model_columns.pkl')
print("Feature columns saved for alignment during evaluation.")

# Save performance metrics to CSV
results_df = pd.DataFrame(model_performance).T
print(results_df)

results_df.to_csv('model_performance_summary.csv', index=True)
print("Model performance results saved to 'model_performance_summary.csv'.")

Aligned X_train shape: (34500, 708), y_train shape: (34500,)
Aligned X_test shape: (8627, 708), y_test shape: (8627,)
Training Logistic Regression...
Logistic Regression saved as Logistic Regression_model.pkl
Logistic Regression -> Train Accuracy: 0.9960, Test Accuracy: 0.9976, Test F1: 0.9964
Training Random Forest...
Random Forest saved as Random Forest_model.pkl
Random Forest -> Train Accuracy: 0.9998, Test Accuracy: 0.9973, Test F1: 0.9962
Training SVM...
SVM saved as SVM_model.pkl
SVM -> Train Accuracy: 0.9960, Test Accuracy: 0.9976, Test F1: 0.9964
Training K-Nearest Neighbors...
K-Nearest Neighbors saved as K-Nearest Neighbors_model.pkl
K-Nearest Neighbors -> Train Accuracy: 0.9960, Test Accuracy: 0.9974, Test F1: 0.9963
Feature columns saved for alignment during evaluation.
                     train_accuracy  test_accuracy  train_f1   test_f1
Logistic Regression        0.996000       0.997566  0.994004  0.996350
Random Forest              0.999768       0.997334  0.999766  0.9

6. Training Models and Saving Results

In [24]:
# Initialize model performance dictionary
model_performance = {}

for model_name, model in models.items():
    try:
        print(f"Training {model_name}...")

        # Train the model
        model.fit(X_train, y_train)

        # Save the trained model
        model_file_path = f'outputs/models/{model_name}_model.pkl'
        os.makedirs(os.path.dirname(model_file_path), exist_ok=True)
        joblib.dump(model, model_file_path)
        print(f"{model_name} saved as {model_file_path}")

        # Predictions
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        # Calculate accuracy and F1-score
        train_accuracy = accuracy_score(y_train, y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)
        train_f1 = f1_score(y_train, y_pred_train, average='weighted')
        test_f1 = f1_score(y_test, y_pred_test, average='weighted')

        # Store performance metrics
        model_performance[model_name] = {
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'train_f1': train_f1,
            'test_f1': test_f1
        }

        print(f"{model_name} -> Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}, Test F1: {test_f1:.4f}")

    except Exception as e:
        print(f"Error training {model_name}: {e}")

# Save feature names for alignment during evaluation
joblib.dump(X_train.columns.tolist(), 'outputs/models/model_columns.pkl')
print("Feature columns saved for alignment during evaluation.")

Training Logistic Regression...
Logistic Regression saved as outputs/models/Logistic Regression_model.pkl
Logistic Regression -> Train Accuracy: 0.9960, Test Accuracy: 0.9976, Test F1: 0.9964
Training Random Forest...
Random Forest saved as outputs/models/Random Forest_model.pkl
Random Forest -> Train Accuracy: 0.9997, Test Accuracy: 0.9971, Test F1: 0.9961
Training SVM...
SVM saved as outputs/models/SVM_model.pkl
SVM -> Train Accuracy: 0.9960, Test Accuracy: 0.9976, Test F1: 0.9964
Training K-Nearest Neighbors...
K-Nearest Neighbors saved as outputs/models/K-Nearest Neighbors_model.pkl
K-Nearest Neighbors -> Train Accuracy: 0.9960, Test Accuracy: 0.9974, Test F1: 0.9963
Feature columns saved for alignment during evaluation.


7. Hyperparameter Tuning

In [25]:
# Example: Hyperparameter tuning for Random Forest
try:
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }

    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1', cv=5)
    grid_search.fit(X_train, y_train)

    print("Best parameters found: ", grid_search.best_params_)
    print("Best F1 score: ", grid_search.best_score_)

    # Save the best model
    best_rf_model = grid_search.best_estimator_
    joblib.dump(best_rf_model, 'outputs/models/best_random_forest_model.pkl')
    print("Best Random Forest model saved.")
except Exception as e:
    print(f"Error during hyperparameter tuning: {e}")


Best parameters found:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best F1 score:  0.025806451612903226
Best Random Forest model saved.


8. Summary of Results

In [15]:
# Summary of Results
results_df = pd.DataFrame(model_performance).T
print(results_df)

# Save results to a CSV file
results_path = 'outputs/model_performance_summary.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.to_csv(results_path)

print(f"Model performance summary saved to {results_path}")

                     train_accuracy  test_accuracy  train_f1   test_f1
Logistic Regression        0.996000       0.997566  0.994004  0.996350
Random Forest              0.999768       0.997334  0.999766  0.996234
SVM                        0.996000       0.997566  0.994004  0.996350
K-Nearest Neighbors        0.996000       0.997566  0.994004  0.996350
Model performance summary saved to outputs/model_performance_summary.csv


9. Cross-Validation for Model Evaluation:

Perform cross-validation to assess how well the model generalizes to new data. This involves dividing the dataset into multiple folds and training the model on different subsets, which provides a more robust evaluation of its performance. Use techniques like cross_val_score or GridSearchCV in sklearn to implement this.


In [16]:
from sklearn.model_selection import cross_val_score

# Example: Cross-validation on Random Forest
cv_model = RandomForestClassifier(n_estimators=100, max_depth=10)
cv_scores = cross_val_score(cv_model, X_train, y_train, cv=5, scoring='accuracy')

print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Cross-Validation Accuracy Scores: [0.99608696 0.99608696 0.99594203 0.99594203 0.99594203]
Mean CV Accuracy: 0.9960000000000001


In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best Hyperparameters: ", grid_search.best_params_)

Best Hyperparameters:  {'max_depth': 10, 'n_estimators': 200}


In [12]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      8606
         1.0       0.00      0.00      0.00        21

    accuracy                           1.00      8627
   macro avg       0.50      0.50      0.50      8627
weighted avg       1.00      1.00      1.00      8627



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
