# Model Selection and Training Notebook

## Objectives:
- Train multiple machine learning models on the dataset.
- Perform hyperparameter tuning to find the best model.
- Save trained models for further evaluation.

1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import joblib

2. Load Data

In [2]:
import os
print(os.getcwd())

# Load preprocessed data
TrainSet_encoded = pd.read_csv('/workspace/bicycle_thefts_berlin/outputs/datasets/featured/TrainSet_Featured.csv')
TestSet_encoded = pd.read_csv('/workspace/bicycle_thefts_berlin/outputs/datasets/featured/TestSet_Featured.csv')

# Inspect the unique values in potential target columns
print(TrainSet_encoded['VERSUCH'].unique())
print(TrainSet_encoded['ERFASSUNGSGRUND'].unique())
print(TrainSet_encoded['DELIKT_Keller- und Bodeneinbruch'].unique())

# Assuming 'VERSUCH' is the target column
target_column = 'VERSUCH'

# Separating features and target variable
X_train = TrainSet_encoded.drop(target_column, axis=1)  
y_train = TrainSet_encoded[target_column]
X_test = TestSet_encoded.drop(target_column, axis=1)
y_test = TestSet_encoded[target_column]

print("Data has been loaded and split into features and target.")

/workspace/bicycle_thefts_berlin/jupyter_notebooks
['0' '1' 'Unbekannt' 0 1]
['Sonstiger schwerer Diebstahl von FahrrÃ¤dern'
 'Einfacher Diebstahl von FahrrÃ¤dern'
 'Sonstiger schwerer Diebstahl in/aus Keller/Boden von FahrrÃ¤dern'
 'Einfacher Diebstahl aus Keller/Boden von FahrrÃ¤dern']
[False  True]
Data has been loaded and split into features and target.


  TrainSet_encoded = pd.read_csv('/workspace/bicycle_thefts_berlin/outputs/datasets/featured/TrainSet_Featured.csv')


3. Model Selection
We’ll define a list of models that we want to evaluate.

In [3]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

4. Training Models
For each model, train it on the training data and store the results.

In [4]:
# Ensure models are saved correctly after training
model_performance = {}  # Initialize model performance dictionary

for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Save the trained model
    joblib.dump(model, f'{model_name}_model.pkl')
    print(f"{model_name} saved as {model_name}_model.pkl")
    
    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Calculate accuracy and F1-score
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    train_f1 = f1_score(y_train, y_pred_train, average='weighted')
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')
    
    # Store performance metrics
    model_performance[model_name] = {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'train_f1': train_f1,
        'test_f1': test_f1
    }
    print(f"{model_name} -> Test Accuracy: {test_accuracy:.4f}, Test F1: {test_f1:.4f}")

# Save feature names for alignment in evaluation
joblib.dump(X_train.columns.tolist(), 'model_columns.pkl')
print("Feature columns saved for alignment during evaluation.")


Training Logistic Regression...


ValueError: could not convert string to float: '2023-05-04'

5. Hyperparameter Tuning (Optional)
Use GridSearchCV for hyperparameter tuning on the selected model.

In [6]:
# Example: Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)

# Save the best model
best_rf_model = grid_search.best_estimator_
joblib.dump(best_rf_model, 'best_random_forest_model.pkl')

TypeError: '<' not supported between instances of 'int' and 'str'

6. Save the Best Model
Save the trained models for further evaluation.

In [7]:
# Save each model
for model_name, model in models.items():
    joblib.dump(model, f'{model_name}_model.pkl')
    print(f"{model_name} saved as {model_name}_model.pkl")

Logistic Regression saved as Logistic Regression_model.pkl
Random Forest saved as Random Forest_model.pkl
SVM saved as SVM_model.pkl
K-Nearest Neighbors saved as K-Nearest Neighbors_model.pkl


Bad pipe message: %s [b'titute-ide.net\r\nUser-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like G', b'ko) Chrome/132.0.0.0 Safari/537.36\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,im', b'e/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7\r\nAccept-Encoding: ', b'ip, deflate, br, zstd\r\nAccept-Language: en-US,en;q=0.9\r\nCookie: gitpod-user=true\r\nPriority: u=0, i\r\nRe']


7. Summary of Results
Summarize the performance of the trained models.

In [14]:
results_df = pd.DataFrame(model_performance).T
print(results_df)

# Save results to a CSV
results_df.to_csv('model_performance_summary.csv')

                     train_accuracy  test_accuracy  train_f1   test_f1
Logistic Regression        0.996000       0.997566  0.994004  0.996350
Random Forest              0.996116       0.997102  0.994287  0.996118
SVM                        0.996000       0.997566  0.994004  0.996350
K-Nearest Neighbors        0.996000       0.997566  0.994004  0.996350


8. Next Steps
The next steps to improve and finalize the model development process are as follows:

1. Cross-Validation for Model Evaluation:

Perform cross-validation to assess how well the model generalizes to new data. This involves dividing the dataset into multiple folds and training the model on different subsets, which provides a more robust evaluation of its performance. Use techniques like cross_val_score or GridSearchCV in sklearn to implement this.


In [8]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores: ", scores)
print("Mean CV Accuracy: ", scores.mean())

Cross-Validation Accuracy Scores:  [0.99608696 0.99608696 0.99594203 0.99594203 0.99594203]
Mean CV Accuracy:  0.9960000000000001


In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best Hyperparameters: ", grid_search.best_params_)

Best Hyperparameters:  {'max_depth': 10, 'n_estimators': 200}


In [12]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      8606
         1.0       0.00      0.00      0.00        21

    accuracy                           1.00      8627
   macro avg       0.50      0.50      0.50      8627
weighted avg       1.00      1.00      1.00      8627



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
