# Model Selection and Training Notebook

## Objectives:
- Train multiple machine learning models on the dataset.
- Perform hyperparameter tuning to find the best model.
- Save trained models for further evaluation.

1. Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import joblib

2. Check column names in the dataset

FileNotFoundError: [Errno 2] No such file or directory: 'outputs/datasets/featured/TrainSet_Featured.csv'

2. Load Data

In [4]:
import os
print(os.getcwd())

# Load preprocessed data
TrainSet_encoded = pd.read_csv('/workspace/bicycle_thefts_berlin/outputs/datasets/featured/TrainSet_Featured.csv')
TestSet_encoded = pd.read_csv('/workspace/bicycle_thefts_berlin/outputs/datasets/featured/TestSet_Featured.csv')

# Inspect the unique values in potential target columns
print(TrainSet_encoded['VERSUCH'].unique())
print(TrainSet_encoded['ERFASSUNGSGRUND'].unique())
print(TrainSet_encoded['DELIKT_Keller- und Bodeneinbruch'].unique())

# Assuming 'VERSUCH' is the target column
target_column = 'VERSUCH'

# Separating features and target variable
X_train = TrainSet_encoded.drop(target_column, axis=1)  
y_train = TrainSet_encoded[target_column]
X_test = TestSet_encoded.drop(target_column, axis=1)
y_test = TestSet_encoded[target_column]

print("Data has been loaded and split into features and target.")

/workspace/bicycle_thefts_berlin/jupyter_notebooks
['0' '1' 'Unbekannt' 0 1]
['Sonstiger schwerer Diebstahl von FahrrÃ¤dern'
 'Einfacher Diebstahl von FahrrÃ¤dern'
 'Sonstiger schwerer Diebstahl in/aus Keller/Boden von FahrrÃ¤dern'
 'Einfacher Diebstahl aus Keller/Boden von FahrrÃ¤dern']
[False  True]
Data has been loaded and split into features and target.


  TrainSet_encoded = pd.read_csv('/workspace/bicycle_thefts_berlin/outputs/datasets/featured/TrainSet_Featured.csv')


3. Model Selection
We’ll define a list of models that we want to evaluate.

In [5]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

4. Training Models
For each model, train it on the training data and store the results.

In [10]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Check the data types in X_train and y_train
print(X_train.dtypes)
print(y_train.dtypes)

# Check if there are any non-numeric columns in X_train and y_train
non_numeric_columns = X_train.select_dtypes(exclude=['number']).columns
print("Non-numeric columns in X_train:", non_numeric_columns)

if len(non_numeric_columns) > 0:
    # Apply get_dummies to the non-numeric columns
    X_train = pd.get_dummies(X_train, drop_first=True)
    X_test = pd.get_dummies(X_test, drop_first=True)

    # Align the columns between train and test after encoding
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Ensure y_train is numeric
if y_train.dtypes != 'int64' and y_train.dtypes != 'float64':
    y_train = pd.to_numeric(y_train, errors='coerce')

# Check for NaN or invalid values after conversion
print("Any NaN in y_train:", y_train.isnull().sum())

# Drop NaN rows if any
if y_train.isnull().sum() > 0:
    y_train = y_train.dropna()

# Remove rows with NaN in y_train and the corresponding rows in X_train
valid_indices = y_train.dropna().index  # Get indices where y_train is not NaN

# Filter both X_train and y_train using these indices
X_train = X_train.loc[valid_indices]
y_train = y_train.loc[valid_indices]

# Now check if the lengths are the same
print("Length of X_train:", len(X_train))
print("Length of y_train:", len(y_train))

# Ensure y_test is numeric
if y_test.dtypes != 'int64' and y_test.dtypes != 'float64':
    y_test = pd.to_numeric(y_test, errors='coerce')

# Check for NaN in y_test
if y_test.isnull().sum() > 0:
    y_test = y_test.dropna()

# Remove rows with NaN in X_test that correspond to NaN in y_test
valid_indices_test = y_test.dropna().index  # Get indices where y_test is not NaN
X_test = X_test.loc[valid_indices_test]
y_test = y_test.loc[valid_indices_test]

# Now check if the lengths are the same
print("Length of X_test:", len(X_test))
print("Length of y_test:", len(y_test))

# Check and convert date columns
if 'TATZEIT_ANFANG_DATUM' in TrainSet_encoded.columns:
    TrainSet_encoded['TATZEIT_ANFANG_DATUM'] = pd.to_datetime(TrainSet_encoded['TATZEIT_ANFANG_DATUM'])
    TrainSet_encoded['TATZEIT_ANFANG_YEAR'] = TrainSet_encoded['TATZEIT_ANFANG_DATUM'].dt.year
    TrainSet_encoded['TATZEIT_ANFANG_MONTH'] = TrainSet_encoded['TATZEIT_ANFANG_DATUM'].dt.month
    TrainSet_encoded = TrainSet_encoded.drop(columns=['TATZEIT_ANFANG_DATUM', 'TATZEIT_ENDE_DATUM'])

if 'TATZEIT_ANFANG_DATUM' in TestSet_encoded.columns:
    TestSet_encoded['TATZEIT_ANFANG_DATUM'] = pd.to_datetime(TestSet_encoded['TATZEIT_ANFANG_DATUM'])
    TestSet_encoded['TATZEIT_ANFANG_YEAR'] = TestSet_encoded['TATZEIT_ANFANG_DATUM'].dt.year
    TestSet_encoded['TATZEIT_ANFANG_MONTH'] = TestSet_encoded['TATZEIT_ANFANG_DATUM'].dt.month
    TestSet_encoded = TestSet_encoded.drop(columns=['TATZEIT_ANFANG_DATUM', 'TATZEIT_ENDE_DATUM'])

# Ensure there are no object (string) columns left in the data
# Drop any non-numeric columns or convert them to numeric
X_train = X_train.select_dtypes(include=['number'])
X_test = X_test.select_dtypes(include=['number'])

# Dictionary to store model performance
model_performance = {}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Calculate accuracy and F1-score
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    train_f1 = f1_score(y_train, y_pred_train)
    test_f1 = f1_score(y_test, y_pred_test)
    
    # Store the performance
    model_performance[model_name] = {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'train_f1': train_f1,
        'test_f1': test_f1
    }
    
    print(f"{model_name} -> Test Accuracy: {test_accuracy:.4f}, Test F1: {test_f1:.4f}")

TATZEIT_ANFANG_STUNDE    float64
TATZEIT_ENDE_STUNDE        int64
TATZEIT_ANFANG_YEAR        int64
TATZEIT_ANFANG_MONTH       int64
dtype: object
float64
Non-numeric columns in X_train: Index([], dtype='object')
Any NaN in y_train: 0
Length of X_train: 34500
Length of y_train: 34500
Length of X_test: 8627
Length of y_test: 8627
Training Logistic Regression...
Logistic Regression -> Test Accuracy: 0.9976, Test F1: 0.0000
Training Random Forest...
Random Forest -> Test Accuracy: 0.9971, Test F1: 0.0000
Training SVM...
SVM -> Test Accuracy: 0.9976, Test F1: 0.0000
Training K-Nearest Neighbors...
K-Nearest Neighbors -> Test Accuracy: 0.9976, Test F1: 0.0000


5. Hyperparameter Tuning (Optional)
Use GridSearchCV for hyperparameter tuning on the selected model.

In [11]:
# Example: Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)

# Save the best model
best_rf_model = grid_search.best_estimator_
joblib.dump(best_rf_model, 'best_random_forest_model.pkl')

Best parameters found:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best F1 score:  0.0


['best_random_forest_model.pkl']

6. Save the Best Model
Save the trained models for further evaluation.

In [12]:
# Save each model
for model_name, model in models.items():
    joblib.dump(model, f'{model_name}_model.pkl')
    print(f"{model_name} saved as {model_name}_model.pkl")

Logistic Regression saved as Logistic Regression_model.pkl
Random Forest saved as Random Forest_model.pkl
SVM saved as SVM_model.pkl
K-Nearest Neighbors saved as K-Nearest Neighbors_model.pkl


7. Summary of Results
Summarize the performance of the trained models.

In [13]:
import pandas as pd
results_df = pd.DataFrame(model_performance).T
print(results_df)

# Save results to a CSV
results_df.to_csv('model_performance_summary.csv')

                     train_accuracy  test_accuracy  train_f1  test_f1
Logistic Regression        0.996000       0.997566  0.000000      0.0
Random Forest              0.996116       0.997102  0.056338      0.0
SVM                        0.996000       0.997566  0.000000      0.0
K-Nearest Neighbors        0.996000       0.997566  0.000000      0.0


8. Next Steps
Provide a brief note on what to do next.

## Next Steps
- Perform Model Evaluation using cross-validation or other methods.
- Choose the best performing model based on metrics.
- Fine-tune the model further, if needed, before deployment.