# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.impute import SimpleImputer


# Load the raw crime records from the CSV export.
data = pd.read_csv('/content/The_Case_of_the_Predictive_Crime_Solver.csv')

# Parse both date columns into pandas datetimes.
for date_column in ('DATE OCC', 'Date Rptd'):
    data[date_column] = pd.to_datetime(data[date_column])

# Derive calendar features from the occurrence date.
occurrence = data['DATE OCC'].dt
data['year'] = occurrence.year
data['month'] = occurrence.month
data['day'] = occurrence.day

# Integer division by 100 drops the last two digits of TIME OCC.
# NOTE(review): assumes TIME OCC is stored as an HHMM integer, so the
# quotient is the hour of day -- confirm against the dataset.
data['hour'] = data['TIME OCC'] // 100


# The model predicts the occurrence hour from the columns listed below.
target = 'hour'
features = [
    'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Vict Age', 'Vict Sex',
    'Vict Descent', 'Premis Desc', 'Weapon Used Cd', 'LAT', 'LON',
    'year', 'month', 'day',
]

X, y = data[features], data[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Column groups: each group is routed to its own preprocessing branch.
categorical_features = [
    'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Vict Sex',
    'Vict Descent', 'Premis Desc', 'Weapon Used Cd',
]
numerical_features = ['Vict Age', 'LAT', 'LON', 'year', 'month', 'day']

# Numeric branch: fill missing values with the median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value,
# then one-hot encode. handle_unknown='ignore' keeps transform() from
# raising on categories that were not present during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns come first in the transformed output, followed by the
# one-hot columns (the order of this list).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a decision-tree regressor with a
# fixed seed. The step names are referenced by the parameter grid and by
# later named_steps lookups, so they must not change.
tree_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42)),
])


# Candidate hyper-parameters for the 'regressor' pipeline step
# (4 x 3 x 3 = 36 combinations).
param_grid = {
    'regressor__max_depth': [10, 20, 30, None],
    'regressor__min_samples_split': [2, 10, 20],
    'regressor__min_samples_leaf': [1, 5, 10],
}

# Exhaustive 5-fold cross-validated search, ranked by negated MSE.
grid_search = GridSearchCV(
    estimator=tree_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Score the selected model on the held-out test rows.
y_pred = grid_search.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print('Best parameters found: ', grid_search.best_params_)
print('Mean Squared Error:', test_mse)
print('R2 Score:', test_r2)


# Visualise the decision tree selected by the grid search.
fitted_pipeline = grid_search.best_estimator_
best_model = fitted_pipeline.named_steps['regressor']

# Rebuild the column names in the order the ColumnTransformer emits
# them: numeric columns first, then the one-hot encoded columns. The
# categorical branch is looked up by its name ('cat') rather than by
# position so the lookup survives a reordering of the transformers.
onehot_encoder = (
    fitted_pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['onehot']
)
plot_feature_names = numerical_features + list(
    onehot_encoder.get_feature_names_out(categorical_features)
)

plt.figure(figsize=(20, 10))
# The search only allows max_depth of 10, 20, 30 or None, so the fitted
# tree can be very large. Drawing every node is slow and yields an
# unreadable figure, so only the top levels are rendered; deeper
# subtrees are shown truncated.
tree.plot_tree(
    best_model,
    max_depth=3,
    filled=True,
    feature_names=plot_feature_names,
)
plt.show()

# A single hand-written record with the same columns as the training
# features, used as a smoke test for the fitted model.
my_test_case = pd.DataFrame([{
    'AREA NAME': 'Central',
    'Rpt Dist No': 101,
    'Part 1-2': 1,
    'Vict Age': 34,
    'Vict Sex': 'M',
    'Vict Descent': 'H',
    'Premis Desc': 'Street',
    'Weapon Used Cd': 0,
    'LAT': 34.0522,
    'LON': -118.2437,
    'year': 2022,
    'month': 3,
    'day': 22,
}])

# Run the two fitted pipeline steps by hand: preprocess the record, then
# feed the transformed row to the regressor.
fitted_steps = grid_search.best_estimator_.named_steps
my_test_case_transformed = fitted_steps['preprocessor'].transform(my_test_case)
my_prediction = fitted_steps['regressor'].predict(my_test_case_transformed)

print(f'Predicted hour for crime occuring: {my_prediction[0]}')