In [None]:
%pip install scikit-learn

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Load the crimes dataset into a DataFrame. The path is relative, so the
# Parquet file must sit in the notebook's working directory.
# NOTE(review): pd.read_parquet needs a Parquet engine (pyarrow or fastparquet)
# installed; the install cell above only installs scikit-learn - confirm.
df = pd.read_parquet("./Crimes_2019_to_2024_Final.parquet")

In [5]:
# Target vector: the 'Arrest' column.
y = df['Arrest']

# Feature matrix: drop the target plus the columns that hurt the model, then
# expand every categorical variable into dummy indicators (first level dropped).
X = pd.get_dummies(
    df.drop(columns=['Arrest', 'datetime', 'Community Area Name', 'Description']),
    drop_first=True,
)

# Stratified 75/25 train/test split so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

#### Modelos simples

In [3]:
# Dummy Classifier - majority-class strategy.
# Always predicts the most frequent label seen in training; it is the floor
# that any real model has to beat.
dummy_model = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

# Hard predictions and positive-class scores on the test set.
y_pred_dummy = dummy_model.predict(X_test)
# Column 1 is taken as the positive class - assumes classes_ is [False, True].
y_proba_dummy = dummy_model.predict_proba(X_test)[:, 1]

# Accuracy is not reliable with imbalanced classes (kept only as a rough
# reference); F1 on the positive class and ROC-AUC are the metrics that matter.
accuracy_dummy = accuracy_score(y_test, y_pred_dummy)
f1_dummy = f1_score(y_test, y_pred_dummy, pos_label=True)
roc_auc_dummy = roc_auc_score(y_test, y_proba_dummy)

print(
    "Estratégia Dummy - Classe Majoritária (Arrest = 'False')",
    f"Acurácia: {accuracy_dummy:.4f}",
    f"F1-score (Para 'True'): {f1_dummy:.4f}",
    f"ROC-AUC: {roc_auc_dummy:.4f}",
    sep="\n",
)

Estratégia Dummy - Classe Majoritária (Arrest = 'False')
Acurácia: 0.8533
F1-score (Para 'True'): 0.0000
ROC-AUC: 0.5000


In [None]:
# Decision Tree Classifier - baseline model
# (Logistic Regression takes too long to train on this dataset.)

# max_depth=5 keeps the tree shallow and therefore less complex;
# class_weight='balanced' reweights the classes to offset the imbalance.
tree_baseline = DecisionTreeClassifier(max_depth=5, 
                                       class_weight='balanced', 
                                       random_state=42)
tree_baseline.fit(X_train, y_train)

# Hard predictions and positive-class scores on the test set.
y_pred_tree = tree_baseline.predict(X_test)
# Column 1 is taken as the positive class - assumes classes_ is [False, True].
y_proba_tree = tree_baseline.predict_proba(X_test)[:, 1]

# Evaluate with accuracy, F1-score (positive class) and ROC-AUC.
print("Decision Tree Classifier - Modelo Base")
print(f"Acurácia: {accuracy_score(y_test, y_pred_tree):.4f}")
print(f"F1-score (Para 'True'): {f1_score(y_test, y_pred_tree, pos_label=True):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba_tree):.4f}")

# Decision Tree Classifier - Modelo Base
# Acurácia: 0.8157
# F1-score (Para 'True'): 0.4703
# ROC-AUC: 0.7462


#### Modelo principal

In [6]:
# Random Forest Classifier - main model.
# 50 trees capped at depth 10, class-balanced weights, trained on all cores.
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

# Hard predictions and positive-class scores on the test set.
y_pred_rf = rf_model.predict(X_test)
# Column 1 is taken as the positive class - assumes classes_ is [False, True].
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Evaluate with accuracy, F1-score (positive class) and ROC-AUC.
print(
    "Random Forest Classifier - Modelo Principal",
    f"Acurácia: {accuracy_score(y_test, y_pred_rf):.4f}",
    f"F1-score (Para 'True'): {f1_score(y_test, y_pred_rf, pos_label=True):.4f}",
    f"ROC-AUC: {roc_auc_score(y_test, y_proba_rf):.4f}",
    sep="\n",
)

: 

In [None]:
# Hyperparameter search space for the random forest. RandomizedSearchCV draws
# n_iter random combinations from these lists instead of trying all of them.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# 10 candidates x 3-fold CV, ranked by F1 on the positive class.
# NOTE(review): both rf_model and the search request n_jobs=-1; if the machine
# gets oversubscribed, consider leaving only one of the two levels parallel.
rf_random = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='f1',
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Tune on a 10% subsample of the training data to save time.
# A single stratified split keeps X and y aligned by construction and preserves
# the class ratio, instead of relying on two independent .sample() calls that
# only match because they share the same seed and length.
X_tune, _X_rest, y_tune, _y_rest = train_test_split(
    X_train, y_train, train_size=0.1, random_state=42, stratify=y_train
)
# The remaining 90% is not needed here; release it right away.
del _X_rest, _y_rest

rf_random.fit(X_tune, y_tune)

print("Melhores Hiperparâmetros Encontrados:")
print(rf_random.best_params_)