In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
from statsmodels.api import OLS, add_constant
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

In [2]:
# Load the arrest records from a CSV file located in the notebook's working directory.
# NOTE(review): the preview below shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which
# look like index columns written by earlier to_csv calls — confirm and consider dropping them.
df = pd.read_csv("df.csv")

In [3]:
# Preview the first rows to check which columns were loaded and how they are encoded.
df.head()

Unnamed: 0.1,Unnamed: 0,arrest_date,ofns_desc,law_cat_cd,age_group,perp_sex,perp_race,arrest_boro,arrest_precinct,jurisdiction_code,:@computed_region_f5dn_yrer,:@computed_region_92fq_4b7q,arrest_year,arrest_month,arrest_day,PRCP,SNOW,TMIN,TDELTA
0,0,2019-01-26,86,1,3,0,1,2,25,0.0,7.0,36.0,2019,1,26,0.0,0.0,24,11
1,1,2019-01-26,8,2,2,1,1,4,105,0.0,63.0,47.0,2019,1,26,0.0,0.0,24,11
2,2,2019-01-26,25,1,2,1,2,3,43,0.0,58.0,31.0,2019,1,26,0.0,0.0,24,11
3,3,2019-01-26,8,2,2,0,1,3,52,0.0,24.0,40.0,2019,1,26,0.0,0.0,24,11
4,4,2019-01-26,17,2,2,0,3,5,120,0.0,4.0,13.0,2019,1,26,0.0,0.0,24,11


In [4]:
# Numeric columns to rescale to zero mean and unit variance.
columns_to_standardize = ['PRCP', 'SNOW', 'TMIN', 'TDELTA']

scaler = StandardScaler()

# Overwrites these columns of `df` in place with their standardized values.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed later in the notebook, so held-out rows influence the
# mean/std used for scaling.
df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

# Displaying the standardized values for verification
df[columns_to_standardize].head()

Unnamed: 0,PRCP,SNOW,TMIN,TDELTA
0,-0.35824,-0.114487,-1.470692,-0.65208
1,-0.35824,-0.114487,-1.470692,-0.65208
2,-0.35824,-0.114487,-1.470692,-0.65208
3,-0.35824,-0.114487,-1.470692,-0.65208
4,-0.35824,-0.114487,-1.470692,-0.65208


# Predicting the Crime Level

In [5]:
# Candidate predictors, in the column order the feature matrix will use.
independent_vars = [
    'ofns_desc',
    'age_group', 'perp_sex', 'perp_race',
    'arrest_boro', 'arrest_precinct', 'jurisdiction_code',
    'PRCP', 'SNOW', 'TMIN', 'TDELTA',
    'arrest_year', 'arrest_month', 'arrest_day',
]

# Column the models are trained to predict.
target_var = 'law_cat_cd'

# Feature matrix and target vector taken from the prepared frame.
X, y = df[independent_vars], df[target_var]


### Logistic Regression

In [6]:
# Recombine features and target so that rows are resampled together.
data_combined = pd.concat([X, y], axis=1)

# Separate the data by classes
class_0 = data_combined[data_combined[target_var] == 0]
class_1 = data_combined[data_combined[target_var] == 1]
class_2 = data_combined[data_combined[target_var] == 2]
class_3 = data_combined[data_combined[target_var] == 3]

# Determine the target size (equal to the largest class). All four classes are
# considered, so the result is correct whichever class is the majority.
target_size = max(len(class_0), len(class_1), len(class_2), len(class_3))


def _upsample_to_target(class_rows):
    """Return `class_rows` grown to `target_size` rows by sampling with replacement.

    A class that already has `target_size` rows (the majority class) is
    returned unchanged so its original rows are kept exactly once.
    """
    if len(class_rows) == target_size:
        return class_rows
    return resample(class_rows, replace=True, n_samples=target_size, random_state=42)


# Oversample minority classes
# NOTE(review): oversampling happens before the train/test split made later in
# the notebook, so copies of the same row can end up in both train and test.
class_0_upsampled = _upsample_to_target(class_0)
class_1_upsampled = _upsample_to_target(class_1)
class_2_upsampled = _upsample_to_target(class_2)
class_3_upsampled = _upsample_to_target(class_3)

# Combine all classes into a balanced dataset
data_balanced = pd.concat(
    [class_0_upsampled, class_1_upsampled, class_2_upsampled, class_3_upsampled]
)

# Separate features and target from the balanced dataset
X_balanced = data_balanced[independent_vars]
y_balanced = data_balanced[target_var]

# Checking the new class distribution
balanced_distribution = y_balanced.value_counts()

In [7]:
# Show the per-class row counts after oversampling; every class should have the same count.
balanced_distribution

law_cat_cd
0    1096828
1    1096828
2    1096828
3    1096828
Name: count, dtype: int64

In [8]:
# Inspect the balanced feature matrix (pandas truncates the display of a frame this large).
X_balanced

Unnamed: 0,ofns_desc,age_group,perp_sex,perp_race,arrest_boro,arrest_precinct,jurisdiction_code,PRCP,SNOW,TMIN,TDELTA,arrest_year,arrest_month,arrest_day
1579489,77,2,0,1,4,103,0.0,-0.358240,-0.114487,-0.244364,-0.850164,2013,10,23
1631860,77,1,1,1,2,6,0.0,-0.358240,-0.114487,0.164413,1.526841,2013,5,7
341386,54,3,0,1,1,73,0.0,-0.358240,-0.114487,0.748379,-0.255913,2017,5,2
1235784,77,2,0,4,4,107,0.0,0.289298,-0.114487,0.106016,-0.453996,2018,5,12
1184061,77,3,0,2,4,103,0.0,-0.358240,-0.114487,0.339602,0.338339,2014,9,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855283,74,1,0,1,2,14,1.0,-0.358240,-0.114487,-0.769933,-0.453996,2014,4,17
1440807,74,1,0,5,2,17,1.0,-0.358240,-0.114487,1.215551,0.536422,2015,7,24
886145,74,3,0,3,2,24,1.0,-0.358240,-0.114487,-1.003519,-1.246331,2015,1,20
661938,74,2,0,1,1,75,1.0,4.174525,-0.114487,0.514792,-0.850164,2014,5,16


In [9]:
# Collapse the four encoded classes into a binary target with floor division:
# labels 0 and 1 become 0, labels 2 and 3 become 1.
# The labels are taken from `data_balanced` rather than from `y_balanced` itself,
# so re-running this cell gives the same result instead of halving the labels again.
y_balanced = data_balanced[target_var] // 2
y_balanced

1579489    0
1631860    0
341386     0
1235784    0
1184061    0
          ..
855283     1
1440807    1
886145     1
661938     1
405411     1
Name: law_cat_cd, Length: 4387312, dtype: int64

In [10]:
# Hold out 30% of the balanced rows for evaluation, with a fixed seed so the split is repeatable.
# NOTE(review): the rows were oversampled with replacement before this split, so
# duplicates of one row may sit in both partitions and inflate the test score.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.3,
    random_state=42,
)

# Logistic regression on the binary target, with a large iteration cap for the solver.
clf = LogisticRegression(random_state=42, max_iter=10000)
clf.fit(X_train, y_train)

In [11]:
# Mean accuracy of the fitted classifier on the held-out test split.
clf.score(X_test, y_test)

0.5953225740278408

In [12]:
# Predicted class labels for the test split, kept for the metric calculations that follow.
y_pred = clf.predict(X_test)

In [13]:
# Summarize test-set performance: the confusion counts plus a text report
# of per-class metrics.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)

In [14]:
# Display the confusion matrix (scikit-learn convention: rows are true labels, columns are predicted labels).
conf_matrix

array([[430087, 228004],
       [304630, 353473]])