In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


In [2]:
#Load Data
crime_data = pd.read_csv('crime.csv')

In [3]:
crime_data.columns = crime_data.columns.str.lower()

In [4]:
#Define the offense category mapping
offense_category_id = {
    "larceny": 1,
    "public-disorder": 2,
    "theft-from-motor-vehicle": 3,
    "auto-theft": 4,
    "drug-alcohol": 5,
    "burglary": 6,
    "other-crimes-against-persons": 7,
    "aggravated-assault": 8,
    "white-collar-crime": 9,
    "sexual-assault": 10,
    "robbery": 11,
    "arson": 12,
    "murder": 13,
    "traffic-accident": 14,
    "all-other-crimes": 15
}
  

In [5]:
#Replace offense categories with mapped intergers
crime_data["crime_id"] = crime_data["offense_category_id"].replace(offense_category_id)

In [6]:
# Define columns to drop after PCA
columns_to_drop = ['offense_id', 'geo_x', 'geo_y', 'last_occurrence_date', 'incident_address', 'offense_category_id']

In [7]:
# Drop rows with missing values in columns_to_drop
crime_data.dropna(subset=columns_to_drop, inplace=True)

In [8]:
# Drop non-numeric columns before scaling
crime_data_numeric = crime_data.drop(columns=columns_to_drop)

In [9]:
# Check the data types of columns and drop non-numeric columns
non_numeric_columns = crime_data_numeric.select_dtypes(exclude=[np.number]).columns
if len(non_numeric_columns) > 0:
    print("Non-numeric columns found. Dropping them.")
    print("Dropped columns:", non_numeric_columns)
    crime_data_numeric = crime_data_numeric.select_dtypes(include=[np.number])

Non-numeric columns found. Dropping them.
Dropped columns: Index(['offense_type_id', 'first_occurrence_date', 'reported_date',
       'neighborhood_id'],
      dtype='object')


In [10]:
# Standardize the numeric data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(crime_data_numeric)

In [11]:
# Apply PCA
pca = PCA(n_components=0.95)  # Retain components explaining 95% of the variance
transformed_data = pca.fit_transform(scaled_data)

In [12]:
# Create DataFrame with transformed data and crime_id column
columns_names = [f"PC{i+1}" for i in range(transformed_data.shape[1])]
transformed_df = pd.DataFrame(transformed_data, columns=columns_names)
transformed_df['crime_id'] = crime_data['crime_id'].values

In [13]:
# Final DataFrame after dropping columns and rows
final_df = transformed_df

In [14]:
# Split the data for training and testing
X = final_df.drop(columns='crime_id')
y = final_df['crime_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Train Random Forest classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

In [16]:
# Predictions
y_pred = rfc.predict(X_test)

In [17]:
# Evaluate the model
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')


In [18]:
print("Accuracy:", accuracy * 100)
print("Precision:", precision * 100)
print("Recall:", recall * 100)
print("F1 Score:", f1_score * 100)

Accuracy: 99.99104156712852
Precision: 99.99104350980224
Recall: 99.99104156712852
F1 Score: 99.9910404144529


In [43]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


In [46]:
# Load the data
crime_data = pd.read_csv('crime.csv')
crime_data.columns = crime_data.columns.str.lower()

In [47]:
# Define the offense category mapping
offense_category_id = {
    "larceny": 1,
    "public-disorder": 2,
    "theft-from-motor-vehicle": 3,
    "auto-theft": 4,
    "drug-alcohol": 5,
    "burglary": 6,
    "other-crimes-against-persons": 7,
    "aggravated-assault": 8,
    "white-collar-crime": 9,
    "sexual-assault": 10,
    "robbery": 11,
    "arson": 12,
    "murder": 13,
    "traffic-accident": 14,
    "all-other-crimes": 15
}

In [48]:
# Replace offense categories with mapped integers
crime_data["crime_id"] = crime_data["offense_category_id"].replace(offense_category_id)

In [49]:
# Define columns to drop after PCA
columns_to_drop = ['offense_id', 'geo_x', 'geo_y', 'last_occurrence_date', 'incident_address', 'offense_category_id']

In [50]:
# Drop rows with missing values in columns_to_drop
crime_data.dropna(subset=columns_to_drop, inplace=True)

In [51]:
# Drop non-numeric columns before scaling
crime_data_numeric = crime_data.drop(columns=columns_to_drop)

In [52]:
# Check the data types of columns and drop non-numeric columns
non_numeric_columns = crime_data_numeric.select_dtypes(exclude=[np.number]).columns
if len(non_numeric_columns) > 0:
    print("Non-numeric columns found. Dropping them.")
    print("Dropped columns:", non_numeric_columns)
    crime_data_numeric = crime_data_numeric.select_dtypes(include=[np.number])

Non-numeric columns found. Dropping them.
Dropped columns: Index(['offense_type_id', 'first_occurrence_date', 'reported_date',
       'neighborhood_id'],
      dtype='object')


In [53]:
# Standardize the numeric data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(crime_data_numeric)

In [54]:
# Apply PCA
pca = PCA(n_components=0.95)  # Retain components explaining 95% of the variance
transformed_data = pca.fit_transform(scaled_data)

In [55]:
# Create DataFrame with transformed data and crime_id column
columns_names = [f"PC{i+1}" for i in range(transformed_data.shape[1])]
transformed_df = pd.DataFrame(transformed_data, columns=columns_names)
transformed_df['crime_id'] = crime_data['crime_id'].values

In [56]:
# Final DataFrame after dropping columns and rows
final_df = transformed_df

In [57]:
# Split the data for training and testing
X = final_df.drop(columns='crime_id')
y = final_df['crime_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
# Define the TensorFlow model with dropout layers
def create_model(dropout_rate=0.2):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(dropout_rate),
        Dense(64, activation='relu'),
        Dropout(dropout_rate),
        Dense(15, activation='softmax')  # Assuming 15 crime categories
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [59]:
# Wrap Keras model for use with scikit-learn
model = KerasClassifier(build_fn=create_model, verbose=0)

  model = KerasClassifier(build_fn=create_model, verbose=0)


In [60]:
# Define hyperparameters grid for grid search
param_grid = {
    'dropout_rate': [0.1, 0.2, 0.3],
    'batch_size': [32, 64],
    'epochs': [10, 15]
}


In [61]:
# Perform grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=2)
grid_result = grid.fit(X_train, y_train)



Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END .........batch_size=32, dropout_rate=0.1, epochs=10; total time=  59.6s
[CV] END .........batch_size=32, dropout_rate=0.1, epochs=10; total time= 1.0min
[CV] END .........batch_size=32, dropout_rate=0.1, epochs=10; total time= 1.0min
[CV] END .........batch_size=32, dropout_rate=0.1, epochs=15; total time= 1.5min
[CV] END .........batch_size=32, dropout_rate=0.1, epochs=15; total time= 1.5min
[CV] END .........batch_size=32, dropout_rate=0.1, epochs=15; total time= 1.5min
[CV] END .........batch_size=32, dropout_rate=0.2, epochs=10; total time= 1.0min
[CV] END .........batch_size=32, dropout_rate=0.2, epochs=10; total time= 1.0min
[CV] END .........batch_size=32, dropout_rate=0.2, epochs=10; total time= 1.0min
[CV] END .........batch_size=32, dropout_rate=0.2, epochs=15; total time= 1.5min
[CV] END .........batch_size=32, dropout_rate=0.2, epochs=15; total time= 1.5min
[CV] END .........batch_size=32, dropout_rate=0.

In [62]:
# Print best hyperparameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.999858 using {'batch_size': 32, 'dropout_rate': 0.2, 'epochs': 15}


In [63]:
# Evaluate the best model
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_test)



In [64]:
# Evaluate the model
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')

In [65]:
print("Accuracy:", accuracy * 100)
print("Precision:", precision * 100)
print("Recall:", recall * 100)
print("F1 Score:", f1_score * 100)

Accuracy: 99.99402771141902
Precision: 99.99402902969257
Recall: 99.99402771141902
F1 Score: 99.99402657212103
