In [25]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import joblib

In [26]:
np.random.seed(42)
n_samples = 5000

data = {
    'age': np.random.randint(18, 60, n_samples),
    'previous_criminal_record': np.random.choice([0, 1], n_samples),
    'history_of_violence': np.random.choice([0, 1], n_samples),
    'alcohol_use': np.random.choice([0, 1], n_samples),
    'mental_health_issues': np.random.choice([0, 1], n_samples),
    'education_level': np.random.choice(['High School', 'College', 'None'], n_samples),
    'social_media_activity': np.random.choice([0, 1], n_samples),
    'income_stress': np.random.choice([0, 1], n_samples),
    'victim_relationship': np.random.choice(['Family', 'Friend', 'Stranger'], n_samples),
    'crime_committed': np.random.choice([0, 1], n_samples)  # 0 = Not likely, 1 = Likely to commit crime
}

In [None]:
df = pd.DataFrame(data)

print("Class distribution before balancing:")
df['crime_committed'].value_counts()

Class distribution before balancing:


crime_committed
0    2542
1    2458
Name: count, dtype: int64

In [None]:
label_enc = LabelEncoder()
df['education_level'] = label_enc.fit_transform(df['education_level'])
df['victim_relationship'] = label_enc.fit_transform(df['victim_relationship'])

In [29]:
X = df.drop(columns=['crime_committed'])
y = df['crime_committed']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [31]:
# Apply SMOTE to balance dataset
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [32]:
# Standardize features for MLP and KNN
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "NeuralNetwork": MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42)
}

In [None]:
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy of {name}: {accuracy:.4f}")
    print(classification_report(y_test, y_pred, zero_division=1))
    print("-" * 50)


Training RandomForest...
Accuracy of RandomForest: 0.5270
              precision    recall  f1-score   support

           0       0.54      0.51      0.52       509
           1       0.52      0.55      0.53       491

    accuracy                           0.53      1000
   macro avg       0.53      0.53      0.53      1000
weighted avg       0.53      0.53      0.53      1000

--------------------------------------------------
Training GradientBoosting...
Accuracy of GradientBoosting: 0.5000
              precision    recall  f1-score   support

           0       0.51      0.48      0.49       509
           1       0.49      0.52      0.51       491

    accuracy                           0.50      1000
   macro avg       0.50      0.50      0.50      1000
weighted avg       0.50      0.50      0.50      1000

--------------------------------------------------
Training LogisticRegression...
Accuracy of LogisticRegression: 0.4840
              precision    recall  f1-score   supp

In [35]:
joblib.dump(models["RandomForest"], "crime_prediction_model.pkl")

['crime_prediction_model.pkl']

In [None]:
import pandas as pd

df = pd.read_csv('communities.data', header=None)  
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.0,?,0.67
2,24,?,?,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,?,?,?,?,0.0,?,0.43
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,?,?,?,?,0.0,?,0.12
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,?,?,?,?,0.0,?,0.03
