# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the raw CSV and clean it in one pass: give the community-name header
# (which carries a stray leading 'Ê') a readable label, then turn every '?'
# placeholder cell into the string '0'.
df = (
    pd.read_csv('crimedata.csv', sep=',', encoding='ISO-8859-1')
    .rename(columns={'Êcommunityname': 'Community Name'})
    .replace('?', '0')
)

def extractSubstring(myStr):
    """Convert a raw cell value to a float, averaging "low-high" ranges.

    Parameters
    ----------
    myStr : str or number
        Either a plain number (thousands separators allowed, e.g. "1,234"),
        a range written as "low-high" (e.g. "10-20"), or a value that is
        already numeric.

    Returns
    -------
    float
        The parsed number, or the midpoint of the range.

    Raises
    ------
    ValueError
        If the text is neither a number nor a two-part "low-high" range.
    """
    # Cells that were already parsed as numbers have no text to clean.
    if not isinstance(myStr, str):
        return float(myStr)

    text = myStr.strip().replace(",", "")

    # Try a plain number first so that a leading minus sign ("-5") or an
    # exponent ("1e-5") is not mistaken for a range separator.
    try:
        return float(text)
    except ValueError:
        pass

    parts = text.split("-")
    if len(parts) != 2:
        raise ValueError(
            f"cannot parse {myStr!r} as a number or a 'low-high' range"
        )

    try:
        # Drop everything except digits and the decimal point (units,
        # currency symbols, whitespace) before converting each bound.
        lowVal, hiVal = (float(re.sub(r'[^\d.]', '', part)) for part in parts)
    except ValueError:
        raise ValueError(
            f"cannot parse {myStr!r} as a number or a 'low-high' range"
        ) from None

    return (lowVal + hiVal) / 2

# Turn the two text-encoded columns into floats (ranges become midpoints).
df['PolicReqPerOffic'] = df['PolicReqPerOffic'].map(extractSubstring)
df['ViolentCrimesPerPop'] = df['ViolentCrimesPerPop'].map(extractSubstring)

# Label Creation: '1' when a row's violent-crime rate is at or above the
# dataset mean, '0' otherwise (stored as strings).
violent_crimes = df['ViolentCrimesPerPop'].astype(float)
violent_crimes_mean = violent_crimes.mean()
df['violent_crime_occurence'] = np.where(
    violent_crimes.ge(violent_crimes_mean),
    '1',
    '0',
)

# Clustering inputs: two feature columns as a 2-D array, plus the binary
# label as floats for later comparison against the cluster assignments.
features = ['householdsize', 'racepctblack']
X = df[features].to_numpy()
y = df['violent_crime_occurence'].astype(float).to_numpy()

# K-Means with two clusters; y_pred holds the cluster id of every row.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
y_pred = kmeans.predict(X)

# Gaussian mixture with three components on the same features.
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(X)
gmm_labels = gmm.predict(X)

# Linear Regression: predict the violent-crime rate from the unemployment
# percentage alone, holding out 20% of the rows for testing.
X_lr = df[['PctUnemployed']].astype(float).to_numpy()
y_lr = df['ViolentCrimesPerPop'].astype(float).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X_lr,
    y_lr,
    test_size=0.2,
    random_state=42,
)
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred_lr = lin_reg.predict(X_test)

# Logistic Regression: classify the binary crime label from the age-band
# percentages and the unemployment percentage.
X_log = df[
    [
        'agePct12t21',
        'agePct12t29',
        'agePct16t24',
        'agePct65up',
        'PctUnemployed',
    ]
].astype(float)
y_log = df['violent_crime_occurence'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X_log,
    y_log,
    test_size=0.2,
    random_state=42,
)
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Decision Tree: five features, same binary label, 80/20 split.
X_tree = df[
    [
        'population',
        'householdsize',
        'medIncome',
        'PctUnemployed',
        'PolicReqPerOffic',
    ]
]
y_tree = df['violent_crime_occurence'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X_tree,
    y_tree,
    test_size=0.2,
    random_state=42,
)
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=20,
    min_samples_split=9,
    min_samples_leaf=6,
    random_state=42,
).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Random Forest: trained and evaluated on the same split as the tree.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# SVM
# Split BEFORE scaling: fitting the scaler on the full matrix would let the
# mean/variance of the held-out rows leak into training. The same
# test_size/random_state is used as before, so the same rows are held out.
X_train, X_test, y_train, y_test = train_test_split(X_tree, y_tree, test_size=0.2, random_state=42)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # learn scaling from training rows only
X_test = sc.transform(X_test)        # apply the training-set scaling
# Standardised copy of the full feature matrix, kept under its original name
# for any code that still refers to it; it uses the training-set statistics.
X_svm = sc.transform(X_tree)
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Naive Bayes (trained and evaluated on the same scaled split as the SVM)
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# PCA
# PCA picks directions of maximum variance, so it is sensitive to the scale
# of each column. The X_tree columns are presumably on very different scales
# (counts, currency, percentages), so standardise them first; otherwise the
# components mostly follow whichever column has the largest raw variance.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(StandardScaler().fit_transform(X_tree))

# Accuracy Metrics
def print_metrics(model_name, y_test, y_pred):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Each score goes on its own line, prefixed with ``model_name`` and
    formatted to two decimals. Precision, recall and F1 are computed with
    ``average='binary'``. Nothing is returned.

    Parameters
    ----------
    model_name : str
        Label shown in front of each score.
    y_test : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels to compare against ``y_test``.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")

    precision = precision_score(y_test, y_pred, average='binary')
    print(f"{model_name} Precision: {precision:.2f}")

    recall = recall_score(y_test, y_pred, average='binary')
    print(f"{model_name} Recall: {recall:.2f}")

    f1 = f1_score(y_test, y_pred, average='binary')
    print(f"{model_name} F1 Score: {f1:.2f}")

    matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{matrix}\n")

# K-Means numbers its two clusters arbitrarily, so the raw ids in y_pred may
# be the mirror image of the class labels in y. Score whichever of the two
# possible cluster-to-class mappings agrees with y more often; y_pred itself
# is left untouched.
kmeans_pred = y_pred if np.mean(y_pred == y) >= 0.5 else 1 - y_pred
print_metrics("K-Means", y, kmeans_pred)

# NOTE: y_test is whatever the most recent train_test_split left behind.
# Every classifier split above uses the same label column, test_size and
# random_state, so it should line up with each y_pred_* below; keep those
# settings in sync if any split is changed.
print_metrics("Logistic Regression", y_test, y_pred_log)
print_metrics("Decision Tree", y_test, y_pred_dt)
print_metrics("Random Forest", y_test, y_pred_rf)
print_metrics("SVM", y_test, y_pred_svm)
print_metrics("Naive Bayes", y_test, y_pred_nb)