<a href="https://colab.research.google.com/github/AlessandraParziale/Fairness-Thesis/blob/main/COMMUNITIES_AND_CRIME.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **COMMUNITIES AND CRIME**
La variabile target è il tasso pro capite di crimini violenti, calcolato utilizzando la popolazione e la somma dei reati considerati crimini violenti negli Stati Uniti (omicidio, stupro, rapina e aggressione).


## ***Classification***

### Loading the Data

In [1]:
import pandas as pd
import os
import requests

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder


In [2]:
# Directory in which the downloaded dataset for the classification section is cached.
DATA_DIR = "/content/CRIME_Classification"

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that another process could race against.
os.makedirs(DATA_DIR, exist_ok=True)

def ensure_download(url, fname):
    """
    Download ``url`` into ``DATA_DIR`` as ``fname`` unless that file already exists.

    Parameters
    ----------
    url : str
        Address the dataset is fetched from.
    fname : str
        Name of the file to create inside ``DATA_DIR``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.RequestException
        On connection problems or when the request times out.
    """
    fpath = f"{DATA_DIR}/{fname}"
    if os.path.isfile(fpath):
        # Already cached by a previous run: nothing to do.
        return

    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=60)
    # Fail loudly rather than caching an HTTP error page as if it were the
    # dataset (the cached file would never be re-downloaded afterwards).
    response.raise_for_status()

    with open(fpath, 'wb') as file:
        file.write(response.content)

def load_communities_crime(threshold=0.7):
    """
    Load the Communities and Crime data as a binary classification problem.

    The raw file is downloaded on first use, read without a header row
    (``'?'`` is treated as a missing value) and every row that contains a
    missing value is dropped.

    Parameters
    ----------
    threshold : float, default 0.7
        Rows whose value in column 127 is strictly greater than this are
        labelled 1; all other rows are labelled 0.

    Returns
    -------
    X : pandas.DataFrame
        Columns 0-126 of the cleaned data.
    y : pandas.Series
        0/1 labels derived from column 127, carrying the same row index as ``X``.
    """
    url = "http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data"
    fname = "communities_crime.csv"
    ensure_download(url, fname)

    fpath = f"{DATA_DIR}/{fname}"

    df = pd.read_csv(fpath, header=None, na_values='?').dropna()

    # NOTE(review): per the UCI dataset description the first five columns
    # (state, county, community, community name, fold) are non-predictive
    # identifiers -- confirm whether they are meant to be used as features.
    # Copy so that later column assignments on X work on an independent frame.
    X = df.iloc[:, :127].copy()

    # bool -> int already yields 0/1 labels, so no extra encoder is needed.
    # Keeping the Series (instead of rebuilding it from a bare array) preserves
    # the row labels, so X and y stay index-aligned for label-based operations.
    y = (df.iloc[:, 127] > threshold).astype(int)

    print(df)
    print("\nNumero di righe e colonne del DataFrame:")
    print(df.shape)

    return X, y


X, y = load_communities_crime()

      0      1        2                 3    4     5     6     7     8    \
16     36    1.0   1000.0        Albanycity    1  0.15  0.31  0.40  0.63   
23     19  193.0  93926.0     SiouxCitycity    1  0.11  0.43  0.04  0.89   
33     51  680.0  47672.0     Lynchburgcity    1  0.09  0.43  0.51  0.58   
68     34   23.0  58200.0    PerthAmboycity    1  0.05  0.59  0.23  0.39   
74      9    9.0  46520.0       Meridentown    1  0.08  0.39  0.08  0.85   
...   ...    ...      ...               ...  ...   ...   ...   ...   ...   
1880   34   39.0  40350.0        Lindencity   10  0.04  0.39  0.39  0.65   
1963   36   27.0  59641.0  Poughkeepsiecity   10  0.03  0.32  0.61  0.47   
1981    9    9.0  35650.0        Hamdentown   10  0.07  0.38  0.17  0.84   
1991    9    9.0  80070.0     Waterburytown   10  0.16  0.37  0.25  0.69   
1992   25   17.0  72600.0       Walthamcity   10  0.08  0.51  0.06  0.87   

       9    ...   118   119   120   121   122   123  124   125   126   127  
16    0.14

In [None]:
# Label-encode every categorical (object-dtype) feature column

label_encoder = LabelEncoder()
categorical_cols = X.select_dtypes(include=['object']).columns

for col in categorical_cols:
    # The encoder is re-fitted on each column, so codes are per-column.
    encoded_values = label_encoder.fit_transform(X[col])
    X[col] = encoded_values

print(X)

      0      1        2    3    4     5     6     7     8     9    ...   117  \
16     36    1.0   1000.0    0    1  0.15  0.31  0.40  0.63  0.14  ...  0.22   
23     19  193.0  93926.0   94    1  0.11  0.43  0.04  0.89  0.09  ...  0.29   
33     51  680.0  47672.0   52    1  0.09  0.43  0.51  0.58  0.04  ...  0.10   
68     34   23.0  58200.0   79    1  0.05  0.59  0.23  0.39  0.09  ...  0.31   
74      9    9.0  46520.0   58    1  0.08  0.39  0.08  0.85  0.04  ...  0.31   
...   ...    ...      ...  ...  ...   ...   ...   ...   ...   ...  ...   ...   
1880   34   39.0  40350.0   50   10  0.04  0.39  0.39  0.65  0.09  ...  0.32   
1963   36   27.0  59641.0   85   10  0.03  0.32  0.61  0.47  0.09  ...  0.44   
1981    9    9.0  35650.0   36   10  0.07  0.38  0.17  0.84  0.11  ...  0.25   
1991    9    9.0  80070.0  110   10  0.16  0.37  0.25  0.69  0.04  ...  0.25   
1992   25   17.0  72600.0  107   10  0.08  0.51  0.06  0.87  0.22  ...  0.19   

       118   119   120   121   122   12

---
### **Decision Tree**

In [None]:
# Hold out 20% of the rows for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)


In [None]:
# Model evaluation on the held-out test set

# Share of test samples predicted correctly
accuracy = accuracy_score(y_test, y_pred)
# Of the samples predicted positive, the fraction that is truly positive
precision = precision_score(y_test, y_pred)
# Of the truly positive samples, the fraction that was predicted positive
recall = recall_score(y_test, y_pred)
# Counts of true vs. predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)

print("Confusion Matrix Decision Tree:", conf_matrix)
print("Accuracy Decision Tree:", accuracy)
print("Precision Decision Tree:", precision)
print("Recall Decision Tree:", recall)

Confusion Matrix Decision Tree: [[22  1]
 [ 1  1]]
Accuracy Decision Tree: 0.92
Precision Decision Tree: 0.5
Recall Decision Tree: 0.5



---
### **Naïve Bayes**

In [None]:
# Same 80/20 split as the other classifiers (identical seed -> identical rows)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained
naive_bayes_classifier = GaussianNB().fit(X_train, y_train)
y_pred_nb = naive_bayes_classifier.predict(X_test)

In [None]:
# Model evaluation on the held-out test set

# Confusion matrix
conf_matrix_nb = confusion_matrix(y_test, y_pred_nb)

# Precision and recall are reported for every classifier so the models can be
# compared. zero_division=0 makes the score 0 (instead of emitting an
# undefined-metric warning) when the denominator is zero, e.g. when no sample
# is predicted positive.
precision_nb = precision_score(y_test, y_pred_nb, zero_division=0)
recall_nb = recall_score(y_test, y_pred_nb, zero_division=0)

# Accuracy
accuracy_nb = accuracy_score(y_test, y_pred_nb)

print("Confusion Matrix Naive Bayes:", conf_matrix_nb)
print("Accuracy Naive Bayes:", accuracy_nb)
print("Precision Naive Bayes:", precision_nb)
print("Recall Naive Bayes:", recall_nb)

Confusion Matrix Naive Bayes: [[22  1]
 [ 2  0]]
Accuracy Naive Bayes: 0.88



---
### **Logistic Regression**

In [None]:
# Same 80/20 split as the other classifiers (identical seed -> identical rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The 'sag' solver is stochastic: without a fixed random_state the fitted
# coefficients (and therefore the predictions) can differ between runs.
# The seed matches the one used for the other models in this notebook.
# NOTE(review): scikit-learn documents that 'sag' only converges quickly when
# features share approximately the same scale; the features here are not
# scaled -- consider standardising them before fitting.
logistic_regression = LogisticRegression(solver='sag', max_iter=1000, random_state=42)

logistic_regression.fit(X_train, y_train)

y_pred_lr = logistic_regression.predict(X_test)




In [None]:
# Model evaluation on the held-out test set

# Confusion matrix
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)

# Precision and recall are reported for every classifier so the models can be
# compared. zero_division=0 makes the score 0 (instead of emitting an
# undefined-metric warning) when the denominator is zero, e.g. when the model
# predicts no positive sample at all.
precision_lr = precision_score(y_test, y_pred_lr, zero_division=0)
recall_lr = recall_score(y_test, y_pred_lr, zero_division=0)

# Accuracy
accuracy_lr = accuracy_score(y_test, y_pred_lr)

print("Confusion Matrix Logistic Regression:", conf_matrix_lr)
print("Accuracy Logistic Regression:", accuracy_lr)
print("Precision Logistic Regression:", precision_lr)
print("Recall Logistic Regression:", recall_lr)

Confusion Matrix Logistic Regression: [[23  0]
 [ 2  0]]
Accuracy Logistic Regression: 0.92


## ***Regression***

### Loading the Data

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Directory in which the downloaded dataset for the regression section is cached.
DATA_DIR = "/content/CRIME_Regression"

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that another process could race against.
os.makedirs(DATA_DIR, exist_ok=True)

def ensure_download(url, fname):
    """
    Download ``url`` into ``DATA_DIR`` as ``fname`` unless that file already exists.

    Parameters
    ----------
    url : str
        Address the dataset is fetched from.
    fname : str
        Name of the file to create inside ``DATA_DIR``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.RequestException
        On connection problems or when the request times out.
    """
    fpath = f"{DATA_DIR}/{fname}"
    if os.path.isfile(fpath):
        # Already cached by a previous run: nothing to do.
        return

    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=60)
    # Fail loudly rather than caching an HTTP error page as if it were the
    # dataset (the cached file would never be re-downloaded afterwards).
    response.raise_for_status()

    with open(fpath, 'wb') as file:
        file.write(response.content)

def load_communities_crime():
    """
    Load the Communities and Crime data as a regression problem.

    The raw file is downloaded on first use, read without a header row
    (``'?'`` is treated as a missing value) and every row that contains a
    missing value is dropped. The cleaned frame, its shape and the target
    column are printed.

    Returns
    -------
    X : pandas.DataFrame
        Columns 0-126 of the cleaned data.
    y : pandas.Series
        Column 127 of the cleaned data, used as the regression target.
    """
    fname = "communities_crime.csv"
    url = "http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data"
    ensure_download(url, fname)

    # Read the cached file and discard incomplete rows in one expression
    df = pd.read_csv(f"{DATA_DIR}/{fname}", header=None, na_values='?').dropna()

    # Features: first 127 columns; target: the column at position 127
    X, y = df.iloc[:, :127], df.iloc[:, 127]

    print(df)
    print("\nNumero di righe e colonne del DataFrame:")
    print(df.shape)
    print("Variabile target:")
    print(y)

    return X, y


X, y = load_communities_crime()


      0      1        2                 3    4     5     6     7     8    \
16     36    1.0   1000.0        Albanycity    1  0.15  0.31  0.40  0.63   
23     19  193.0  93926.0     SiouxCitycity    1  0.11  0.43  0.04  0.89   
33     51  680.0  47672.0     Lynchburgcity    1  0.09  0.43  0.51  0.58   
68     34   23.0  58200.0    PerthAmboycity    1  0.05  0.59  0.23  0.39   
74      9    9.0  46520.0       Meridentown    1  0.08  0.39  0.08  0.85   
...   ...    ...      ...               ...  ...   ...   ...   ...   ...   
1880   34   39.0  40350.0        Lindencity   10  0.04  0.39  0.39  0.65   
1963   36   27.0  59641.0  Poughkeepsiecity   10  0.03  0.32  0.61  0.47   
1981    9    9.0  35650.0        Hamdentown   10  0.07  0.38  0.17  0.84   
1991    9    9.0  80070.0     Waterburytown   10  0.16  0.37  0.25  0.69   
1992   25   17.0  72600.0       Walthamcity   10  0.08  0.51  0.06  0.87   

       9    ...   118   119   120   121   122   123  124   125   126   127  
16    0.14

In [None]:
# Label-encode every categorical (object-dtype) feature column

label_encoder = LabelEncoder()
categorical_cols = X.select_dtypes(include=['object']).columns

for col in categorical_cols:
    # The encoder is re-fitted on each column, so codes are per-column.
    encoded_values = label_encoder.fit_transform(X[col])
    X[col] = encoded_values

print(X)


      0      1        2    3    4     5     6     7     8     9    ...   117  \
16     36    1.0   1000.0    0    1  0.15  0.31  0.40  0.63  0.14  ...  0.22   
23     19  193.0  93926.0   94    1  0.11  0.43  0.04  0.89  0.09  ...  0.29   
33     51  680.0  47672.0   52    1  0.09  0.43  0.51  0.58  0.04  ...  0.10   
68     34   23.0  58200.0   79    1  0.05  0.59  0.23  0.39  0.09  ...  0.31   
74      9    9.0  46520.0   58    1  0.08  0.39  0.08  0.85  0.04  ...  0.31   
...   ...    ...      ...  ...  ...   ...   ...   ...   ...   ...  ...   ...   
1880   34   39.0  40350.0   50   10  0.04  0.39  0.39  0.65  0.09  ...  0.32   
1963   36   27.0  59641.0   85   10  0.03  0.32  0.61  0.47  0.09  ...  0.44   
1981    9    9.0  35650.0   36   10  0.07  0.38  0.17  0.84  0.11  ...  0.25   
1991    9    9.0  80070.0  110   10  0.16  0.37  0.25  0.69  0.04  ...  0.25   
1992   25   17.0  72600.0  107   10  0.08  0.51  0.06  0.87  0.22  ...  0.19   

       118   119   120   121   122   12

---


### **Linear Regression**

In [None]:
# Hold out 20% of the rows for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained
model = LinearRegression().fit(X_train, y_train)

# Evaluate the model with the mean squared error (MSE) on the test set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error sui dati di test:", mse)




Mean Squared Error sui dati di test: 0.09093877179067214


---


### **Decision Tree**

In [None]:
# Same 80/20 split as the linear model (identical seed -> identical rows)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained
tree_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Evaluate the model with the mean squared error (MSE) on the test set
y_pred = tree_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.03628
