In [451]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
import tensorflow as tf

# Caricamento del df 
df_train = pd.read_csv('dataset/train.csv')

Il dataset presenta alcune colonne ridondanti o inutili al fine di prevedere la mia variabile target, quali:
> Cross_Street

    (Già presente Area ID)

> Latitude & Longitude

    (Ridondante)

> Date_Reported

    (Irrilevante)

> Date_Occurred

    (Meglio l'ora)

> Area_Name

    (Già presente Area_ID)

> Reporting_District_no

    (Irrilevante  per l'indagine)

> Modus_Operandi

    (Codici senza descrizione, irrilevanti)

> Victim_Descent

    (Non utile al target)

> Victim_Sex

    (Non utile al target)

> Premise_Code

    (Teniamo solo la descrizione di questa variabile)

> Weapon_Used_Code

    (Teniamo la descrizione di questa variabile)

> Stato

    (Non occorre)

> Status_Description

    (Non serve)

In [452]:
# check su valori unici target per confermare l'inutilità di queste variabili
df_train['Crime_Category'].unique()

array(['Property Crimes', 'Violent Crimes', 'Other Crimes',
       'Crimes against Public Order', 'Fraud and White-Collar Crimes',
       'Crimes against Persons'], dtype=object)

In [453]:
# Elenco delle colonne da eliminare
cols_to_drop = [
    'Latitude',                 # Area ID
    'Longitude',                 # Area ID
    'Cross_Street',             # Ridondante con lat/long
    'Location',                 # Abbiamo latitudine, longitudine e area
    'Date_Reported',            # Effetto, non causa
    'Date_Occurred',            # Meglio sapere l'orario che il giorno
    'Area_Name',                  # Ridondante con Area ID
    'Reporting_District_no',    # Granulare 
    'Modus_Operandi',           # Difficile da interpretare
    'Victim_Descent',           # Ininfluente
    'Premise_Code',             # Ridondante con descrizione
    'Weapon_Used_Code',         # Ridondante con descrizione 
    'Status',                   # Leakage
    'Status_Description',       # Leakage
    'Victim_Sex'                # Irrilevante per le categorie presenti
]

# Rimozione dal DataFrame
df_train = df_train.drop(columns=cols_to_drop)

df_train.head(5)

Unnamed: 0,Time_Occurred,Area_ID,Part 1-2,Victim_Age,Premise_Description,Weapon_Description,Crime_Category
0,1800.0,15.0,1.0,75.0,STREET,,Property Crimes
1,1345.0,13.0,1.0,41.0,SWAP MEET,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",Property Crimes
2,605.0,13.0,2.0,67.0,SINGLE FAMILY DWELLING,,Property Crimes
3,1800.0,19.0,1.0,61.0,STREET,,Property Crimes
4,1130.0,12.0,1.0,0.0,MINI-MART,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",Property Crimes


## Sistemazione delle variabili

Controllo e gestione NaN

In [454]:
# Controlliamo quanti NaN ci sono ora
cols_to_check = [
    'Time_Occurred',
    'Area_ID',
    'Part 1-2',
    'Victim_Age',
    'Premise_Description',
    'Weapon_Description'
]

# Conta i NaN per ciascuna colonna
df_train[cols_to_check].isna().sum()



Time_Occurred              0
Area_ID                    0
Part 1-2                   0
Victim_Age                 0
Premise_Description        5
Weapon_Description     12665
dtype: int64

In [455]:
# E decido di far diventare Unknown i valori che non sono presenti per evitare di eliminare dati importanti
df_train['Premise_Description'] = df_train['Premise_Description'].fillna('Unknown')
df_train['Weapon_Description'] = df_train['Weapon_Description'].fillna('UNKNOWN WEAPON/OTHER WEAPON')

# Ricontrolliamo i NaN
check_nan = ['Premise_Description',
             'Weapon_Description']
df_train[check_nan].isna().sum()

Premise_Description    0
Weapon_Description     0
dtype: int64

Time Occurred raggruppamento in macrocategorie

In [456]:
# Andiamo a lavorare su Time_Occurred
def get_time_period(hour):
    if hour < 6:
        return 'Night'
    elif hour < 12:
        return 'Morning'
    elif hour < 18:
        return 'Afternoon'
    else:
        return 'Evening'

df_train['Time_Period'] = df_train['Time_Occurred'].apply(get_time_period)

# Ora elimino Time_Occurred
delete = ['Time_Occurred']
df_train = df_train.drop(columns= delete)

df_train.head(20)


Unnamed: 0,Area_ID,Part 1-2,Victim_Age,Premise_Description,Weapon_Description,Crime_Category,Time_Period
0,15.0,1.0,75.0,STREET,UNKNOWN WEAPON/OTHER WEAPON,Property Crimes,Evening
1,13.0,1.0,41.0,SWAP MEET,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",Property Crimes,Evening
2,13.0,2.0,67.0,SINGLE FAMILY DWELLING,UNKNOWN WEAPON/OTHER WEAPON,Property Crimes,Evening
3,19.0,1.0,61.0,STREET,UNKNOWN WEAPON/OTHER WEAPON,Property Crimes,Evening
4,12.0,1.0,0.0,MINI-MART,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",Property Crimes,Evening
5,11.0,2.0,50.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)","STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",Violent Crimes,Evening
6,4.0,1.0,0.0,LIQUOR STORE,UNKNOWN WEAPON/OTHER WEAPON,Property Crimes,Evening
7,14.0,1.0,68.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",UNKNOWN WEAPON/OTHER WEAPON,Property Crimes,Evening
8,9.0,1.0,0.0,DRIVEWAY,UNKNOWN WEAPON/OTHER WEAPON,Property Crimes,Evening
9,14.0,2.0,22.0,PARKING LOT,UNKNOWN WEAPON/OTHER WEAPON,Property Crimes,Evening


One Hot Encoding macrocategorie temporali

In [457]:
# One Hot Encoding del time_period
df_train = pd.get_dummies(df_train, columns=['Time_Period'], drop_first=True)

| Time\_Period\_Afternoon | Time\_Period\_Evening | Time\_Period\_Morning |
| ----------------------- | --------------------- | --------------------- |
| 0                       | 0                     | 1                     |
| 0                       | 0                     | 0                     |
| 1                       | 0                     | 0                     |
| 0                       | 1                     | 0                     |

(Funzionamento)

Raggruppamento macrocategorie luogo del delitto

In [458]:
# controlliamo quanti valori ci sono in premise description per capire come procedere
df_train['Premise_Description'].unique()

array(['STREET', 'SWAP MEET', 'SINGLE FAMILY DWELLING', 'MINI-MART',
       'MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)', 'LIQUOR STORE',
       'DRIVEWAY', 'PARKING LOT', 'YARD (RESIDENTIAL/BUSINESS)',
       'SIDEWALK', 'OTHER BUSINESS', 'VEHICLE, PASSENGER/TRUCK',
       'GARAGE/CARPORT', 'HOTEL', 'ABANDONED BUILDING ABANDONED HOUSE',
       "MOBILE HOME/TRAILERS/CONSTRUCTION TRAILERS/RV'S/MOTORHOME",
       'OTHER PREMISE', 'ALLEY', 'BANK', 'SHORT-TERM VACATION RENTAL',
       'OTHER STORE', 'WEBSITE', 'RESTAURANT/FAST FOOD',
       'PARKING UNDERGROUND/BUILDING', 'MTA BUS', 'SAVINGS & LOAN',
       "DIY CENTER (LOWE'S,HOME DEPOT,OSH,CONTRACTORS WAREHOUSE)",
       'DISCOUNT STORE (99 CENT,DOLLAR,ETC.', 'OTHER RESIDENCE',
       'BUS STOP', 'MOTEL', 'PORCH, RESIDENTIAL',
       "COFFEE SHOP (STARBUCKS, COFFEE BEAN, PEET'S, ETC.)",
       'PARK/PLAYGROUND', 'ELEMENTARY SCHOOL', 'UNDERPASS/BRIDGE*',
       'STORAGE SHED', 'GAS STATION', 'DEPARTMENT STORE',
       'JUNIOR HIGH SCHOO

Troppe categorie quindi le tratteremo così:

**Residential**	= 'SINGLE FAMILY DWELLING', 'APARTMENT', 'YARD', 'GARAGE', ...

**Commercial** = 'MARKET', 'LIQUOR STORE', 'RESTAURANT', 'BANK', ...

**Transportation** = 'BUS', 'STATION', 'MTA', 'METROLINK', ...

**Public_Space** = 'STREET', 'SIDEWALK', 'PARK', 'ALLEY', ...

**Education** = 'SCHOOL', 'COLLEGE', 'UNIVERSITY', ...

**Healthcare** = 'HOSPITAL', 'DENTAL', 'CLINIC', ...

**Other** = 'Unknown', 'OTHER'

In [459]:
# Funzione per raggruppare descrizioni del luogo del crimine in macro-categorie
def simplify_premise(premise):
    # Se la descrizione contiene parole che indicano ambienti residenziali...
    if 'DWELLING' in premise or 'HOUSE' in premise or 'HOME' in premise or 'RESIDENCE' in premise:
        return 'Residential'  # ...classifica come "Residential"
    
    # Se contiene parole legate al commercio...
    elif 'STORE' in premise or 'MARKET' in premise or 'MALL' in premise:
        return 'Commercial'
    
    # Se si riferisce a scuole o università...
    elif 'SCHOOL' in premise or 'UNIVERSITY' in premise:
        return 'Education'
    
    # Se si tratta di strutture sanitarie...
    elif 'HOSPITAL' in premise or 'DENTAL' in premise or 'CLINIC' in premise:
        return 'Healthcare'
    
    # Se riguarda mezzi di trasporto o stazioni...
    elif 'BUS' in premise or 'STATION' in premise or 'TRAIN' in premise or 'MTA' in premise:
        return 'Transportation'
    
    # Se si tratta di luoghi pubblici generici come strade, parchi, marciapiedi...
    elif 'STREET' in premise or 'SIDEWALK' in premise or 'PARK' in premise or 'LOT' in premise:
        return 'Public_Space'
    
    # Se contiene "UNKNOWN" o "OTHER", cioè casi non specificati...
    elif 'UNKNOWN' in premise or 'OTHER' in premise:
        return 'Other'
    
    # Per tutti gli altri casi non gestiti sopra, classifica comunque come "Other"
    else:
        return 'Other'

# Applica la funzione a ogni valore della colonna 'Premise_Description'
# e crea una nuova colonna 'Premise_Group' con il risultato
df_train['Premise_Group'] = df_train['Premise_Description'].apply(simplify_premise)

#elimina 'Premise_description'
del2 = ['Premise_Description']
df_train = df_train.drop(columns = del2)


One Hot Encoding macrocategorie luogo del delitto

In [460]:
# Una volta fatto questo facciamo One Hot Encoding anche su queste dato che sono < 10 ed il OHE è consigliato
df_train = pd.get_dummies(df_train, columns=['Premise_Group'], drop_first=True)

#check
df_train.head(5)

Unnamed: 0,Area_ID,Part 1-2,Victim_Age,Weapon_Description,Crime_Category,Time_Period_Evening,Time_Period_Morning,Time_Period_Night,Premise_Group_Education,Premise_Group_Healthcare,Premise_Group_Other,Premise_Group_Public_Space,Premise_Group_Residential,Premise_Group_Transportation
0,15.0,1.0,75.0,UNKNOWN WEAPON/OTHER WEAPON,Property Crimes,True,False,False,False,False,False,True,False,False
1,13.0,1.0,41.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",Property Crimes,True,False,False,False,False,True,False,False,False
2,13.0,2.0,67.0,UNKNOWN WEAPON/OTHER WEAPON,Property Crimes,True,False,False,False,False,False,False,True,False
3,19.0,1.0,61.0,UNKNOWN WEAPON/OTHER WEAPON,Property Crimes,True,False,False,False,False,False,True,False,False
4,12.0,1.0,0.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",Property Crimes,True,False,False,False,False,True,False,False,False


Raggruppamento macrocategorie descrizione arma

In [461]:
# controlliamo quanti valori ci sono in weapon description per capire come procedere
df_train['Weapon_Description'].unique()

array(['UNKNOWN WEAPON/OTHER WEAPON',
       'STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)', 'VERBAL THREAT',
       'OTHER KNIFE', 'HAND GUN', 'VEHICLE', 'FIRE', 'PIPE/METAL PIPE',
       'KNIFE WITH BLADE 6INCHES OR LESS', 'BLUNT INSTRUMENT', 'CLUB/BAT',
       'SEMI-AUTOMATIC PISTOL', 'ROCK/THROWN OBJECT', 'MACHETE',
       'UNKNOWN FIREARM', 'AIR PISTOL/REVOLVER/RIFLE/BB GUN', 'TOY GUN',
       'FIXED OBJECT', 'UNKNOWN TYPE CUTTING INSTRUMENT', 'FOLDING KNIFE',
       'HAMMER', 'PHYSICAL PRESENCE', 'MACE/PEPPER SPRAY',
       'OTHER CUTTING INSTRUMENT', 'BOARD', 'BOTTLE', 'KITCHEN KNIFE',
       'RIFLE', 'KNIFE WITH BLADE OVER 6 INCHES IN LENGTH', 'SCREWDRIVER',
       'STICK', 'SIMULATED GUN', 'BELT FLAILING INSTRUMENT/CHAIN',
       'CONCRETE BLOCK/BRICK', 'AXE', 'ICE PICK', 'REVOLVER',
       'OTHER FIREARM', 'SCISSORS', 'STARTER PISTOL/REVOLVER', 'GLASS',
       'SHOTGUN', 'BRASS KNUCKLES', 'SWITCH BLADE', 'TIRE IRON',
       'SAWED OFF RIFLE/SHOTGUN', 'CAUSTIC CHEMICAL/POISO

Anche qui, meglio raggruppare:

**Unknown** = 'UNKNOWN WEAPON/OTHER WEAPON', 'UNKNOWN FIREARM', 'UNKNOWN TYPE CUTTING INSTRUMENT'

**Bodily Force** ='STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)', 'VERBAL THREAT', 'PHYSICAL PRESENCE'

**Firearm** = 'HAND GUN', 'SEMI-AUTOMATIC PISTOL', 'REVOLVER', 'RIFLE', 'SHOTGUN', 'TOY GUN', 'AIR PISTOL/REVOLVER/RIFLE/BB GUN', 'ASSAULT WEAPON/UZI/AK47/ETC', 'SAWED OFF RIFLE/SHOTGUN', 'STARTER PISTOL/REVOLVER', 'OTHER FIREARM', 'SIMULATED GUN', 'HECKLER & KOCH 93 SEMIAUTOMATIC ASSAULT RIFLE'

**Cutting Weapon** = 'KNIFE WITH BLADE 6INCHES OR LESS', 'FOLDING KNIFE', 'KITCHEN KNIFE', 'SWITCH BLADE', 'BOWIE KNIFE', 'CLEAVER', 'OTHER KNIFE', 'OTHER CUTTING INSTRUMENT', 'SCISSORS', 'RAZOR BLADE', 'ICE PICK', 'SAWED OFF RIFLE/SHOTGUN'

**Blunt Object** = 'CLUB/BAT', 'PIPE/METAL PIPE', 'BLUNT INSTRUMENT', 'ROCK/THROWN OBJECT', 'TIRE IRON', 'BOARD', 'BOTTLE', 'HAMMER', 'BRASS KNUCKLES', 'STICK', 'BELT FLAILING INSTRUMENT/CHAIN', 'SCREWDRIVER', 'GLASS', 'CONCRETE BLOCK/BRICK', 'MARTIAL ARTS WEAPONS'

**Chemical/Other** = 'MACE/PEPPER SPRAY', 'CAUSTIC CHEMICAL/POISON', 'SCALDING LIQUID', 'DEMAND NOTE', 'BOMB THREAT', 'STUN GUN', 'DEMAND NOTE'

**Vehicle/Fixed** = 'VEHICLE', 'FIXED OBJECT'

In [462]:
# Stessa formula di prima
def simplify_weapon(weapon):
    if pd.isna(weapon):
        return 'Unknown'
    weapon = weapon.upper()
    
    if 'UNKNOWN' in weapon or 'OTHER' in weapon:
        return 'Unknown'
    elif any(x in weapon for x in ['FIST', 'FEET', 'BODILY FORCE', 'VERBAL', 'PHYSICAL']):
        return 'Bodily Force'
    elif any(x in weapon for x in ['GUN', 'PISTOL', 'RIFLE', 'REVOLVER', 'FIREARM', 'ASSAULT']):
        return 'Firearm'
    elif any(x in weapon for x in ['KNIFE', 'BLADE', 'SCISSORS', 'ICE PICK', 'RAZOR', 'CLEAVER']):
        return 'Cutting Weapon'
    elif any(x in weapon for x in ['PIPE', 'BAT', 'ROCK', 'BLUNT', 'BOARD', 'BOTTLE', 'HAMMER', 'STICK', 'SCREWDRIVER', 'GLASS', 'CONCRETE', 'CHAIN']):
        return 'Blunt Object'
    elif any(x in weapon for x in ['MACE', 'PEPPER SPRAY', 'CHEMICAL', 'SCALDING', 'BOMB', 'STUN', 'DEMAND']):
        return 'Chemical/Other'
    elif any(x in weapon for x in ['VEHICLE', 'FIXED OBJECT']):
        return 'Vehicle/Fixed'
    else:
        return 'Other'

df_train['Weapon_Group'] = df_train['Weapon_Description'].apply(simplify_weapon)
del3 = 'Weapon_Description'
df_train = df_train.drop(columns=del3)



One Hot Encoding macrocategorie descrizione arma

In [463]:
# Anche qui One Hot Encoding
df_train = pd.get_dummies(df_train, columns=['Weapon_Group'], drop_first=True)

#check
df_train.head(5)

Unnamed: 0,Area_ID,Part 1-2,Victim_Age,Crime_Category,Time_Period_Evening,Time_Period_Morning,Time_Period_Night,Premise_Group_Education,Premise_Group_Healthcare,Premise_Group_Other,Premise_Group_Public_Space,Premise_Group_Residential,Premise_Group_Transportation,Weapon_Group_Bodily Force,Weapon_Group_Chemical/Other,Weapon_Group_Cutting Weapon,Weapon_Group_Firearm,Weapon_Group_Other,Weapon_Group_Unknown,Weapon_Group_Vehicle/Fixed
0,15.0,1.0,75.0,Property Crimes,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
1,13.0,1.0,41.0,Property Crimes,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False
2,13.0,2.0,67.0,Property Crimes,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False
3,19.0,1.0,61.0,Property Crimes,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
4,12.0,1.0,0.0,Property Crimes,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False


Gestione Età

In [464]:
# Vediamo quanto è grande il dataset per capire quante età = 0 ci sono
print(df_train.shape)

# E ora vediamo le età
print(df_train[df_train['Victim_Age'] == 0].shape[0] )

(20000, 20)
4828


In [465]:
# Facciamo due conti
4828 / 20000 * 100

24.14

In [466]:
# Se le eliminiamo perdiamo il 24% dei dati, non conviene, usiamo la tecnica dei quantili
# Prima rendo int tutti i float
float_cols = df_train.select_dtypes(include='float').columns
df_train[float_cols] = df_train[float_cols].astype(int)

# Isolo le età diverse da 0 per analizzarle
age_non_zero = df_train[df_train['Victim_Age'] > 0]['Victim_Age']

# Calcolo quartili
q1 = age_non_zero.quantile(0.25)
q2 = age_non_zero.quantile(0.50)  # Mediana
q3 = age_non_zero.quantile(0.75)

# Funzione per assegnare valori realistici randomici
def generate_age():
    r = np.random.rand()
    if r < 0.25:
        return np.random.randint(0, q1)
    elif r < 0.5:
        return np.random.randint(q1, q2)
    elif r < 0.75:
        return np.random.randint(q2, q3)
    else:
        return np.random.randint(q3, age_non_zero.max())

# Applica la funzione solo alle righe con età = 0
df_train.loc[df_train['Victim_Age'] == 0, 'Victim_Age'] = df_train[df_train['Victim_Age'] == 0]['Victim_Age'].apply(lambda x: generate_age())

#check
df_train.head(20)

Unnamed: 0,Area_ID,Part 1-2,Victim_Age,Crime_Category,Time_Period_Evening,Time_Period_Morning,Time_Period_Night,Premise_Group_Education,Premise_Group_Healthcare,Premise_Group_Other,Premise_Group_Public_Space,Premise_Group_Residential,Premise_Group_Transportation,Weapon_Group_Bodily Force,Weapon_Group_Chemical/Other,Weapon_Group_Cutting Weapon,Weapon_Group_Firearm,Weapon_Group_Other,Weapon_Group_Unknown,Weapon_Group_Vehicle/Fixed
0,15,1,75,Property Crimes,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
1,13,1,41,Property Crimes,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False
2,13,2,67,Property Crimes,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False
3,19,1,61,Property Crimes,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
4,12,1,4,Property Crimes,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False
5,11,2,50,Violent Crimes,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False
6,4,1,81,Property Crimes,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
7,14,1,68,Property Crimes,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False
8,9,1,68,Property Crimes,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False
9,14,2,22,Property Crimes,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False


In [467]:
# Adesso converto tutti i true e false in 1 e 0
bool_cols = df_train.select_dtypes(include='bool').columns
df_train[bool_cols] = df_train[bool_cols].astype(int)

# check
df_train.head(20)

Unnamed: 0,Area_ID,Part 1-2,Victim_Age,Crime_Category,Time_Period_Evening,Time_Period_Morning,Time_Period_Night,Premise_Group_Education,Premise_Group_Healthcare,Premise_Group_Other,Premise_Group_Public_Space,Premise_Group_Residential,Premise_Group_Transportation,Weapon_Group_Bodily Force,Weapon_Group_Chemical/Other,Weapon_Group_Cutting Weapon,Weapon_Group_Firearm,Weapon_Group_Other,Weapon_Group_Unknown,Weapon_Group_Vehicle/Fixed
0,15,1,75,Property Crimes,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,13,1,41,Property Crimes,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
2,13,2,67,Property Crimes,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
3,19,1,61,Property Crimes,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
4,12,1,4,Property Crimes,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
5,11,2,50,Violent Crimes,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
6,4,1,81,Property Crimes,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
7,14,1,68,Property Crimes,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
8,9,1,68,Property Crimes,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
9,14,2,22,Property Crimes,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


Label Encoding Variabile Target

In [468]:
le = LabelEncoder()
df_train['Crime_Category_Encoded'] = le.fit_transform(df_train['Crime_Category'])

#check numero e nomi classi
for i, label in enumerate(le.classes_):
    print(f"{i} = {label}")

0 = Crimes against Persons
1 = Crimes against Public Order
2 = Fraud and White-Collar Crimes
3 = Other Crimes
4 = Property Crimes
5 = Violent Crimes


### Salvataggio df

In [469]:
# Rinomino il df
df = df_train.copy()

# E lo salvo come CSV nella cartella dataset
df.to_csv('dataset/df_clean.csv', index=False)