# **LSTM model**

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset CSV
# is read from that mount point further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
import tensorflow_hub as hub
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the crime-report CSV from the mounted Drive and preview the first rows.
csv_path = "/content/drive/MyDrive/Crime_Data_from_2020_to_Present.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,10304468,01/08/2020 12:00:00 AM,01/08/2020 12:00:00 AM,2230,3,Southwest,377,2,624,BATTERY - SIMPLE ASSAULT,...,AO,Adult Other,624.0,,,,1100 W 39TH PL,,34.0141,-118.2978
1,190101086,01/02/2020 12:00:00 AM,01/01/2020 12:00:00 AM,330,1,Central,163,2,624,BATTERY - SIMPLE ASSAULT,...,IC,Invest Cont,624.0,,,,700 S HILL ST,,34.0459,-118.2545
2,200110444,04/14/2020 12:00:00 AM,02/13/2020 12:00:00 AM,1200,1,Central,155,2,845,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,...,AA,Adult Arrest,845.0,,,,200 E 6TH ST,,34.0448,-118.2474
3,191501505,01/01/2020 12:00:00 AM,01/01/2020 12:00:00 AM,1730,15,N Hollywood,1543,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),...,IC,Invest Cont,745.0,998.0,,,5400 CORTEEN PL,,34.1685,-118.4019
4,191921269,01/01/2020 12:00:00 AM,01/01/2020 12:00:00 AM,415,19,Mission,1998,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",...,IC,Invest Cont,740.0,,,,14400 TITUS ST,,34.2198,-118.4468


In [None]:
# Summarise dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 843514 entries, 0 to 843513
Data columns (total 28 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DR_NO           843514 non-null  int64  
 1   Date Rptd       843514 non-null  object 
 2   DATE OCC        843514 non-null  object 
 3   TIME OCC        843514 non-null  int64  
 4   AREA            843514 non-null  int64  
 5   AREA NAME       843514 non-null  object 
 6   Rpt Dist No     843514 non-null  int64  
 7   Part 1-2        843514 non-null  int64  
 8   Crm Cd          843514 non-null  int64  
 9   Crm Cd Desc     843514 non-null  object 
 10  Mocodes         726560 non-null  object 
 11  Vict Age        843514 non-null  int64  
 12  Vict Sex        732198 non-null  object 
 13  Vict Descent    732190 non-null  object 
 14  Premis Cd       843504 non-null  float64
 15  Premis Desc     843004 non-null  object 
 16  Weapon Used Cd  293900 non-null  float64
 17  Weapon Des

In [None]:
# Keep only the rows whose target column ('Vict Descent') is present, then
# re-check the non-null counts of the remaining frame.
has_label = df['Vict Descent'].notna()
df = df[has_label]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 732190 entries, 0 to 843513
Data columns (total 28 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DR_NO           732190 non-null  int64  
 1   Date Rptd       732190 non-null  object 
 2   DATE OCC        732190 non-null  object 
 3   TIME OCC        732190 non-null  int64  
 4   AREA            732190 non-null  int64  
 5   AREA NAME       732190 non-null  object 
 6   Rpt Dist No     732190 non-null  int64  
 7   Part 1-2        732190 non-null  int64  
 8   Crm Cd          732190 non-null  int64  
 9   Crm Cd Desc     732190 non-null  object 
 10  Mocodes         726099 non-null  object 
 11  Vict Age        732190 non-null  int64  
 12  Vict Sex        732186 non-null  object 
 13  Vict Descent    732190 non-null  object 
 14  Premis Cd       732189 non-null  float64
 15  Premis Desc     731690 non-null  object 
 16  Weapon Used Cd  293759 non-null  float64
 17  Weapon Des

In [None]:
# Partition the column names by dtype: object-dtype columns on one side,
# everything else (treated as numeric) on the other.
object_cols, num_cols = [], []
for col_name in df.columns:
    bucket = object_cols if df[col_name].dtype == 'O' else num_cols
    bucket.append(col_name)

print("All objective columns: ", object_cols)
print("All numeric columns: ", num_cols)

# Cardinality of each object-dtype column (displayed as the cell output).
print("No. of unique values in each objective column")
df[object_cols].nunique()

All objective columns:  ['Date Rptd', 'DATE OCC', 'AREA NAME', 'Crm Cd Desc', 'Mocodes', 'Vict Sex', 'Vict Descent', 'Premis Desc', 'Weapon Desc', 'Status', 'Status Desc', 'LOCATION', 'Cross Street']
All numeric columns:  ['DR_NO', 'TIME OCC', 'AREA', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Vict Age', 'Premis Cd', 'Weapon Used Cd', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LAT', 'LON']
No. of unique values in each objective column


Date Rptd         1420
DATE OCC          1420
AREA NAME           21
Crm Cd Desc        137
Mocodes         280211
Vict Sex             5
Vict Descent        20
Premis Desc        306
Weapon Desc         79
Status               6
Status Desc          6
LOCATION         61278
Cross Street      8860
dtype: int64

In [None]:
# Inspect non-null counts of the numeric columns before choosing which to keep.
df[num_cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 732190 entries, 0 to 843513
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   DR_NO           732190 non-null  int64  
 1   TIME OCC        732190 non-null  int64  
 2   AREA            732190 non-null  int64  
 3   Rpt Dist No     732190 non-null  int64  
 4   Part 1-2        732190 non-null  int64  
 5   Crm Cd          732190 non-null  int64  
 6   Vict Age        732190 non-null  int64  
 7   Premis Cd       732189 non-null  float64
 8   Weapon Used Cd  293759 non-null  float64
 9   Crm Cd 1        732180 non-null  float64
 10  Crm Cd 2        61768 non-null   float64
 11  Crm Cd 3        2091 non-null    float64
 12  Crm Cd 4        61 non-null      float64
 13  LAT             732190 non-null  float64
 14  LON             732190 non-null  float64
dtypes: float64(8), int64(7)
memory usage: 89.4 MB


In [None]:
# Numeric columns excluded from the feature set. 'Weapon Used Cd' and
# 'Crm Cd 2'..'Crm Cd 4' are mostly null in df[num_cols].info(); 'AREA' is
# presumably dropped because 'AREA NAME' is kept as a categorical feature.
excluded_num_cols = {"Weapon Used Cd", "AREA", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4"}

# Filter instead of calling list.remove(): remove() raises ValueError when this
# cell is re-run (the names are already gone), whereas filtering is idempotent.
num_cols = [col for col in num_cols if col not in excluded_num_cols]
df[num_cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 732190 entries, 0 to 843513
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   DR_NO        732190 non-null  int64  
 1   TIME OCC     732190 non-null  int64  
 2   Rpt Dist No  732190 non-null  int64  
 3   Part 1-2     732190 non-null  int64  
 4   Crm Cd       732190 non-null  int64  
 5   Vict Age     732190 non-null  int64  
 6   Premis Cd    732189 non-null  float64
 7   Crm Cd 1     732180 non-null  float64
 8   LAT          732190 non-null  float64
 9   LON          732190 non-null  float64
dtypes: float64(4), int64(6)
memory usage: 61.4 MB


In [None]:
# Categorical features; these are one-hot encoded further down.
cat_cols = ["AREA NAME", "Vict Sex", "Status"]

# Target column, and the full feature list (numeric first, then categorical).
label_col = "Vict Descent"
feat_cols = [*num_cols, *cat_cols]

In [None]:
# Work on an independent copy restricted to the selected feature columns.
sel_df = df.loc[:, feat_cols].copy()

sel_df.head()

Unnamed: 0,DR_NO,TIME OCC,Rpt Dist No,Part 1-2,Crm Cd,Vict Age,Premis Cd,Crm Cd 1,LAT,LON,AREA NAME,Vict Sex,Status
0,10304468,2230,377,2,624,36,501.0,624.0,34.0141,-118.2978,Southwest,F,AO
1,190101086,330,163,2,624,25,102.0,624.0,34.0459,-118.2545,Central,M,IC
2,200110444,1200,155,2,845,0,726.0,845.0,34.0448,-118.2474,Central,X,AA
3,191501505,1730,1543,2,745,76,502.0,745.0,34.1685,-118.4019,N Hollywood,F,IC
4,191921269,415,1998,2,740,31,409.0,740.0,34.2198,-118.4468,Mission,X,IC


In [None]:
# Check which of the selected feature columns still contain missing values.
sel_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 732190 entries, 0 to 843513
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   DR_NO        732190 non-null  int64  
 1   TIME OCC     732190 non-null  int64  
 2   Rpt Dist No  732190 non-null  int64  
 3   Part 1-2     732190 non-null  int64  
 4   Crm Cd       732190 non-null  int64  
 5   Vict Age     732190 non-null  int64  
 6   Premis Cd    732189 non-null  float64
 7   Crm Cd 1     732180 non-null  float64
 8   LAT          732190 non-null  float64
 9   LON          732190 non-null  float64
 10  AREA NAME    732190 non-null  object 
 11  Vict Sex     732186 non-null  object 
 12  Status       732190 non-null  object 
dtypes: float64(4), int64(6), object(3)
memory usage: 78.2+ MB


In [None]:
def _fill_with_mode(column):
    """Return `column` with its missing values replaced by its first mode."""
    return column.fillna(column.mode()[0])


# Impute every feature column (numeric and categorical alike) with its own mode.
sel_df = sel_df.apply(_fill_with_mode)

# Confirm that no feature column has missing values left.
sel_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 732190 entries, 0 to 843513
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   DR_NO        732190 non-null  int64  
 1   TIME OCC     732190 non-null  int64  
 2   Rpt Dist No  732190 non-null  int64  
 3   Part 1-2     732190 non-null  int64  
 4   Crm Cd       732190 non-null  int64  
 5   Vict Age     732190 non-null  int64  
 6   Premis Cd    732190 non-null  float64
 7   Crm Cd 1     732190 non-null  float64
 8   LAT          732190 non-null  float64
 9   LON          732190 non-null  float64
 10  AREA NAME    732190 non-null  object 
 11  Vict Sex     732190 non-null  object 
 12  Status       732190 non-null  object 
dtypes: float64(4), int64(6), object(3)
memory usage: 78.2+ MB


In [None]:
# One-hot encode the categorical feature columns and the target column.
# The dummy dtype is pinned explicitly: the pandas default changed from uint8
# to bool in pandas 2.0, and bool indicator columns next to int/float columns
# make `feature_df.values` fall back to an object array instead of a numeric
# one. Pinning uint8 reproduces the 0/1 columns shown in the outputs on any
# pandas version.
feature_df = pd.get_dummies(sel_df, dtype="uint8")
label_df = pd.get_dummies(df[label_col], dtype="uint8")

In [None]:
# Preview the one-hot encoded feature matrix.
feature_df.head()

Unnamed: 0,DR_NO,TIME OCC,Rpt Dist No,Part 1-2,Crm Cd,Vict Age,Premis Cd,Crm Cd 1,LAT,LON,...,Vict Sex_F,Vict Sex_H,Vict Sex_M,Vict Sex_X,Status_AA,Status_AO,Status_CC,Status_IC,Status_JA,Status_JO
0,10304468,2230,377,2,624,36,501.0,624.0,34.0141,-118.2978,...,1,0,0,0,0,1,0,0,0,0
1,190101086,330,163,2,624,25,102.0,624.0,34.0459,-118.2545,...,0,0,1,0,0,0,0,1,0,0
2,200110444,1200,155,2,845,0,726.0,845.0,34.0448,-118.2474,...,0,0,0,1,1,0,0,0,0,0
3,191501505,1730,1543,2,745,76,502.0,745.0,34.1685,-118.4019,...,1,0,0,0,0,0,0,1,0,0
4,191921269,415,1998,2,740,31,409.0,740.0,34.2198,-118.4468,...,0,0,0,1,0,0,0,1,0,0


In [None]:
# Compare the column sets before and after one-hot encoding.
for frame in (sel_df, feature_df):
    print(frame.columns)

Index(['DR_NO', 'TIME OCC', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Vict Age',
       'Premis Cd', 'Crm Cd 1', 'LAT', 'LON', 'AREA NAME', 'Vict Sex',
       'Status'],
      dtype='object')
Index(['DR_NO', 'TIME OCC', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Vict Age',
       'Premis Cd', 'Crm Cd 1', 'LAT', 'LON', 'AREA NAME_77th Street',
       'AREA NAME_Central', 'AREA NAME_Devonshire', 'AREA NAME_Foothill',
       'AREA NAME_Harbor', 'AREA NAME_Hollenbeck', 'AREA NAME_Hollywood',
       'AREA NAME_Mission', 'AREA NAME_N Hollywood', 'AREA NAME_Newton',
       'AREA NAME_Northeast', 'AREA NAME_Olympic', 'AREA NAME_Pacific',
       'AREA NAME_Rampart', 'AREA NAME_Southeast', 'AREA NAME_Southwest',
       'AREA NAME_Topanga', 'AREA NAME_Van Nuys', 'AREA NAME_West LA',
       'AREA NAME_West Valley', 'AREA NAME_Wilshire', 'Vict Sex_-',
       'Vict Sex_F', 'Vict Sex_H', 'Vict Sex_M', 'Vict Sex_X', 'Status_AA',
       'Status_AO', 'Status_CC', 'Status_IC', 'Status_JA', 'Status_JO'],
      dty

In [None]:
# Preview the one-hot encoded target (one indicator column per 'Vict Descent' code).
label_df.head()

Unnamed: 0,-,A,B,C,D,F,G,H,I,J,K,L,O,P,S,U,V,W,X,Z
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [None]:
# Report the dimensions of the encoded inputs and targets.
for caption, frame in (
    ("Shape of input feature df: ", feature_df),
    ("Shape of output label df: ", label_df),
):
    print(caption, frame.shape)

Shape of input feature df:  (732190, 42)
Shape of output label df:  (732190, 20)


In [None]:
# feature_df and label_df are the one-hot encoded frames built above; their
# column counts give the number of input features and of target classes.
n_feat, n_class = feature_df.shape[1], label_df.shape[1]

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
split_parts = train_test_split(
    feature_df.values,
    label_df.values,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# **Concatenating different columns into one**

In [None]:
# Build one text document per row by joining the stringified column values.
# The target column is excluded so the label does not leak into the model
# input, and 'new_text' itself is excluded so re-running this cell does not
# fold the previous result back into the text.
text_cols = [col for col in df.columns if col not in (label_col, "new_text")]

# Iterate plain row tuples instead of df.iloc[i]: the positional lookup builds
# a new Series per row, which is very slow on a frame of this size.
df["new_text"] = [
    " ".join(map(str, row))
    for row in df[text_cols].itertuples(index=False, name=None)
]

# Frame consumed by the text model: the joined text plus the target column.
final_df = df[["new_text", label_col]].copy()

In [None]:
# Display the text/label frame (pandas truncates the rendering of long frames).
final_df

Unnamed: 0,new_text,Vict Descent
0,10304468 01/08/2020 12:00:00 AM 01/08/2020 12:...,B
1,190101086 01/02/2020 12:00:00 AM 01/01/2020 12...,H
2,200110444 04/14/2020 12:00:00 AM 02/13/2020 12...,X
3,191501505 01/01/2020 12:00:00 AM 01/01/2020 12...,W
4,191921269 01/01/2020 12:00:00 AM 01/01/2020 12...,X
...,...,...
843509,231606525 03/22/2023 12:00:00 AM 03/22/2023 12...,H
843510,231210064 04/12/2023 12:00:00 AM 04/12/2023 12...,B
843511,230115220 07/02/2023 12:00:00 AM 07/01/2023 12...,H
843512,230906458 03/05/2023 12:00:00 AM 03/05/2023 12...,H


# **Label-encoding the target column and splitting the data**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



# Step 1: Data Preprocessing
# Fixed length (in tokens) that every text sequence is padded/truncated to.
max_sequence_length = 144

# Encode labels: map each target code to an integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(final_df[label_col])
final_df['label'] = label_encoder.transform(final_df[label_col])

# Split the data into training and testing sets (80% / 20%, fixed seed).
train_df, test_df = train_test_split(
    final_df,
    test_size=0.2,
    random_state=42,
)

# **Tokenizing the text data and applying post padding**

In [None]:
# Tokenize the text data
# Vocabulary cap passed to the tokenizer as `num_words`.
# (A stray trailing `t` after this assignment made the cell a SyntaxError.)
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
# Fit on the training split only so the vocabulary is not built from test text.
tokenizer.fit_on_texts(train_df['new_text'])

# Convert text to sequences of integer token ids
X_train_sequences = tokenizer.texts_to_sequences(train_df['new_text'])
X_test_sequences = tokenizer.texts_to_sequences(test_df['new_text'])

# Pad sequences for consistent length: shorter sequences get zeros appended,
# longer ones are cut at the end ('post' padding and truncation).
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Convert labels to NumPy arrays (integer class ids from the label encoder)
y_train = train_df['label'].values
y_test = test_df['label'].values

# Number of classes
num_classes = len(label_encoder.classes_)

# **Defining model**

In [None]:
# Step 2: Model Definition

embedding_dim = 256
lstm_units = 128

# Embedding -> two stacked LSTMs -> softmax over the target classes.
model1 = tf.keras.Sequential([
    # mask_zero=True marks the id-0 positions added by post-padding as masked,
    # so the LSTMs skip them. Without the mask, the final LSTM state is
    # computed after a long run of padding steps rather than at the last real
    # token. Index 0 is used only for padding by pad_sequences, so
    # input_dim=max_words still covers every token id.
    layers.Embedding(
        input_dim=max_words,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
        mask_zero=True,
    ),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    layers.LSTM(units=lstm_units, return_sequences=True),
    layers.LSTM(units=lstm_units),
    layers.Dense(num_classes, activation='softmax')
])
# Sparse loss: the targets are integer class ids, not one-hot vectors.
model1.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Save the model to disk whenever validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint("best_model.h5", monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)

# EarlyStopping callback to stop training if validation accuracy has not
# improved for 3 consecutive epochs. restore_best_weights=True puts the weights
# of the best epoch back into the model when training stops; otherwise the
# in-memory model keeps the weights of the last (non-improving) epoch, and any
# later evaluation would not match the saved checkpoint.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    mode='max',
    verbose=1,
    restore_best_weights=True,
)

In [None]:
# Step 3: Training

epochs = 5
# (A stray trailing `t` after this assignment made the cell a SyntaxError.)
batch_size = 32
# Fraction of the training data held back for validation. The checkpoint and
# early-stopping callbacks select a model based on validation accuracy, so the
# validation data must not be the test set; otherwise the test score reported
# below is no longer an unbiased estimate.
validation_fraction = 0.1

model1.fit(
    X_train_padded,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_fraction,
    callbacks=[checkpoint, early_stopping],
)

# Step 4: Evaluation on the test set, which was not seen during training.

loss, accuracy = model1.evaluate(X_test_padded, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%')

Epoch 1/5
Epoch 1: val_accuracy improved from -inf to 0.35185, saving model to best_model.h5
Epoch 2/5
    4/18305 [..............................] - ETA: 5:45 - loss: 1.6249 - accuracy: 0.3828

  saving_api.save_model(


Epoch 2: val_accuracy did not improve from 0.35185
Epoch 3/5
Epoch 3: val_accuracy did not improve from 0.35185
Epoch 4/5
Epoch 4: val_accuracy did not improve from 0.35185
Epoch 4: early stopping
Test Loss: 1.6823, Test Accuracy: 35.18%


In [None]:
# Step 5: Predict class probabilities for the test set and take the most
# probable class id for each row.
y_pred_prob = model1.predict(X_test_padded)
y_pred = tf.argmax(y_pred_prob, axis=1)

y_pred = y_pred.numpy()

# Map integer class ids back to the original target codes for readability.
true_labels = label_encoder.inverse_transform(y_test)


predicted_labels = label_encoder.inverse_transform(y_pred)

# Step 6: Print Classification Report
# zero_division=0: a class that is never predicted has an undefined precision.
# Reporting it as 1.00 (zero_division=1) made such classes look perfectly
# precise and inflated the averaged precision; 0 reflects that the model never
# identified them.
print(classification_report(true_labels, predicted_labels, zero_division=0))

              precision    recall  f1-score   support

           -       1.00      0.00      0.00         1
           A       1.00      0.00      0.00      3728
           B       1.00      0.00      0.00     23860
           C       1.00      0.00      0.00       607
           D       1.00      0.00      0.00        15
           F       1.00      0.00      0.00       745
           G       1.00      0.00      0.00        20
           H       0.35      1.00      0.52     51524
           I       1.00      0.00      0.00       172
           J       1.00      0.00      0.00       218
           K       1.00      0.00      0.00       870
           L       1.00      0.00      0.00        13
           O       1.00      0.00      0.00     13437
           P       1.00      0.00      0.00        48
           S       1.00      0.00      0.00         8
           U       1.00      0.00      0.00        40
           V       1.00      0.00      0.00       190
           W       1.00    