<a href="https://www.kaggle.com/code/andymejia/la-crimes-times?scriptVersionId=144791277" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow
from tensorflow.keras.models import Sequential 

In [None]:
# Load the Los Angeles crime CSV (2020 to present) from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/los-angeles-crime-data-2020-2023/Crime_Data_from_2020_to_Present.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics of the raw data.
df.describe()

In [None]:
# List the column labels of the raw data.
df.keys()

In [None]:
# Columns excluded from the working copy used by the rest of the notebook.
columns_to_drop = [
    'DR_NO', 'Date Rptd', 'Part 1-2', 'Mocodes',
    'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4',
    'LOCATION', 'Cross Street',
]
df_LA = df.drop(columns=columns_to_drop)

In [None]:
# Preview the first 20 rows of the trimmed data.
df_LA.head(20)

In [None]:
# Summary statistics of the trimmed data.
df_LA.describe()

In [None]:
# Print the frequency table of every column to see which values occur and how often.
for column_name, column_values in df_LA.items():
    print(f"\nColumn Name: {column_name}")
    print(column_values.value_counts())

In [None]:
# Number of rows for each (area name, crime description) pair, as a Series.
grouped_data = df_LA.groupby(['AREA NAME', 'Crm Cd Desc']).size()

In [None]:
# Turn the group keys back into columns and name the size column 'counts'.
grouped_data = grouped_data.reset_index(name='counts')

In [None]:
# Rank areas by their total number of recorded crimes. `grouped_data` holds one
# row per (area, crime type) pair, so the counts are summed per area first;
# sorting the raw rows would rank area/crime pairs and repeat area names on
# the axis.
top_areas = (
    grouped_data.groupby('AREA NAME', as_index=False)['counts']
    .sum()
    .sort_values(by='counts', ascending=False)
    .head(21)
)

top_areas.plot.barh(x="AREA NAME", y="counts", legend=False)
plt.xlabel('Count')
plt.ylabel('Area Name')
plt.title('Area with Highest Crime Counts')
plt.gca().invert_yaxis()  # barh draws the first row at the bottom; flip so the largest is on top
plt.show()

In [None]:
# Number of incidents per victim descent code.
desc_counts = df_LA['Vict Descent'].value_counts()

# Descent code -> readable label, following the legend published with the
# dataset's "Vict Descent" column description.
desc_legend = {
    'A': 'Other Asian',
    'B': 'Black',
    'C': 'Chinese',
    'D': 'Cambodian',
    'F': 'Filipino',
    'G': 'Guamanian',
    'H': 'Hispanic/Latin/Mexican',
    'I': 'American Indian/Alaskan Native',
    'J': 'Japanese',
    'K': 'Korean',
    'L': 'Laotian',
    'O': 'Other',
    'P': 'Pacific Islander',
    'S': 'Samoan',
    'U': 'Hawaiian',
    'V': 'Vietnamese',
    'W': 'White',
    'X': 'Unknown',
    'Z': 'Asian Indian',
    # NOTE(review): 'R' is not part of the published legend; label carried over unverified.
    'R': 'Other Pacific Islander',
}


In [None]:
# Share of incidents at or below which a descent code is folded into one combined slice.
threshold = 0.01

# Proportion of incidents per descent code.
desc_proportions = desc_counts / desc_counts.sum()

# Keep the codes above the threshold and merge the remainder into a single slice.
selected_categories = desc_proportions[desc_proportions > threshold]
small_share = desc_proportions[desc_proportions <= threshold].sum()
if small_share > 0:
    # Named differently from descent code 'O' (labelled "Other") so the two
    # legend entries stay distinguishable; skipped when nothing was merged.
    selected_categories['Other (combined)'] = small_share

# Readable legend labels; codes missing from the legend are shown as-is.
labels = [desc_legend.get(code, code) for code in selected_categories.index]

# Pie slices carry the short codes, the legend carries the readable labels.
plt.figure(figsize=(10, 10))
plt.pie(selected_categories, labels=selected_categories.index, autopct='%1.1f%%', startangle=140)
plt.title('Victim Descent')
plt.legend(labels, loc='upper left')
plt.show()

In [None]:
# Distinct weapon descriptions, followed by how many distinct values there are.
weapon_column = df_LA['Weapon Desc']
unique_weapons = weapon_column.unique()
num_unique_weapons = weapon_column.nunique()

print(unique_weapons)
print(num_unique_weapons)

In [None]:
# Distinct crime descriptions, followed by how many distinct values there are.
crime_column = df_LA['Crm Cd Desc']
unique_crimes = crime_column.unique()
num_unique_crimes = crime_column.nunique()

print(unique_crimes)
print(num_unique_crimes)

In [None]:
# Bar chart of the 15 most frequent weapon descriptions.
weapon_frequencies = df_LA['Weapon Desc'].value_counts()
top_n = weapon_frequencies.head(15)
top_n.plot.bar()
plt.show()

In [None]:
#Long tail plot
#cumulative_counts = df_LA['Crm Cd Desc'].value_counts().cumsum()
#cumulative_counts.plot()
#plt.show()

In [None]:
# Collapse every crime description with fewer than 10,000 incidents into a
# single 'Other' bucket before plotting the frequencies.
counts = df_LA['Crm Cd Desc'].value_counts()
mask = df_LA['Crm Cd Desc'].isin(counts[counts < 10000].index)
# One .loc assignment instead of chained indexing (df[col][mask] = ...), which
# can write to a temporary copy and triggers SettingWithCopyWarning.
df_LA.loc[mask, 'Crm Cd Desc'] = 'Other'
df_LA['Crm Cd Desc'].value_counts().plot(kind='bar')
plt.show()

In [None]:
# Summary statistics for the two coordinate columns.
for coordinate in ('LAT', 'LON'):
    print(df_LA[coordinate].describe())

In [None]:
# Keep only the rows where both coordinates are non-zero.
nonzero_latitude = df_LA['LAT'].ne(0)
nonzero_longitude = df_LA['LON'].ne(0)
df_filtered = df_LA[nonzero_latitude & nonzero_longitude]

In [None]:
# Longitude goes on the x-axis and latitude on the y-axis so the point cloud
# is oriented like a map (north up, east right) instead of transposed.
plt.figure(figsize=(12, 8))
plt.scatter(df_filtered['LON'], df_filtered['LAT'], s=1, alpha=0.1)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Crime Locations in LA')
plt.show()

In [None]:
# List the column labels of the trimmed data.
df_LA.keys()

In [None]:
# Pairwise correlation of the numeric columns only. numeric_only=True is
# passed explicitly because, since pandas 2.0, corr() no longer skips
# non-numeric columns by default and fails on string columns instead.
correlation_matrix = df_LA.corr(numeric_only=True)
print(correlation_matrix)

In [None]:
# Draw the correlation matrix as a heatmap with one labelled tick per column.
column_labels = correlation_matrix.columns
tick_positions = np.arange(len(column_labels))

plt.figure(figsize=(10, 8))
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='none')
plt.colorbar()
plt.xticks(tick_positions, column_labels, rotation=90)
plt.yticks(tick_positions, column_labels)
plt.show()

In [None]:
# Count the rows recorded for each 'DATE OCC' value and expose the result as
# a two-column frame ('DATE OCC', 'count').
rows_per_date = df_LA.groupby('DATE OCC').size()
crime_count_per_day = rows_per_date.reset_index(name='count')

In [None]:
# Convert into a time series: parse the date strings and use them as the index.
# The rows arrive ordered by the raw date strings, and string order need not
# be chronological, so the index is sorted after parsing. The positional
# train/test split and the sliding windows built later rely on time order.
crime_count_per_day['DATE OCC'] = pd.to_datetime(crime_count_per_day['DATE OCC'])
crime_count_per_day.set_index('DATE OCC', inplace=True)
crime_count_per_day.sort_index(inplace=True)

In [None]:
# Positional 80/20 split: the first 80% of the rows form the training set,
# the remaining rows form the test set.
train_size = int(len(crime_count_per_day) * 0.8)
train = crime_count_per_day.iloc[:train_size]
test = crime_count_per_day.iloc[train_size:]

In [None]:
# Sliding-window dataset builder used to feed the sequence model.
def create_dataset(dataset, look_back=1):
    """Build supervised (window, next value) pairs from column 0 of *dataset*.

    Args:
        dataset: 2-D array-like of shape (rows, columns); only column 0 is read.
        look_back: Number of consecutive rows that make up one input window.

    Returns:
        A tuple ``(X, Y)`` where ``X`` has shape ``(rows - look_back, look_back)``
        and ``Y`` has shape ``(rows - look_back,)``; ``Y[i]`` is the value that
        immediately follows window ``X[i]``. When there are too few rows for a
        single window, correctly shaped empty arrays are returned so callers
        can still read ``X.shape[1]``.
    """
    values = np.asarray(dataset)
    n_samples = len(values) - look_back

    if n_samples <= 0:
        # np.array([]) would have shape (0,), which breaks reshapes downstream.
        return np.empty((0, look_back)), np.empty((0,))

    X = np.array([values[start:start + look_back, 0] for start in range(n_samples)])
    # The target of window i is row i + look_back; copy so Y never aliases the input.
    Y = values[look_back:, 0].copy()
    return X, Y

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only and reuse it for the test split.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# One-step windows: each sample is a single previous value, the target is the next one.
X_train, y_train = create_dataset(train_scaled, look_back=1)
X_test, y_test = create_dataset(test_scaled, look_back=1)

# Reshape to [samples, time steps, features]. X_test needs the same 3-D layout
# as X_train because it is passed to the same model for validation and
# prediction.
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

In [None]:
#Building the RNN model
from keras.models import Sequential
from keras.layers import LSTM, Dense

# One 50-unit LSTM layer feeding a single-unit dense output, compiled with
# the Adam optimizer and mean squared error loss.
lstm_input_shape = (X_train.shape[1], X_train.shape[2])
model = Sequential([
    LSTM(50, input_shape=lstm_input_shape),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse')

In [None]:
# Train for 50 epochs in batches of 72 without shuffling, validating against
# the test windows after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=72,
    validation_data=(X_test, y_test),
    verbose=1,
    shuffle=False,
)

In [None]:
# Predict on the test windows, then undo the min-max scaling on the predictions.
scaled_predictions = model.predict(X_test)
y_pred = scaler.inverse_transform(scaled_predictions)

In [None]:
# Reshape y_test into a single column, i.e. shape (n, 1).
y_test = y_test.reshape(-1, 1)

In [None]:
# Undo the min-max scaling on y_test.
# NOTE(review): y_test is overwritten, so re-running this cell applies the
# inverse transform a second time.
y_test = scaler.inverse_transform(y_test)

In [None]:
from sklearn.metrics import mean_squared_error
# Root mean squared error between the unscaled targets and predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print('Root Mean Squared Error:', rmse)

In [None]:
# Overlay the actual and predicted series on the same axes.
for series, series_label in ((y_test, 'True Values'), (y_pred, 'Predictions')):
    plt.plot(series, label=series_label)
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()
plt.show()