<a href="https://www.kaggle.com/code/reyanshbhardwaj12/usa-crime-data-eda-wrangling-visualization?scriptVersionId=144850982" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Kaggle input location of the Los Angeles "Crime Data from 2020 to Present" CSV.
DATA_PATH = '/kaggle/input/los-angeles-crime-data-from-2020-to-present/Crime_Data_from_2020_to_Present.csv'

# Load the raw incident table; every later cell works on this frame.
df = pd.read_csv(DATA_PATH)

In [None]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

In [None]:
# Peek at the first rows to see which columns are present and what the values look like.
df.head()

In [None]:
# Column names, non-null counts and dtypes in one summary.
df.info()

In [None]:
df.dtypes
# The date columns load with dtype "object"; they are converted to datetime
# further down (see the pd.to_datetime cell) so they can be used in time-based analysis.

In [None]:
df.describe()
# Observations the author recorded from this summary:
# - Part 1 crimes are about 50% of records and Part 2 crimes slightly more than 50%.
#   NOTE(review): describe() only shows summary statistics of the 'Part 1-2' column;
#   confirm the split with df['Part 1-2'].value_counts().
# - The mean victim age is 29 years.

In [None]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()
# Several columns show a very high number of missing values.

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()
# The author's run reported 0 duplicated rows.

# Data Manipulation.

In [None]:
# Drop the coordinate columns; nothing in the analysis below reads them.
df.drop(columns=['LAT', 'LON'], inplace=True)

In [None]:
# Drop the cross-street column; it is not used in the analysis below.
df.drop(columns='Cross Street', inplace=True)

In [None]:
# Drop the premise description text; it is not used in the analysis below.
df.drop(columns='Premis Desc', inplace=True)

In [None]:
# Shape after the column drops above, to confirm the column count went down.
df.shape

In [None]:
# Remaining column names after the drops.
df.columns

In [None]:
# Convert both date columns from text to datetime so the .dt accessor works later.
for date_col in ('Date Rptd', 'DATE OCC'):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Check the result: 'Date Rptd' and 'DATE OCC' should now show a datetime dtype.
df.dtypes

In [None]:
# Mean of 'Crm Cd 1', shown for reference before its missing values are filled.
df['Crm Cd 1'].mean()

In [None]:
# Fill missing 'Crm Cd 1' values with the column mean.
# The mean is computed here instead of pasted in as a literal, so it stays correct
# if the data changes. The result is assigned back to the column because calling
# fillna(..., inplace=True) on a column selection is chained assignment and is not
# guaranteed to modify `df` (it does nothing under pandas Copy-on-Write).
# NOTE(review): this column holds crime codes, so a mean is not itself a valid code — confirm this imputation is intended.
df['Crm Cd 1'] = df['Crm Cd 1'].fillna(df['Crm Cd 1'].mean())

In [None]:
# Mean of 'Crm Cd 2', shown for reference before its missing values are filled.
df['Crm Cd 2'].mean()

In [None]:
# Fill missing 'Crm Cd 2' values with the numeric column mean.
# The original filled with the mean written as a *string*, which mixes text into a
# numeric column; it also relied on chained inplace fillna, which is not guaranteed
# to modify `df`. Computing the mean and assigning back fixes both.
# NOTE(review): a mean of crime codes is not a valid code — confirm this imputation is intended.
df['Crm Cd 2'] = df['Crm Cd 2'].fillna(df['Crm Cd 2'].mean())

In [None]:
# Mean of 'Crm Cd 3', shown for reference before its missing values are filled.
df['Crm Cd 3'].mean()

In [None]:
# Fill missing 'Crm Cd 3' values with the numeric column mean.
# The mean is computed rather than hard-coded as a string (which would turn the
# column into mixed text/numbers), and the result is assigned back instead of
# using chained inplace fillna.
df['Crm Cd 3'] = df['Crm Cd 3'].fillna(df['Crm Cd 3'].mean())

In [None]:
# Mean of 'Crm Cd 4', shown for reference before its missing values are filled.
df['Crm Cd 4'].mean()

In [None]:
# Fill missing 'Crm Cd 4' values with the numeric column mean.
# The mean is computed rather than hard-coded as a string (which would turn the
# column into mixed text/numbers), and the result is assigned back instead of
# using chained inplace fillna.
df['Crm Cd 4'] = df['Crm Cd 4'].fillna(df['Crm Cd 4'].mean())

In [None]:
# Mean of 'Premis Cd', shown for reference when handling its missing values.
df['Premis Cd'].mean()

In [None]:
# Fill missing 'Premis Cd' values with that column's own mean.
# The original cell applied the 'Premis Cd' mean (as a string) to 'Crm Cd 4', which
# the previous fill cell had already handled, so 'Premis Cd' was never filled.
# The result is assigned back rather than using chained inplace fillna.
# NOTE(review): a mean of premise codes is not a valid code — confirm this imputation is intended.
df['Premis Cd'] = df['Premis Cd'].fillna(df['Premis Cd'].mean())

In [None]:
# Mean of 'Weapon Used Cd', shown for reference before its missing values are filled.
df['Weapon Used Cd'].mean()

In [None]:
# Fill missing 'Weapon Used Cd' values with the numeric column mean.
# The mean is computed rather than hard-coded as a string (which would turn the
# column into mixed text/numbers), and the result is assigned back instead of
# using chained inplace fillna.
# NOTE(review): a missing weapon code may mean "no weapon recorded"; confirm that a mean fill is intended.
df['Weapon Used Cd'] = df['Weapon Used Cd'].fillna(df['Weapon Used Cd'].mean())

In [None]:
# Replace missing values in the descriptive text columns with an explicit 'Unknown' label.
# Assigning the filled columns back (instead of chained `inplace=True` on each column
# selection) guarantees that `df` itself is updated.
text_columns = ['Mocodes', 'Vict Sex', 'Vict Descent', 'Weapon Desc']
df[text_columns] = df[text_columns].fillna('Unknown')


In [None]:
# Re-count missing values per column to verify the fills above (isna is the same check as isnull).
df.isna().sum()
# Done.

# Extracting insights out of the data

What is the overall trend in crime incidents in Los Angeles from 2020 to the present?


In [None]:
# Calendar year in which each incident occurred; used for the yearly groupings below.
df['Year'] = df['DATE OCC'].dt.year


In [None]:
# Number of incidents per occurrence year.
df['Year'].value_counts()

In [None]:
# Incidents per occurrence year, drawn as a line with each point labelled by its count.
yearly_occurrences = df.groupby('Year')['DATE OCC'].count()

fig, ax = plt.subplots(figsize=(12,3))
ax.plot(yearly_occurrences.index, yearly_occurrences.values, color='red', marker='o', linestyle='-')

# Place each count 10 points above its marker.
for year, n_incidents in yearly_occurrences.items():
    ax.annotate(f"{n_incidents}", (year, n_incidents), textcoords="offset points", xytext=(0, 10), ha='center')

ax.set_title('Crime Trends from 2020 to Present', loc='left')
ax.set_xlabel('Year')
ax.set_ylabel('')
ax.grid(True)

plt.show()


INCIDENT OCCURRENCES & REPORTS OVER TIME

In [None]:
# Daily number of incidents by occurrence date and by report date, on one chart.
occurrences = df['DATE OCC'].value_counts().sort_index()
reports = df['Date Rptd'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 4))
for daily_counts, series_label in ((occurrences, 'DATE OCC'), (reports, 'Date Rptd')):
    ax.plot(daily_counts.index, daily_counts.values, label=series_label)

ax.set_xlabel('Date')
ax.set_ylabel('Number of Incidents')
ax.set_title('Incident Occurrences and Reports Over Time')
ax.legend()
ax.grid()

 Which month had the highest number of reported crimes in 2020?

In [None]:
# Full month name (strftime '%B') of the report date; used for the month-wise chart below.
df['Month'] = df['Date Rptd'].dt.strftime('%B')


In [None]:
# Keep only incidents reported in 2020 so the chart matches its title and the
# question above. The original passed a boolean Series as `data` (and compared
# against 2023), so nothing was filtered and all years were counted.
reported_2020 = df[df['Date Rptd'].dt.year == 2020]

# Show months in calendar order instead of alphabetical order.
# Assumes the 'Month' column holds English month names — confirm if a non-English locale is in use.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

plt.figure(figsize=(12,5))
ax = sns.countplot(x='Month', data=reported_2020, order=month_order)
for bars in ax.containers:
    ax.bar_label(bars)

plt.title('Monthwise reported crimes in 2020')
plt.ylabel('')
plt.show()
# NOTE(review): the earlier remark (July highest with 75098, November lowest) was read
# from the unfiltered all-years chart; re-read the highest and lowest months from this 2020-only chart.

How has the distribution of crime types evolved over the past three years?


In [None]:

# Three-year window ending at the most recent incident in the data. Anchoring to the
# data instead of the wall clock keeps the chart reproducible when the notebook is
# re-run later (a clock-based window eventually slides past the end of the data).
end_date = df['DATE OCC'].max()
start_date = end_date - pd.DateOffset(years=3)
filtered_data = df[(df['DATE OCC'] >= start_date) & (df['DATE OCC'] <= end_date)]

# Rows: occurrence year, columns: crime description, values: number of incidents.
crime_type_counts = filtered_data.groupby([filtered_data['DATE OCC'].dt.year,'Crm Cd Desc']).size().unstack(fill_value=0)
# Keep the five crime types with the most incidents over the whole window.
top_5_crime_types = crime_type_counts.sum().sort_values(ascending=False).head(5).index
crime_type_counts_top_5 = crime_type_counts[top_5_crime_types]

# Draw on an explicit Axes: DataFrame.plot opens its own figure unless `ax` is given,
# which previously left the 15x8 figure empty.
fig, ax = plt.subplots(figsize=(15,8))
crime_type_counts_top_5.plot(kind='line', marker='o', legend=False, ax=ax)
ax.set_xticks(crime_type_counts_top_5.index)  # one tick per year, no fractional years
ax.set_xlabel('Year')  # the index is yearly, not year-month
ax.set_ylabel('Crime Count')
ax.set_title('Evolution of Crime Types Over the Past Three Years')
ax.legend(title='Crime Type', prop={'size': 4})
ax.tick_params(axis='x', rotation=45)
plt.show()

# Observations from the author's run: over the past 3 years "vehicle stolen" tops the list,
# while "theft of identity" increased in 2022 and then showed the sharpest drop in 2023.

 What are the top 10 most common crimes in the dataset?


In [None]:
# value_counts sorts by frequency, so the first ten rows are the ten most common crimes.
df['Crm Cd Desc'].value_counts().head(10)
# Vehicle stolen is at the top, followed by battery and theft of identity.

In [None]:
plt.figure(figsize=(12,5))
# Ten most frequent crime descriptions and their counts.
vc = df['Crm Cd Desc'].value_counts().head(10)
# x and y are passed as plain vectors, so no `data=` frame is given: newer seaborn
# raises when such vectors do not have the same length as `data`.
g = sns.barplot(x=vc.index, y=vc.values, palette='mako')

# Write each count on its bar. Iterating over the values avoids positional `vc[i]`
# lookups on a label index and still works when fewer than ten categories exist.
for i, value in enumerate(vc.values):
    g.text(x=i + 0.125, y=value - 2, s=value, ha='center')


plt.title('Top 10 most common crimes')
plt.xticks(rotation=20)
plt.show()

 Are there any geographical patterns in crime distribution across different neighborhoods in Los Angeles?


In [None]:
# Incidents per 'AREA NAME', then the average of those per-area counts.
# Note these are raw incident counts, not population-adjusted rates.
neighborhood_crime_counts = df['AREA NAME'].value_counts()
average_crime_per_neighborhood = neighborhood_crime_counts.mean()
average_crime_per_neighborhood


In [None]:
# Keep only the areas whose incident count exceeds the per-area average.
above_average_mask = neighborhood_crime_counts.gt(average_crime_per_neighborhood)
high_crime_neighborhoods = neighborhood_crime_counts.loc[above_average_mask]

print("Neighborhoods with Above-Average Crime Rates:")
print(high_crime_neighborhoods)

# Central, 77th Street and Pacific are the top three areas with above-average crime counts.

In [None]:
# Bar chart of the areas whose incident count is above the per-area average.
plt.figure(figsize=(12,5))
# x and y are passed as plain vectors, so no `data=` frame is given: newer seaborn
# raises when such vectors do not have the same length as `data`.
sns.barplot(x=high_crime_neighborhoods.index, y=high_crime_neighborhoods.values, palette='rocket')

plt.title('Neighborhoods with Above-Average Crime Rates')
plt.show()

 What is the most common time of day for reported crimes?


In [None]:

# Bucket incidents by hour of day before counting. The original counted raw
# 'TIME OCC' values, so the "most common hour" was really the most common exact time.
# NOTE(review): assumes 'TIME OCC' is an HHMM number (e.g. 1330 for 13:30) — confirm against the dataset's column description.
hour_of_day = df['TIME OCC'].astype(int) // 100
crime_by_hour = hour_of_day.value_counts().sort_index()

# Find the most common time of day for reported crimes (the hour with the highest count)
most_common_hour = crime_by_hour.idxmax()
most_common_hour_count = crime_by_hour.max()

print(f"The most common time for reported crimes is {most_common_hour:02d}:00 with {most_common_hour_count} incidents.")

Q7 What is the crime rate against genders?

In [None]:
# Number of incidents per recorded victim sex value.
df['Vict Sex'].value_counts()

In [None]:
# Share of incidents per recorded victim sex, as a pie chart.
Gen = df['Vict Sex'].value_counts()
plt.figure(figsize=(4,4))
# Pull out only the largest slice. The list is sized from the data because
# matplotlib requires `explode` to have exactly one entry per wedge; a fixed
# six-element list fails whenever the number of categories differs.
myexplode = [0.1 if position == 0 else 0 for position in range(len(Gen))]
labels = Gen.index.tolist()
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue','lightseagreen']  # reused cyclically if there are more wedges

plt.pie(Gen, autopct='%0.1f%%', radius=1.25, wedgeprops={'edgecolor':'white'}, labels=labels,
        textprops={'size':11,}, shadow=True, startangle=70, explode=myexplode,
        colors=colors)

plt.title('Crime Rate against genders\n')
plt.show()
# Observation from the author's run: male victims account for 41.3% of incidents, followed by female victims with 36.9%.


Yearwise crime rate against genders

In [None]:
# Incidents per year, split by recorded victim sex, with a count label on every bar.
plt.figure(figsize=(12,5))
ax = sns.countplot(x='Year', hue='Vict Sex', data=df)
for bar_group in ax.containers:
    ax.bar_label(bar_group)
ax.legend(title='Gender', prop={'size': 8})
ax.set_title('Yearwise crime rate against gender')
plt.show()

# Overall, 2022 has the highest incident count across genders and 2023 has the least.

Areawise crime rate against genders

In [None]:
# Incidents per area, split by recorded victim sex, with a count label on every bar.
plt.figure(figsize=(14,6))


ax=sns.countplot(x=df['AREA NAME'],hue=df['Vict Sex'],data=df)
for bars in ax.containers:
    ax.bar_label(bars)

plt.legend(title='Gender', prop={'size': 8})
# The chart is grouped by 'AREA NAME', so the title says "Areawise"
# (it previously said "Statewise", which this chart does not show).
plt.title('Areawise crime rate against gender')
plt.xticks(rotation=35)
plt.yticks(rotation=15)
plt.ylabel('')
plt.show()

# Observation from the author's run: 77th Street, Southeast and Southwest have the most
# incidents with female victims; in all other areas male victims are the larger group.

Crime rate against age

In [None]:
# Ranks 2-10 of the most frequent victim ages (the single most frequent value is skipped).
df['Vict Age'].value_counts().iloc[1:10]

In [None]:
# Distribution of victim age.
# displot is a figure-level function that always creates its own figure, so a
# preceding plt.figure(...) only produced an extra blank figure and its size was
# ignored. The requested 5x2 inch size is expressed through height/aspect instead.
sns.displot(df['Vict Age'], height=2, aspect=2.5)
plt.show()

Crimes higher than average crime rate

In [None]:
# Incidents per crime description, then the average of those per-type counts.
crime_counts = df['Crm Cd Desc'].value_counts()
average_crime_counts = crime_counts.mean()
average_crime_counts

In [None]:
# Keep only the crime descriptions whose incident count exceeds the per-type average.
above_average_mask = crime_counts.gt(average_crime_counts)
high_crime_counts = crime_counts.loc[above_average_mask]

# Print the crime descriptions with above-average counts
print("Crime with Above-Average Crime Rates:")
print(high_crime_counts)

 Weapons used for crime

In [None]:
import plotly.express as px

In [None]:
# Leave out the 'Unknown' placeholder so only named weapons appear in the treemap.
known_weapon_rows = df['Weapon Desc'].ne('Unknown')
df_filtered = df.loc[known_weapon_rows]

fig = px.treemap(df_filtered, path=['Weapon Desc'], title='Weapon Distribution')
fig.update_traces(textfont_size=16)
fig.show()
# Strong-arm tops the list, followed by verbal threat and hand gun.


INCIDENT CLASSIFICATION

In [None]:
# Number of incidents per 'Part 1-2' classification value.
df['Part 1-2'].value_counts()

In [None]:
# Incidents per 'Part 1-2' classification, split by year, with a count label on every bar.
plt.figure(figsize=(10,4))

ax=sns.countplot(x=df['Part 1-2'],hue=df['Year'],data=df,palette='rocket')
for bars in ax.containers:
    ax.bar_label(bars)
plt.legend(title='Years', prop={'size': 7})
# Spelling corrected in the rendered title and axis label
# (they read "withrespect" and "Classificatin").
plt.title('Incident classification with respect to Years')
plt.xlabel('Incident Classification')
plt.ylabel('')
plt.show()
# Observations from the author's run:
# Crime classified as Part 1 is higher than Part 2 crime.
# Part 1 crime peaked in 2022, followed by 2021 and 2020.
# Part 2 crime showed the same trend as Part 1 crime.

In [None]:
# Incidents per 'Part 1-2' classification, split by recorded victim sex.
plt.figure(figsize=(12,4))

ax = sns.countplot(x='Part 1-2', hue='Vict Sex', data=df, palette='rocket')
for bar_group in ax.containers:
    ax.bar_label(bar_group)
ax.legend(title='Genders', prop={'size': 6})
ax.set_title('Incident classification vs Gender')
ax.set_xlabel('Incident Classification')
ax.set_ylabel('')
plt.show()

# Male victims faced more Part 1 crime than female and X victims.
# Conversely, female victims faced more Part 2 crime than male and X victims.

In [None]:
# Incidents per area, split by 'Part 1-2' classification, with a count label on every bar.
plt.figure(figsize=(15,5))

ax = sns.countplot(x='AREA NAME', hue='Part 1-2', data=df, palette='rocket')
for bar_group in ax.containers:
    ax.bar_label(bar_group)
ax.legend(title='Incident Classification', prop={'size': 6})
ax.set_title('Incident classification Areawise')
ax.set_xlabel('Area')
plt.xticks(rotation=25)
ax.set_ylabel('')
plt.show()


STATUS DESCRIPTION

In [None]:
# Number of incidents per case status description.
df['Status Desc'].value_counts()

In [None]:
# Rows: occurrence year, columns: status description, values: number of incidents.
Status_type_counts = df.groupby([df['Year'],'Status Desc']).size().unstack(fill_value=0)

# Draw on an explicit Axes: DataFrame.plot opens its own figure unless `ax` is given,
# which previously left the 15x8 figure empty.
fig, ax = plt.subplots(figsize=(15,8))
Status_type_counts.plot(kind='line', marker='o', legend=False, ax=ax)
ax.set_xticks(Status_type_counts.index)  # one tick per year, no fractional years
ax.set_xlabel('Year')  # the index is yearly, not year-month
ax.set_ylabel('')
ax.set_title('Evolution of Status Over the Past Three Years')
ax.legend(title='Status', prop={'size': 6})
ax.tick_params(axis='x', rotation=45)
plt.show()