In [None]:
#Import necessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from encodings.aliases import aliases

from uszipcode import ZipCodeSearchEngine

%matplotlib inline

In [None]:
#Probe which codecs can decode the first rows of crime.csv.
#This cell only reports candidates; it does not change how the file is loaded afterwards.
alias_values = set(aliases.values())

#Codecs that parsed the 10-row sample without raising, in alphabetical order
successful_encodings = []

#Sorted so the printed list is the same on every run (set order is arbitrary)
for encoding in sorted(alias_values):
    try:
        pd.read_csv("crime.csv", nrows=10, encoding=encoding)
    except Exception:
        #This codec cannot read the sample; move on to the next one.
        #(Exception rather than a bare except, so interrupting the kernel still works.)
        continue
    successful_encodings.append(encoding)
    print("successful", encoding)
    

In [None]:
#Load crime.csv into the `crime` DataFrame.
#No encoding argument is passed, so none of the codecs probed in the previous cell is applied here.
crime = pd.read_csv("crime.csv")

In [None]:
#Dimensions (rows, columns) of the data as loaded
crime.shape

In [None]:
#Remove rows that are exact duplicates, then discard every row that still contains a null value
crime = crime.drop_duplicates().dropna(axis=0)

In [None]:
#Dimensions (rows, columns) after dropping duplicate rows and rows with nulls
crime.shape

In [None]:
#Show the dtype and non-null count of each column
crime.info()

In [None]:
#Convert ReportedDate from object -> datetime
crime['ReportedDate'] = pd.to_datetime(crime['ReportedDate'])

#Convert Zip codes to integers
crime['Zip'] = crime['Zip'].astype(np.int64)

#Normalize text style of City column first, so the spelling match below sees a single casing
crime['City'] = crime['City'].str.title()

#Collapse different spellings of Cleveland to "Cleveland" in the City column.
#Names listed in exclude_city are blanked out before matching, so they keep their own label.
#Any other value containing "Cleveland" is relabelled; add a city here if it must stay separate.
exclude_city = ['Cleveland Heights']

is_cleveland_spelling = (
    crime['City']
    .replace(exclude_city, '', regex=True)
    .str.contains('Cleveland', case=False)
)
#The original cell only selected these rows and discarded the result; assign so the change takes effect
crime.loc[is_cleveland_spelling, 'City'] = 'Cleveland'

#Rows per city, rarest first
crime['City'].value_counts(ascending=True)

In [None]:
#Show dtypes again to confirm the ReportedDate and Zip conversions
crime.info()

In [None]:
#Summary statistics for the numerical columns
crime.describe()

In [None]:
#Summary statistics for the object-dtype (string) columns only
crime.describe(include='object')

In [None]:
#List the column names (headers) of the crime DataFrame
crime.columns

In [None]:
#Find columns that have null cells.
#.any() reduces each column explicitly instead of relying on np.sum(DataFrame) summing per column.
crime.columns[crime.isnull().any()]

In [None]:
#Find columns with no null cells.
#.all() reduces each column explicitly instead of relying on np.sum(DataFrame) summing per column.
crime.columns[crime.notnull().all()]

In [None]:
#Report how many distinct (non-null) values each column holds
for col, unique_count in crime.nunique().items():
    print("{} has {} unique values ".format(col, unique_count))

In [None]:
#Count the occurrences of each crime description, most frequent first
crime['UCRdesc'].value_counts()

In [None]:
#Find and plot the top 10 most common crimes as a percentage of all crimes
offense_group_vals = crime.UCRdesc.value_counts()[:10]

#Convert the counts to percentages once, so the table, the chart and the "%" in the title agree
offense_group_pct = offense_group_vals / crime.shape[0] * 100

display(offense_group_pct)

offense_group_pct.plot(kind='bar')
plt.title('Top 10 Offense Groups (as % of crimes)')

In [None]:
#List the 10 least common crime descriptions (table only; nothing is plotted in this cell)
crime['UCRdesc'].value_counts().sort_values(ascending=True).iloc[:10]

In [None]:
#Keep only offenses from 2016 onward: rows with OffenseYear < 2016 are removed, no upper bound is applied
is_before_2016 = crime['OffenseYear'] < 2016
recentCrime = crime.drop(index=crime.index[is_before_2016])

#Bar chart of the number of offenses recorded in each remaining year
crimes_per_year = recentCrime.groupby('OffenseYear')['OBJECTID'].count()
crimes_per_year.plot(kind='bar')
plt.title('Number of Crimes')

In [None]:
#Plot number of crimes by day of the week, Monday through Sunday
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

#groupby sorts the day names alphabetically; reindex puts the bars in calendar order
crimes_per_weekday = recentCrime.groupby('DOWname')['OBJECTID'].count().reindex(weekday_order)

crimes_per_weekday.plot(kind='bar', grid=True)
plt.title('Number of Crimes By Day of Week')

In [None]:
#Plot number of crimes by hour of the day
crimes_per_hour = recentCrime.groupby('HourofDay')['OBJECTID'].count()

crimes_per_hour.plot(kind='bar')
#Title added so this chart can be read on its own, like the other charts in this notebook
plt.title('Number of Crimes By Hour of Day');

In [None]:
#Count offenses for every (hour, weekday) pair: rows are hours of the day, columns are weekday names.
#The columns are then selected in Monday..Sunday order.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
week_and_hour = (
    recentCrime.groupby(['HourofDay', 'DOWname'])['OBJECTID']
    .count()
    .unstack()
    .loc[:, weekday_order]
)

#Seaborn heatmap of crime frequency by hour of the day and day of the week
hour_weekday_ax = sns.heatmap(week_and_hour, cmap=sns.cubehelix_palette(as_cmap=True))
hour_weekday_ax.set_title('Crime Frequency by Hour Throughout the Week')

In [None]:
#Build a styled table (s2) of crime counts with months as rows and years as columns.
#Cells whose count is below the average count per (year, month) are shown in blue.

avg_crime = recentCrime.groupby(['OffenseYear', 'OffenseMonth']).count()['OBJECTID'].mean()
print("The Average Number of Crimes is: " + str(avg_crime))

year_and_month = recentCrime.groupby(['OffenseMonth','OffenseYear']).count()['OBJECTID'].unstack()

def style_negative(v, props=''):
    """Return `props` when the cell value `v` is below `avg_crime`, otherwise None (no styling)."""
    return props if v<avg_crime else None

#The second rule dims cells whose value lies strictly between -0.3 and 0.3.
#NOTE: Styler.applymap is deprecated in favour of Styler.map from pandas 2.1; kept here so older pandas still works.
s2 = (
    year_and_month.style
    .applymap(style_negative, props='color:blue;')
    .applymap(lambda v: 'opacity:20%;' if (v<0.3) and (v>-.3) else None)
)
s2

In [None]:
#Helper for Styler.apply: marks the maximum value of the column (or row) it is given

def highlight_max(s, props=''):
    """Return one CSS string per element of `s`.

    Elements equal to the NaN-ignoring maximum of `s` get `props`;
    every other element gets the empty string (no styling).
    """
    peak = np.nanmax(s.values)
    is_peak = s == peak
    return np.where(is_peak, props, '')
#Highlight the maximum of each column of s2 (axis=0) with white text on a dark green background
s2.apply(highlight_max, props='color:white;background-color:darkgreen', axis=0)

In [None]:
#Heatmap of offense counts per district (rows) and year (columns), kept as District_year_heatmap

district_and_year = (
    recentCrime.groupby(['District', 'OffenseYear'])['OBJECTID']
    .count()
    .unstack()
)

district_palette = sns.cubehelix_palette(as_cmap=True)
District_year_heatmap = sns.heatmap(district_and_year, cmap=district_palette)
District_year_heatmap.set(title='Crimes per year in each District', xlabel='Year', ylabel='District')


In [None]:
#Styled table (s3) of yearly crime counts per district.
#Counts below the average count per (district, year) are shown in blue.

avg_crime_district = recentCrime.groupby(['District', 'OffenseYear'])['OBJECTID'].count().mean()
print("The average crime per district per year is: " + str(avg_crime_district))

def style_negative(v, props=''):
    """Return `props` when the cell value `v` is below `avg_crime_district`, otherwise None."""
    if v < avg_crime_district:
        return props
    return None

def _dim_near_zero(v):
    """Return a 20% opacity rule for values strictly between -0.3 and 0.3, otherwise None."""
    if -0.3 < v < 0.3:
        return 'opacity: 20%;'
    return None

s3 = (
    district_and_year.style
    .applymap(style_negative, props='color:blue;')
    .applymap(_dim_near_zero)
)
s3

In [None]:
# Helper for Styler.apply: marks the maximum value of the column (or row) it is given

def highlight_max(s, props=''):
    """Build the per-element CSS for `s`.

    `props` is used where an element equals the maximum of `s` (NaNs ignored);
    all other positions receive '' so they stay unstyled.
    """
    largest = np.nanmax(s.values)
    matches_largest = (s == largest)
    return np.where(matches_largest, props, '')
# Highlight the maximum of each column of s3 (axis=0) with white text on a dark red background
s3.apply(highlight_max, props='color:white;background-color:darkred', axis=0)

In [None]:
#Heatmap of offense counts for every city (rows) and year (columns)
city_and_year = (
    recentCrime.groupby(['City', 'OffenseYear'])['OBJECTID']
    .count()
    .unstack()
)

city_palette = sns.cubehelix_palette(as_cmap=True)
city_year_heatmap = sns.heatmap(city_and_year, cmap=city_palette)
city_year_heatmap.set(title='Frequency of Offenses by City', xlabel='Year', ylabel='City')

In [None]:
#Heatmap of offense counts for every zip code (rows) and year (columns)

zip_and_year = (
    recentCrime.groupby(['Zip', 'OffenseYear'])['OBJECTID']
    .count()
    .unstack()
)

zip_palette = sns.cubehelix_palette(as_cmap=True)
zip_year_heatmap = sns.heatmap(zip_and_year, cmap=zip_palette)
zip_year_heatmap.set(title='Frequency of Offenses by Zip Code', xlabel='Year', ylabel='Zip')




In [None]:
#Find and plot the 10 zip codes with the most recent crimes, as a percentage of recent crimes
Zip_group_vals = recentCrime.Zip.value_counts().head(10)

#The counts come from recentCrime, so they are divided by recentCrime's row count (not crime's).
#Scaled to percentages so the table and chart match the "%" in the title.
Zip_group_pct = Zip_group_vals / recentCrime.shape[0] * 100

display(Zip_group_pct)

Zip_group_pct.plot(kind='bar')
plt.title('Top 10 Zip (as % of crimes)')