<a href="https://colab.research.google.com/github/Akif29/AkifKhan.github.io/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import calendar
pd.set_option('display.max_rows', None)

# Loading and Previewing Crime Data

In [2]:
df = pd.read_csv("Crime.csv", low_memory=False)
df.head()

Unnamed: 0,Incident ID,Offence Code,CR Number,Dispatch Date / Time,NIBRS Code,Victims,Crime Name1,Crime Name2,Crime Name3,Police District Name,...,Street Prefix,Street Name,Street Suffix,Street Type,Start_Date_Time,End_Date_Time,Latitude,Longitude,Police District Number,Location
0,201202980,3550,180042096,08/23/2018 09:52:08 PM,35B,1,Crime Against Society,Drug Equipment Violations,DRUGS - NARCOTIC EQUIP - POSSESS,GERMANTOWN,...,,MIDDLEBROOK,,RD,08/23/2018 09:52:00 PM,,39.177744,-77.265619,5D,"(39.1777, -77.2656)"
1,201181293,3522,180015424,03/30/2018 01:00:55 AM,35A,1,Crime Against Society,Drug/Narcotic Violations,DRUGS - OPIUM OR DERIVATIVE - POSSESS,BETHESDA,...,,WOODMONT,,AVE,03/30/2018 01:01:00 AM,,38.992693,-77.097063,2D,"(38.9927, -77.0971)"
2,201181293,3562,180015424,03/30/2018 01:00:55 AM,35A,1,Crime Against Society,Drug/Narcotic Violations,DRUGS - MARIJUANA - POSSESS,BETHESDA,...,,WOODMONT,,AVE,03/30/2018 01:01:00 AM,,38.992693,-77.097063,2D,"(38.9927, -77.0971)"
3,201193163,3520,180029476,06/14/2018 10:26:45 PM,35A,1,Crime Against Society,Drug/Narcotic Violations,DRUGS - OPIUM OR DERIVATIVE - SELL,MONTGOMERY VILLAGE,...,,QUINCE ORCHARD,,RD,06/14/2018 10:26:00 PM,06/15/2018 03:00:00 AM,39.147954,-77.218189,6D,"(39.148, -77.2182)"
4,201204355,2204,180043926,09/03/2018 12:06:54 PM,220,1,Crime Against Property,Burglary/Breaking and Entering,BURGLARY - NO FORCED ENTRY-RESIDENTIAL,BETHESDA,...,,FALSTONE,,AVE,09/02/2018 11:30:00 PM,09/03/2018 12:30:00 AM,38.966174,-77.096561,2D,"(38.9662, -77.0966)"


# Dataframe shape

In [3]:
df.shape

(306094, 30)

# Counting NON-Null Values from each column

In [4]:
df.count()

Incident ID               306094
Offence Code              306094
CR Number                 306094
Dispatch Date / Time      257065
NIBRS Code                306094
Victims                   306094
Crime Name1               305822
Crime Name2               305822
Crime Name3               305822
Police District Name      306000
Block Address             279888
City                      304818
State                     306094
Zip Code                  302915
Agency                    306094
Place                     306094
Sector                    304564
Beat                      304564
PRA                       305855
Address Number            279985
Street Prefix              13631
Street Name               306093
Street Suffix               5432
Street Type               305755
Start_Date_Time           306094
End_Date_Time             144436
Latitude                  306094
Longitude                 306094
Police District Number    306094
Location                  306094
dtype: int

# Checking Null Values in DataFrame

In [5]:
df.isnull().sum()

Incident ID                    0
Offence Code                   0
CR Number                      0
Dispatch Date / Time       49029
NIBRS Code                     0
Victims                        0
Crime Name1                  272
Crime Name2                  272
Crime Name3                  272
Police District Name          94
Block Address              26206
City                        1276
State                          0
Zip Code                    3179
Agency                         0
Place                          0
Sector                      1530
Beat                        1530
PRA                          239
Address Number             26109
Street Prefix             292463
Street Name                    1
Street Suffix             300662
Street Type                  339
Start_Date_Time                0
End_Date_Time             161658
Latitude                       0
Longitude                      0
Police District Number         0
Location                       0
dtype: int

# Dropping Rows with Null Values

In [6]:
df.dropna(inplace=True)

# Checking Null Values after deletion

In [7]:
df.isnull().sum()

Incident ID               0.0
Offence Code              0.0
CR Number                 0.0
Dispatch Date / Time      0.0
NIBRS Code                0.0
Victims                   0.0
Crime Name1               0.0
Crime Name2               0.0
Crime Name3               0.0
Police District Name      0.0
Block Address             0.0
City                      0.0
State                     0.0
Zip Code                  0.0
Agency                    0.0
Place                     0.0
Sector                    0.0
Beat                      0.0
PRA                       0.0
Address Number            0.0
Street Prefix             0.0
Street Name               0.0
Street Suffix             0.0
Street Type               0.0
Start_Date_Time           0.0
End_Date_Time             0.0
Latitude                  0.0
Longitude                 0.0
Police District Number    0.0
Location                  0.0
dtype: float64

# Functions for visualisation

In [8]:
# Function template for treemap graphs
def treemap(categories,title,path,values):
    fig = px.treemap(categories, path=path, values=values, height=700,
                 title=title, color_discrete_sequence = px.colors.sequential.RdBu)
    fig.data[0].textinfo = 'label+text+value'
    fig.show()

In [9]:
# Function to create histograms
def histogram(data,path,color,title,xaxis,yaxis):
    fig = px.histogram(data, x=path,color=color)
    fig.update_layout(
        title_text=title,
        xaxis_title_text=xaxis,
        yaxis_title_text=yaxis,
        bargap=0.2,
        bargroupgap=0.1
    )
    fig.show()

In [10]:
# Function to create stacked bar graphs
def stackbar(data,x,y,title,xlab,ylab, barmode, legend = False):
    fig = px.bar(data, x=x, y=y,
             height=500)
    fig.update_layout(
    title_text=title,
    xaxis_title_text=xlab,
    yaxis_title_text=ylab,
    bargap=0.2,
    bargroupgap=0.1,
    plot_bgcolor='white',
    showlegend = legend,
    legend_title=None,
    barmode = barmode
    )

        # Customize axis appearance
    fig.update_xaxes(showline=True, showgrid=True)
    fig.update_yaxes(showline=True, showgrid=True, gridcolor = "black")

    # Set a larger title font size
    fig.update_layout(title_font=dict(size=24), title_x=0.5)
    fig.show()

In [11]:
# Function to create bar graphs
def bar(categories,x,y,color,title,xlab,ylab):
    fig = px.bar(categories, x=x, y=y,
             color=color,
             height=500)
    fig.update_layout(
    title_text=title,
    xaxis_title_text=xlab,
    yaxis_title_text=ylab,
    bargap=0.2,
    bargroupgap=0.1,
    plot_bgcolor='white',
    showlegend = False,
    )

        # Customize axis appearance
    fig.update_xaxes(showline=True, showgrid=False)
    fig.update_yaxes(showline=True, showgrid=True, gridcolor = "black")

    # Set a larger title font size
    fig.update_layout(title_font=dict(size=24), title_x=0.5)
    fig.show()

In [12]:
# Function to create a line graph

def line_plot(x, y, title, x_label, y_label, line_color='blue', line_width=2, marker_color='darkblue'):
    # Create an interactive line plot with Plotly
    fig = px.line(x=x, y=y, title=title, labels={'x': x_label, 'y': y_label})

    # Customize plot styling
    fig.update_traces(
        mode='lines+markers',
        line=dict(color=line_color, width=line_width),
        marker=dict(size=8, line=dict(width=2, color=marker_color))
    )

    fig.update_layout(
        plot_bgcolor='white',
        xaxis=dict(showline=True, showgrid=False),
        yaxis=dict(showline=True, showgrid=False),
        title_font=dict(size=24),
        title_x=0.5,
    )

    fig.show()

# Visualisations

## Creating and Cleaning "Crime Title" Column

In [13]:
# Creating a new dataframe to avoid conflict
crime_data = pd.read_csv('Crime.csv', low_memory=False)

# Creating a new column called "Crime Title" by extracting the first part before "-"
crime_data["Crime Title"] = crime_data["Crime Name3"].str.split("-").str[0].str.strip()

# Re-formatting the data for better consistency
# Extracting the relevant part of the "Crime Title" using a regular expression
crime_data['Crime Title'] = crime_data['Crime Title'].str.extract(r'^(.*?)\s*\(\s*DESCRIBE OFFENSE|SPECIFY WEAPON\s*\)', expand=False).fillna(crime_data['Crime Title']).str.strip()

# Handling a specific case: Replacing a more detailed description with a simplified label
crime_data['Crime Title'] = crime_data['Crime Title'].str.replace(r'^DRIVING UNDER THE INFLUENCE.*', 'DRIVING UNDER THE INFLUENCE', regex=True)

# Displaying the first few rows of the "Crime Title" column
crime_data['Crime Title'].head()

0       DRUGS
1       DRUGS
2       DRUGS
3       DRUGS
4    BURGLARY
Name: Crime Title, dtype: object

## Visualizing Major Crimes in Montgomery with a Treemap

In [14]:
# Preparing the graph data
crime_counts = crime_data["Crime Title"].value_counts()
values = crime_counts.values
categories = pd.DataFrame(data=crime_counts.index, columns=["Crime Title"])
categories['values'] = values

# Plotting the data using a Treemap
treemap(categories,
        "Major Crimes in Montgomery",
        ["Crime Title"],
        categories['values'])

## Top 10 Major Crimes in Montgomery - Bar Chart

In [15]:
# Plotting a bar chart for the top 10 major crimes in Montgomery
bar(categories,
    categories["Crime Title"][0:10],
    categories["values"][0:10],
    categories["Crime Title"][0:10],
    "Top 10 Major Crimes in Montgomery",
    "Crime",
    'Total Reported Crimes')

## Crime Intensity in Montgomery Cities - Treemap

In [16]:
# Function template for treemap graphs
def treemap_temp(categories, title, path, values, colors):
    fig = px.treemap(categories, path=[px.Constant(" "), 'City'], values=values, color=colors, height=700,
                    title=title, color_continuous_scale='OrRd')
    fig.data[0].textinfo = 'label+text+value'
    fig.show()

# Extracting unique cities
cities = crime_data['City'].unique()

# Preparing the graph data
city_counts = crime_data["City"].value_counts()
values = city_counts.values
categories = pd.DataFrame(data=city_counts.index, columns=["City"])
categories['Reported Crimes'] = values

# Plotting the data using the treemap function template
treemap_temp(
    categories,
    title="Crime Intensity in Montgomery Cities",
    path=["City"],
    values=categories['Reported Crimes'],
    colors=categories['Reported Crimes']
)

## Analysis of Reported Crimes Over the Years (2018-2022)

In [17]:
# Convert the 'Start_Date_Time' column to a datetime object and extract the year
crime_data['Start_Date_Time'] = pd.to_datetime(crime_data['Start_Date_Time'], errors='coerce')  # Convert to datetime
crime_data['Year'] = crime_data['Start_Date_Time'].dt.year  # Extract the year

# Remove rows with missing or invalid dates (NaT)
crime_data = crime_data.dropna(subset=['Start_Date_Time', 'Year'])

# Group the data by year and count the number of crimes for each year
yearly_crime_counts = crime_data['Year'].value_counts().sort_index()

# Plotting the Data
line_plot(
    x=yearly_crime_counts.index,
    y=yearly_crime_counts.values,
    title='Total Reported Crimes Over the Years (2018-2022)',
    x_label='Year',
    y_label='Total Reported Crimes'
)

## Analysis of Specific Crimes Over Time

In [18]:
# Selecting Desired Crimes for Analysis
desired_crimes = ['LARCENY', 'DRUGS', 'ASSAULT']  # Customize to specific crimes of interest

# Extracting Year from 'Start_Date_Time' and creating a new 'Year' column
crime_data['Year'] = pd.to_datetime(crime_data['Start_Date_Time'], format='%m/%d/%Y %I:%M:%S %p').dt.year

# Extracting Crime Category and creating a new 'Category' column based on desired crimes
crime_data['Category'] = crime_data['Crime Title'].str.extract(f"({'|'.join(desired_crimes)})")

# Grouping data by Year and Crime Category, counting occurrences
grouped_data = crime_data.groupby(['Year', 'Category']).size().unstack(fill_value=0).reset_index()

# Plotting a Stacked Bar Chart
stackbar(
    grouped_data,
    x='Year',
    y=desired_crimes,
    title='Stacked Bar Chart of Specific Crimes by Year',
    xlab='Year',
    ylab='Total Reported Crimes',
    barmode='stack',
    legend=True
)

## Analysis of Monthly Reported Crimes in 2017

In [19]:
# Extracting the month from the 'Start_Date_Time' column
crime_data['Month'] = crime_data['Start_Date_Time'].dt.month  # Extract the month

# Filtering data for only the year 2017
crime_data_2017 = crime_data[crime_data['Year'] == 2017]

# Removing rows with missing or invalid dates (NaT)
crime_data_2017 = crime_data_2017.dropna(subset=['Start_Date_Time', 'Year', 'Month'])

# Grouping the data by year and month, then counting the number of crimes for each month
monthly_crime_counts = crime_data_2017.groupby(['Year', 'Month']).size().reset_index(name='Crimes')

# Mapping month numbers to month names
monthly_crime_counts['Month'] = monthly_crime_counts['Month'].apply(lambda x: calendar.month_abbr[x])

# Plotting the Data
line_plot(
    x=monthly_crime_counts['Month'],
    y=monthly_crime_counts['Crimes'],
    title='Monthly Reported Crimes in 2017',
    x_label='Month',
    y_label='Total Reported Crimes'
)

## Identifying the Date with the Most Reported Crimes

In [20]:
# Convert 'Start_Date_Time' column to datetime format
crime_data['Start_Date_Time'] = pd.to_datetime(crime_data['Start_Date_Time'], errors='coerce')

# Extract the date from 'Start_Date_Time'
crime_data['Date'] = crime_data['Start_Date_Time'].dt.date

# Find the date with the most reported crimes
most_crime_date = crime_data['Date'].value_counts().idxmax()
most_crime_count = crime_data['Date'].value_counts().max()

# Print the result
print(f"On {most_crime_date}, the most crimes were reported, with a total of {most_crime_count} crimes.")

On 2017-11-01, the most crimes were reported, with a total of 252 crimes.


## Data for the Date with the Most Reported Crimes

In [21]:
# Convert 'Start_Date_Time' column to datetime format
crime_data['Start_Date_Time'] = pd.to_datetime(crime_data['Start_Date_Time'], errors='coerce')

# Extract the date from 'Start_Date_Time'
crime_data['Date'] = crime_data['Start_Date_Time'].dt.date

# Find the date with the most reported crimes
most_crime_date = crime_data['Date'].value_counts().idxmax()

# Filter data for the date with the most crimes
most_crime_date_data = crime_data[crime_data['Date'] == most_crime_date]
most_crime_date_data.head()

Unnamed: 0,Incident ID,Offence Code,CR Number,Dispatch Date / Time,NIBRS Code,Victims,Crime Name1,Crime Name2,Crime Name3,Police District Name,...,End_Date_Time,Latitude,Longitude,Police District Number,Location,Crime Title,Year,Category,Month,Date
1373,201186287,9105,180020386,04/26/2018 09:53:31 AM,90Z,1,Other,All Other Offenses,LOST PROPERTY,GERMANTOWN,...,,39.192447,-77.240022,5D,"(39.1924, -77.24)",LOST PROPERTY,2017,,11,2017-11-01
64342,201160206,9199,170540409,11/02/2017 10:34:01 AM,90Z,1,Other,All Other Offenses,POLICE INFORMATION,BETHESDA,...,,39.036261,-77.124092,2D,"(39.0363, -77.1241)",POLICE INFORMATION,2017,,11,2017-11-01
64346,201160113,9109,170540370,,90Z,1,Other,All Other Offenses,RECOVERED PROPERTY - OTHER,SILVER SPRING,...,,38.985805,-77.025505,3D,"(38.9858, -77.0255)",RECOVERED PROPERTY,2017,,11,2017-11-01
64715,201160080,9107,170540303,11/01/2017 07:41:21 PM,90Z,1,Other,All Other Offenses,MISSING PERSON,SILVER SPRING,...,,39.072056,-76.947074,3D,"(39.0721, -76.9471)",MISSING PERSON,2017,,11,2017-11-01
64865,201163095,9107,170544107,11/21/2017 12:45:05 PM,90Z,1,Other,All Other Offenses,MISSING PERSON,SILVER SPRING,...,,38.996903,-77.007364,3D,"(38.9969, -77.0074)",MISSING PERSON,2017,,11,2017-11-01


## Hourly Distribution of Total Reported Crimes on the Date with the Most Crimes

In [22]:
# Remove rows with missing Start_Date_Time
crime_data = crime_data.dropna(subset=['Start_Date_Time'])

# Group data by hour and count the number of crimes reported each hour
hourly_crime_counts = most_crime_date_data['Start_Date_Time'].dt.hour.value_counts().sort_index()

# Create a line chart using Plotly
fig = px.line(
    x=hourly_crime_counts.index,
    y=hourly_crime_counts.values,
    labels={'x': 'Hour of Day', 'y': 'Number of Crimes'},
    title=f'Hourly Distribution of Total Reported Crimes on {most_crime_date}'
)

# Customize plot layout
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(24)),
        ticktext=[str(i) for i in range(24)]
    ),
    xaxis_title='Hour of Day',
    yaxis_title='Number of Crimes'
)

# Customize plot styling
fig.update_traces(
    mode='lines+markers',
    line=dict(color="blue", width=2),
    marker=dict(size=8, line=dict(width=2, color="darkblue"))
)

# Customize overall plot layout
fig.update_layout(
    plot_bgcolor='white',
    xaxis=dict(showline=True, showgrid=False),
    yaxis=dict(showline=True, showgrid=False),
    title_font=dict(size=24),
    title_x=0.5
)

# Show the plot
fig.show()

## Hourly Distribution of Specific Crimes on the Date with the Most Crimes

In [23]:
# Filter data for the date with the most crimes and specific crime types
specific_crimes = ['LARCENY', 'DRUGS', 'ASSAULT']  # Edit to include or remove a crime
specific_crimes_data = crime_data[(crime_data['Date'] == most_crime_date) & (crime_data['Crime Title'].isin(specific_crimes))]

# Create a histogram
fig = px.histogram(
    specific_crimes_data,
    x=specific_crimes_data['Start_Date_Time'].dt.hour,
    color='Crime Title',
    nbins=24,
    title=f'Hourly Distribution of Specific Crimes on {most_crime_date}',
    labels={'Start_Date_Time': 'Hour of Day', 'Crime Title': 'Number of Crimes'}
)
fig.update_layout(xaxis_title='Hour', yaxis_title='Number of Crimes')
fig.show()