# NYC Weather Data
The NYC weather data is from the National Centers for Environmental Information, https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USW00094728/detail. The data is from the weather station at `NY CITY CENTRAL PARK, NY US` and covers `1869-01-01` to `2023-02-27`. For the dates we need weather data, from 2006-01-01 to 2021-12-31, there are no null values. So there's minimal processing besides converting the date string to datetime, filtering by a time range, and selecting the columns we want.

## Data Dictionary
The data dictionary is available in the link above under `Available Data Types` in `Station Data Inventory, Access & History`. It shows that `TAVG`, `TMIN`, and `TMAX` are the average, minimum, and maximum temperature respectively.

## Purpose
The only columns I'll use from the weather data are average temperature, minimum temperature, maximum temperature, precipitation, and snowfall. I think these columns will be meaningful for examining the correlation between crime rates in NYC and the weather.

In [137]:
from os.path import join

import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
from plotly.subplots import make_subplots

from pyspark.sql.dataframe import DataFrame as PySparkDataFrame

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    count, 
    when, 
    lit,
    isnan,
    col,
    concat_ws,
    from_unixtime,
    unix_timestamp,
    to_timestamp,
    year, 
    month,
    dayofmonth,
    hour
)

# Make "iframe" the default renderer for every Plotly figure shown below.
pio.renderers.default = "iframe"

In [2]:
def read_weather_data(
    fname: str = join("..", "data", "USW00094728.csv"),
    start: str = "2006-01-01",
    end: str = "2021-12-31",
) -> pd.DataFrame:
    """Load the weather-station CSV and keep a single date window.

    Args:
        fname: Path to the station CSV. It must contain a ``DATE`` column
            plus ``TAVG``, ``TMAX``, ``TMIN``, ``PRCP`` and ``SNOW``.
        start: First date to keep (inclusive).
        end: Last date to keep (inclusive).

    Returns:
        A new frame with the columns ``DATE``, ``TAVG``, ``TMAX``, ``TMIN``,
        ``PRCP`` and ``SNOW``, limited to ``start <= DATE <= end`` and
        re-indexed from 0.
    """
    columns = ["DATE", "TAVG", "TMAX", "TMIN", "PRCP", "SNOW"]
    weather = pd.read_csv(fname, parse_dates=["DATE"])[columns]
    # A boolean mask keeps the bounds as ordinary arguments instead of
    # literals embedded in a query string.
    in_window = weather["DATE"].between(pd.Timestamp(start), pd.Timestamp(end))
    return weather.loc[in_window].reset_index(drop=True)

In [3]:
# Load the weather table from the default CSV path and display it.
df = read_weather_data()
df

Unnamed: 0,DATE,TAVG,TMAX,TMIN,PRCP,SNOW
0,2006-01-01,4.93,8.64,1.23,126.8,51.0
1,2006-02-01,2.07,5.78,-1.63,73.1,683.0
2,2006-03-01,6.15,10.33,1.98,20.3,33.0
3,2006-04-01,13.15,18.48,7.82,141.2,3.0
4,2006-05-01,17.27,21.97,12.57,117.5,0.0
...,...,...,...,...,...,...
187,2021-08-01,25.30,28.81,21.80,262.1,0.0
188,2021-09-01,21.27,24.79,17.76,254.8,0.0
189,2021-10-01,16.67,19.93,13.41,133.7,0.0
190,2021-11-01,7.91,11.54,4.29,28.6,0.0


In [4]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   DATE    192 non-null    datetime64[ns]
 1   TAVG    192 non-null    float64       
 2   TMAX    192 non-null    float64       
 3   TMIN    192 non-null    float64       
 4   PRCP    192 non-null    float64       
 5   SNOW    192 non-null    float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 9.1 KB


In [5]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,TAVG,TMAX,TMIN,PRCP,SNOW
count,192.0,192.0,192.0,192.0,192.0
mean,13.373802,17.307656,9.439583,111.375521,66.770833
std,8.664113,8.979786,8.384961,61.889651,161.756101
min,-4.41,0.08,-8.9,9.2,0.0
25%,5.4075,9.2875,1.5725,71.0,0.0
50%,13.31,17.595,9.225,102.75,0.0
75%,21.67,25.8625,17.615,138.675,34.25
max,27.41,32.27,22.97,481.3,937.0


In [6]:
# Per-column flag: True when the column holds at least one missing value.
df.isna().any()

DATE    False
TAVG    False
TMAX    False
TMIN    False
PRCP    False
SNOW    False
dtype: bool

## Crime Data

In [None]:
def initializeSpark() -> "tuple[SparkSession, SparkContext]":
    """Create (or reuse) a local Spark session named ``crime-processor``.

    Returns:
        A ``(spark, sparkContext)`` pair: the session and the context it
        exposes as ``spark.sparkContext``.
    """
    # The annotation is a string so that it describes the real (pair) return
    # value without needing SparkContext to be imported at definition time.
    conf = SparkConf().setAppName("crime-processor").setMaster("local")
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    return spark, spark.sparkContext

In [None]:
def getCrimesPerMonth(sdf: PySparkDataFrame) -> pd.DataFrame:
    """Count complaints per calendar month.

    Args:
        sdf: Spark DataFrame with a ``CMPLNT_FR`` column that Spark's
            ``year``/``month`` functions can read (presumably the complaint
            start timestamp; confirm against the parquet schema).

    Returns:
        A pandas frame with a single ``count`` column, indexed by ``Date``
        (the first day of each month) and sorted chronologically.
    """
    # Only year and month feed the aggregation, so nothing else is projected.
    # The projection is consumed exactly once, so it is not cached: a cache
    # that is never unpersisted only keeps the data pinned in Spark's storage.
    crimesPerMonth = (
        sdf
        .select(
            year("CMPLNT_FR").alias("CMPLNT_FR_YEAR"),
            month("CMPLNT_FR").alias("CMPLNT_FR_MONTH"),
        )
        .groupBy(["CMPLNT_FR_YEAR", "CMPLNT_FR_MONTH"])
        .count()
        .sort([col("CMPLNT_FR_YEAR"), col("CMPLNT_FR_MONTH")])
        .toPandas()
    )

    # Assemble the month start from its numeric parts instead of parsing
    # "M/YYYY" strings, so no date-format inference is involved.
    monthStart = pd.DataFrame(
        {
            "year": crimesPerMonth["CMPLNT_FR_YEAR"],
            "month": crimesPerMonth["CMPLNT_FR_MONTH"],
            "day": 1,
        }
    )
    crimesPerMonth["Date"] = pd.to_datetime(monthStart)
    return (
        crimesPerMonth
        .set_index("Date")
        .drop(columns=["CMPLNT_FR_YEAR", "CMPLNT_FR_MONTH"])
    )

In [None]:
# Start the Spark session; the SparkContext half of the returned pair is discarded.
spark, _ = initializeSpark()

In [None]:
# Load the complaint records from parquet and preview the first five rows.
processedSDF = spark.read.load(path=join("..", "data", "NYPD_Complaint_Data_Historic.parquet"), format="parquet")
processedSDF.limit(5).toPandas()

In [None]:
# Monthly complaint counts over every row of processedSDF.
crimesPerMonth = getCrimesPerMonth(sdf=processedSDF)

## Compare Crime Rate and Temperature

In [150]:
def compareCrimeRateAndTemperature(weatherData: pd.DataFrame, crimeData: pd.DataFrame):
    """Stack average temperature above monthly crime counts on a shared x-axis.

    Args:
        weatherData: Frame providing the ``DATE`` and ``TAVG`` columns.
        crimeData: Frame whose index supplies the x values and whose
            ``count`` column supplies the y values.

    Returns:
        The two-row Plotly figure (temperature on top, crime counts below).
    """
    temperatureTrace = go.Scatter(
        name="Average Temperature",
        mode="lines",
        x=weatherData.DATE,
        y=weatherData.TAVG,
        hovertemplate=(
            "<i>Date</i>: %{x}"
            "<br><i>Temperature</i>: %{y}°C<br>"
            "<extra></extra>"
        ),
    )
    crimeTrace = go.Scatter(
        name="Monthly Crime Rates",
        mode="lines",
        x=crimeData.index,
        y=crimeData["count"],
        hovertemplate=(
            "<i>Date</i>: %{x}"
            "<br><i>Count</i>: %{y}<br>"
            "<extra></extra>"
        ),
    )

    figure = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        subplot_titles=("Temperature", "Crime Rate"),
    )
    # Row 1 carries the temperature line, row 2 the crime counts.
    for rowNumber, trace in enumerate((temperatureTrace, crimeTrace), start=1):
        figure.add_trace(trace, row=rowNumber, col=1)

    # Only the bottom panel gets an x-axis title.
    figure.update_xaxes(title_text="Month", row=2, col=1)
    figure.update_yaxes(title_text="Temperature (°C)", row=1, col=1)
    figure.update_yaxes(title_text="Count", row=2, col=1)

    figure.update_layout(title="Temperature and Crime Rates from 2006-2021")
    return figure

In [151]:
# Plot temperature and monthly crime counts in one stacked figure.
compareCrimeRateAndTemperature(weatherData=df, crimeData=crimesPerMonth)

Here we see seasonality: it's hotter in the summer months (June, July, August) and colder in the winter months (December, January, February).

## Weather Affects Crime Rates?

## How Does Temperature Affect Crime Rates?
* How strong is the correlation? Inside vs Outside?
* Which types of crime are most affected by temperature?

We see that temperature is a poor indicator of crime rates. The r-squared value, which ranges from 0 to 1, is 0.28. This indicates that the linear regression model poorly fits the data. Perhaps we should change the scope from finding the correlation of temperature and crime rates to finding correlation of temperature and crimes that have occurred outside. 

In [36]:
# Pair each month's temperature with that month's crime count by date rather
# than by row position, so a missing or extra month in either table cannot
# shift the points against each other.
tempVsCrime = df.merge(crimesPerMonth, left_on="DATE", right_index=True)

fig = px.scatter(
    tempVsCrime,
    x="TAVG",
    y="count",
    trendline="ols",
    title="Crimes per Month vs. Temperature",
)

fig.update_layout(
    xaxis_title="Temperature (°C)",
    yaxis_title="Crime Rate",
)
fig

In [43]:
# Same figure as the preceding scatter cell (all crimes vs. temperature).
# Temperature and crime count are joined on the month's date so that each
# point is guaranteed to combine values from the same month.
tempVsCrime = df.merge(crimesPerMonth, left_on="DATE", right_index=True)

fig = px.scatter(
    tempVsCrime,
    x="TAVG",
    y="count",
    trendline="ols",
    title="Crimes per Month vs. Temperature",
)

fig.update_layout(
    xaxis_title="Temperature (°C)",
    yaxis_title="Crime Rate",
)
fig

In [41]:
# List the distinct values of LOC_OF_OCCUR_DESC.
processedSDF.select("LOC_OF_OCCUR_DESC").distinct().show()
    



+-----------------+
|LOC_OF_OCCUR_DESC|
+-----------------+
|      OPPOSITE OF|
|          REAR OF|
|             null|
|           INSIDE|
|         FRONT OF|
+-----------------+



                                                                                

In [90]:
# Distinct offence descriptions among rows whose location is not "INSIDE".
# NOTE(review): under SQL null semantics the != comparison presumably also
# drops rows with a null location; confirm if those rows should be kept.
crimes = (
    processedSDF
    .filter(col("LOC_OF_OCCUR_DESC") != "INSIDE")
    .select("OFNS_DESC")
    .distinct()
    .toPandas()
)



In [63]:
# Distinct premise types among rows whose location is recorded and is not
# "INSIDE". The two conditions must be AND-ed: with OR, every row with a
# non-null location (including "INSIDE") passes, so only nulls are removed.
places = (
    processedSDF
    .filter(
        col("LOC_OF_OCCUR_DESC").isNotNull()
        & (col("LOC_OF_OCCUR_DESC") != "INSIDE")
    )
    .select("PREM_TYP_DESC")
    .distinct()
    .toPandas()
)

                                                                                

In [70]:
# Monthly complaint counts for rows whose location is not "INSIDE".
# NOTE: despite its name, `insideCrimes` therefore holds the NON-inside
# complaints; the name is kept because the following cells refer to it.
insideCrimes = (
    processedSDF
    .select(
        "LOC_OF_OCCUR_DESC",
        year("CMPLNT_FR").alias("CMPLNT_FR_YEAR"),
        month("CMPLNT_FR").alias("CMPLNT_FR_MONTH"),
    )
    .filter(col("LOC_OF_OCCUR_DESC") != "INSIDE")
    .groupBy(["CMPLNT_FR_YEAR", "CMPLNT_FR_MONTH"])
    .count()
    .sort([col("CMPLNT_FR_YEAR"), col("CMPLNT_FR_MONTH")])
    .toPandas()
)

                                                                                

In [71]:
# Build "M/YYYY" strings, parse them into a "Date" column, then make that
# column the index and discard the separate year/month columns.
insideCrimes["Date"] = pd.to_datetime(
    insideCrimes.CMPLNT_FR_MONTH.map(str) + "/" + insideCrimes.CMPLNT_FR_YEAR.map(str)
)
insideCrimes = (
    insideCrimes
    .set_index("Date")
    .drop(columns=["CMPLNT_FR_YEAR", "CMPLNT_FR_MONTH"])
)

In [72]:
# `insideCrimes` holds the monthly counts of complaints whose location is not
# "INSIDE". Join them to the weather table on the month's date so every point
# pairs a temperature with the count of that same month, not merely the same
# row position.
tempVsNotInside = df.merge(insideCrimes, left_on="DATE", right_index=True)

fig = px.scatter(
    tempVsNotInside,
    x="TAVG",
    y="count",
    trendline="ols",
    title="Crimes per Month vs. Temperature",
)

fig.update_layout(
    xaxis_title="Temperature (°C)",
    yaxis_title="Crime Rate",
)
fig

In [73]:
import statsmodels.formula.api as smf


In [78]:
# Preview the weather table re-indexed by a "Date" column (result is not stored).
df.rename(columns={"DATE": "Date"}).set_index("Date")

Unnamed: 0_level_0,TAVG,TMAX,TMIN,PRCP,SNOW
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-01-01,4.93,8.64,1.23,126.8,51.0
2006-02-01,2.07,5.78,-1.63,73.1,683.0
2006-03-01,6.15,10.33,1.98,20.3,33.0
2006-04-01,13.15,18.48,7.82,141.2,3.0
2006-05-01,17.27,21.97,12.57,117.5,0.0
...,...,...,...,...,...
2021-08-01,25.30,28.81,21.80,262.1,0.0
2021-09-01,21.27,24.79,17.76,254.8,0.0
2021-10-01,16.67,19.93,13.41,133.7,0.0
2021-11-01,7.91,11.54,4.29,28.6,0.0


In [129]:
# Collects (offence description, R²) pairs for the ranking cell below.
output = []

# Cached projection shared by the per-offence analysis. LOC_OF_OCCUR_DESC is
# kept in it because the location filter below reads that column.
selectColumns = (
    processedSDF
    .select(
        "OFNS_DESC",
        "LOC_OF_OCCUR_DESC",
        year("CMPLNT_FR").alias("CMPLNT_FR_YEAR"),
        month("CMPLNT_FR").alias("CMPLNT_FR_MONTH"),
    )
    .cache()
)

# One Spark aggregation covering every offence, instead of one job per offence.
monthlyByOffence = (
    selectColumns
    .filter(col("LOC_OF_OCCUR_DESC") != "INSIDE")
    .groupBy(["OFNS_DESC", "CMPLNT_FR_YEAR", "CMPLNT_FR_MONTH"])
    .count()
    .toPandas()
)

# Assemble the month start from numeric year/month rather than parsing strings.
monthlyByOffence["Date"] = pd.to_datetime(
    pd.DataFrame(
        {
            "year": monthlyByOffence["CMPLNT_FR_YEAR"],
            "month": monthlyByOffence["CMPLNT_FR_MONTH"],
            "day": 1,
        }
    )
)

# Offence description -> its monthly counts, indexed and sorted by date.
countsByOffence = {
    offence: monthly.set_index("Date")[["count"]].sort_index()
    for offence, monthly in monthlyByOffence.groupby("OFNS_DESC")
}

# The weather table is loop-invariant, so it is re-indexed only once.
weatherByDate = df.rename(columns={"DATE": "Date"}).set_index("Date")

for crime in crimes["OFNS_DESC"]:
    print(crime)
    crimeSpecific = countsByOffence.get(crime)
    if crimeSpecific is None:
        # No non-"INSIDE" rows for this offence (or the description is null).
        continue

    # Renaming avoids using the bare name `count` inside the model formula.
    merged = (
        crimeSpecific
        .merge(weatherByDate, left_index=True, right_index=True)
        .rename(columns={"count": "crime_count"})
    )
    if len(merged) < 3:
        # A line always fits one or two points exactly, so R² would say nothing.
        continue

    # Crime count is the response and temperature the predictor. With a single
    # predictor R² is the same in either direction, so the ranking is unchanged.
    model = smf.ols(formula="crime_count ~ TAVG", data=merged).fit()
    output.append((crime, model.rsquared))

23/03/07 15:06:08 WARN CacheManager: Asked to cache already cached data.
OFFENSES AGAINST PUBLIC ORDER/ADMINISTRATION


                                                                                

OTHER TRAFFIC INFRACTION
ANTICIPATORY OFFENSES
FELONY SEX CRIMES
NEW YORK CITY HEALTH CODE
VEHICLE AND TRAFFIC LAWS
KIDNAPPING & RELATED OFFENSES
HOMICIDE-NEGLIGENT-VEHICLE
PETIT LARCENY OF MOTOR VEHICLE
FELONY ASSAULT
ALCOHOLIC BEVERAGE CONTROL LAW
OFFENSES RELATED TO CHILDREN
THEFT-FRAUD
THEFT OF SERVICES
JOSTLING
MISCELLANEOUS PENAL LAW
CRIMINAL MISCHIEF
ARSON
OFFENSES AGAINST THE PERSON
GAMBLING
ENDANGERING WELFARE OF INCOMPETENT
SEX CRIMES
CRIMINAL TRESPASS
BURGLAR'S TOOLS
NYS LAWS-UNCLASSIFIED VIOLATION
CHILD ABANDONMENT/NON SUPPORT
OFFENSES INVOLVING FRAUD
INTOXICATED/IMPAIRED DRIVING
HOMICIDE-NEGLIGENT
LOITERING/DEVIATE SEX
INTOXICATED & IMPAIRED DRIVING
ESCAPE
AGRICULTURE & MARKETS LAW
FRAUDS
FORTUNE TELLING
NYS LAWS-UNCLASSIFIED FELONY
UNLAWFUL POSSESSION OF WEAPON ON SCHOOL
DANGEROUS DRUGS
PROSTITUTION & RELATED OFFENSES
ASSAULT & RELATED OFFENSES
FORGERY
FRAUDULENT ACCOSTING
OTHER OFFENSES RELATED TO THEFT
LOITERING
ROBBERY
GRAND LARCENY OF MOTOR VEHICLE
DANGEROUS WEAPONS
U

In [130]:
# Rank the (offence, R²) pairs from highest to lowest R².
sorted(output, key=lambda x: x[1], reverse=True)

[('FELONY ASSAULT', 0.48253519643883436),
 ('ASSAULT & RELATED OFFENSES', 0.4650660193920091),
 ('PETIT LARCENY', 0.44064932157025616),
 ('OFFENSES AGAINST THE PERSON', 0.41374865862507637),
 ('GAMBLING', 0.3867934810164907),
 ('HARRASSMENT', 0.32310071670682594),
 ('CRIMINAL MISCHIEF', 0.23610076975893513),
 ('MISCELLANEOUS PENAL LAW', 0.21382361767327818),
 ('PETIT LARCENY OF MOTOR VEHICLE', 0.18222193403052367),
 ('UNAUTHORIZED USE OF A VEHICLE', 0.18166235065619774),
 ('OFFENSES AGAINST PUBLIC SAFETY', 0.17582010316032182),
 ('RAPE', 0.16358297076878547),
 ('GRAND LARCENY', 0.1553253079589657),
 ('OTHER TRAFFIC INFRACTION', 0.13830814727992946),
 ('SEX CRIMES', 0.12834574194751813),
 ('LOITERING/DEVIATE SEX', 0.11097295874372015),
 ('CRIMINAL TRESPASS', 0.10454097281253616),
 ('VEHICLE AND TRAFFIC LAWS', 0.10204362519981713),
 ('ADMINISTRATIVE CODE', 0.09933185160460578),
 ('GRAND LARCENY OF MOTOR VEHICLE', 0.0800475198725995),
 ('ADMINISTRATIVE CODES', 0.0703010382522532),
 ('THEF

In [127]:
# Monthly counts of "ASSAULT & RELATED OFFENSES" complaints whose location is
# not "INSIDE". The filter is applied to processedSDF directly so that it does
# not depend on which columns the cached projection happens to keep.
assaultDF = (
    processedSDF
    .filter(
        (col("LOC_OF_OCCUR_DESC") != "INSIDE")
        & (col("OFNS_DESC") == "ASSAULT & RELATED OFFENSES")
    )
    .select(
        year("CMPLNT_FR").alias("CMPLNT_FR_YEAR"),
        month("CMPLNT_FR").alias("CMPLNT_FR_MONTH"),
    )
    .groupBy(["CMPLNT_FR_YEAR", "CMPLNT_FR_MONTH"])
    .count()
    .sort([col("CMPLNT_FR_YEAR"), col("CMPLNT_FR_MONTH")])
    .toPandas()
)

# Assemble the month start from numeric year/month rather than parsing
# "M/YYYY" strings, then index by it and drop the helper columns.
assaultDF["Date"] = pd.to_datetime(
    pd.DataFrame(
        {
            "year": assaultDF["CMPLNT_FR_YEAR"],
            "month": assaultDF["CMPLNT_FR_MONTH"],
            "day": 1,
        }
    )
)
assaultDF = (
    assaultDF
    .set_index("Date")
    .drop(columns=["CMPLNT_FR_YEAR", "CMPLNT_FR_MONTH"])
)

                                                                                

In [128]:
# Join the monthly assault counts to the weather table on the month's date so
# each point pairs a temperature with the count of that same month; pairing by
# row position only works while both tables cover exactly the same months.
tempVsAssault = df.merge(assaultDF, left_on="DATE", right_index=True)

fig = px.scatter(
    tempVsAssault,
    x="TAVG",
    y="count",
    trendline="ols",
    title="Crimes per Month vs. Temperature",
)

fig.update_layout(
    xaxis_title="Temperature (°C)",
    yaxis_title="Crime Rate",
)
fig

In [None]:
# Display the monthly assault counts.
assaultDF