In [4]:
import pandas as pd
import pathlib as pl

# Load the merged table from the sibling clean_data directory into a DataFrame.
merged_data_path = pl.Path('..') / 'clean_data' / 'merged_data.csv'
unemployment_crime = pd.read_csv(merged_data_path)

In [None]:
# Write the in-memory frame back to the same CSV path it was loaded from,
# omitting the row index.
# NOTE(review): despite the original "column for total offenses" label, nothing in
# this cell adds a column; `Total_Crimes` is only created further down the notebook.
# NOTE(review): this overwrites the notebook's own input file. If it runs after
# `Total_Crimes` has been added, a later reload would see an extra trailing column,
# which matters to any code that selects columns by position — confirm whether this
# cell should be kept or should write to a separate output file.
unemployment_crime.to_csv(pl.Path('..','clean_data', 'merged_data.csv'), index=False)

In [5]:
from scipy.stats import pearsonr
import numpy as np


# Pearson correlation between each crime category and the unemployment rate,
# pooled over every row of the merged dataset.

# Crime categories are every column after the two leading identifier columns
# (state_abbr and data_year), minus the non-crime columns listed below.
# Those are removed by NAME rather than with a trailing positional slice
# (`[2:-1]`): once `Total_Crimes` has been appended to the frame, a positional
# slice would silently drop `Total_Crimes` and start treating
# `Unemployment_Rate` as a crime category when this cell is re-run.
# errors='ignore' keeps this working on a fresh kernel, where `Total_Crimes`
# does not exist yet.
non_crime_columns = ['Unemployment_Rate', 'Total_Crimes']
crime_columns = unemployment_crime.columns[2:].drop(non_crime_columns, errors='ignore')

# Map each crime category to its correlation coefficient and p-value
results = {}
for crime in crime_columns:
    r, p = pearsonr(unemployment_crime[crime], unemployment_crime['Unemployment_Rate'])
    results[crime] = {'r_value': r, 'p_value': p}

# Transpose so there is one row per crime category with r_value / p_value columns
results_df = pd.DataFrame(results).T

# Display the results
results_df

Unnamed: 0,r_value,p_value
Aggravated Assault,0.087868,0.063734
All Other Offenses (Except Traffic),-0.028448,0.54902
Arson,0.080976,0.087614
Burglary,0.097828,0.038908
Curfew and Loitering Law Violations,0.057901,0.222322
Disorderly Conduct,0.062119,0.19038
Driving Under the Influence,0.0685,0.148666
Drug Abuse Violations - Grand Total,0.038398,0.418555
Drunkenness,0.039483,0.405511
Embezzlement,-0.02827,0.551525


In [6]:
# Row-wise total: add up every crime-category column for each record
# (one record per state and year) and store it as a new column.
row_totals = unemployment_crime[crime_columns].sum(axis=1)
unemployment_crime['Total_Crimes'] = row_totals

# Correlate the per-row totals with the unemployment rate
total_crimes = unemployment_crime['Total_Crimes']
unemployment_rate = unemployment_crime['Unemployment_Rate']
r_total, p_total = pearsonr(total_crimes, unemployment_rate)

# Show (r, p) for the total
r_total, p_total

(0.03506244131607798, 0.46013611311618124)

In [8]:
# Per-state Pearson correlations of every crime category (plus the total)
# against the unemployment rate.
columns_to_test = crime_columns.tolist() + ['Total_Crimes']

# One sub-frame per state
grouped_data = unemployment_crime.groupby('state_abbr')

# Nested mapping: state -> crime -> {'r_value': ..., 'p_value': ...}
state_results = {}
for state, group in grouped_data:
    per_crime = {}
    for crime in columns_to_test:
        r, p = pearsonr(group[crime], group['Unemployment_Rate'])
        per_crime[crime] = {'r_value': r, 'p_value': p}
    state_results[state] = per_crime

# Flatten to one [state, crime, r, p] row per (state, crime) pair,
# in the same order the nested mapping was filled.
flat_data = [
    [state, crime, stats['r_value'], stats['p_value']]
    for state, crimes in state_results.items()
    for crime, stats in crimes.items()
]

# Long-format table with plain columns (no multi-index)
state_results_df = pd.DataFrame(flat_data, columns=['State', 'Crime', 'r_value', 'p_value'])

# Preview the first 50 rows
state_results_df.head(50)



Unnamed: 0,State,Crime,r_value,p_value
0,AK,Aggravated Assault,-0.130217,0.738445
1,AK,All Other Offenses (Except Traffic),0.053327,0.89162
2,AK,Arson,0.094516,0.808882
3,AK,Burglary,-0.200222,0.605493
4,AK,Curfew and Loitering Law Violations,-0.487109,0.183539
5,AK,Disorderly Conduct,0.053758,0.890749
6,AK,Driving Under the Influence,-0.294236,0.442165
7,AK,Drug Abuse Violations - Grand Total,-0.081031,0.835826
8,AK,Drunkenness,-0.13274,0.733523
9,AK,Embezzlement,-0.19903,0.607686
