In [2]:
import pandas as pd
import pathlib as pl

# Load the merged table from the shared clean_data folder.
# Later cells read 'state_abbr', 'Unemployment_Rate' and the crime columns from it.
merged_csv_path = pl.Path('..') / 'clean_data' / 'merged_data.csv'
unemployment_crime = pd.read_csv(merged_csv_path)

In [3]:
# Write the frame back to merged_data.csv without the row index.
# NOTE(review): no "total offenses" column is added in this cell before saving.
merged_output_path = pl.Path('..') / 'clean_data' / 'merged_data.csv'
unemployment_crime.to_csv(merged_output_path, index=False)

In [4]:
from scipy.stats import pearsonr
import numpy as np


# Pearson correlation between every crime column and the unemployment rate,
# computed over all rows of unemployment_crime at once.

# Select the crime columns by name instead of by position. A positional slice
# (columns[2:-1]) changes meaning as soon as another cell appends a column such
# as 'Total_Crimes', so re-running this cell would correlate the wrong columns.
non_crime_columns = ['state_abbr', 'data_year', 'Unemployment_Rate', 'Total_Crimes']
all_columns = unemployment_crime.columns
# Boolean masking keeps this a pandas Index (later cells call .tolist() on it)
# and preserves the original column order.
crime_columns = all_columns[~all_columns.isin(non_crime_columns)]

# Map each crime name to its correlation coefficient (r) and p-value
results = {}
for crime in crime_columns:
    r, p = pearsonr(unemployment_crime[crime], unemployment_crime['Unemployment_Rate'])
    results[crime] = {'r_value': r, 'p_value': p}

# Transpose so there is one row per crime with columns r_value / p_value
results_df = pd.DataFrame(results).T

# Save first (crime names are the index, so the index is written), then leave
# the frame as the cell's last expression so it is actually displayed.
results_df.to_csv(pl.Path('..','clean_data', 'crime_unemployment_correlation_results.csv'), index=True)
results_df

Unnamed: 0,r_value,p_value
Aggravated Assault,0.087868,0.063734
All Other Offenses (Except Traffic),-0.028448,0.54902
Arson,0.080976,0.087614
Burglary,0.097828,0.038908
Curfew and Loitering Law Violations,0.057901,0.222322
Disorderly Conduct,0.062119,0.19038
Driving Under the Influence,0.0685,0.148666
Drug Abuse Violations - Grand Total,0.038398,0.418555
Drunkenness,0.039483,0.405511
Embezzlement,-0.02827,0.551525


In [6]:
# Row-wise total over all crime columns: one 'Total_Crimes' value per row
unemployment_crime['Total_Crimes'] = unemployment_crime.loc[:, crime_columns].sum(axis='columns')

# Pearson r and p-value of the totals against the unemployment rate
total_correlation = pearsonr(
    unemployment_crime['Total_Crimes'],
    unemployment_crime['Unemployment_Rate'],
)
r_total, p_total = total_correlation

# Show both numbers as the cell output
r_total, p_total

(0.03506244131607798, 0.46013611311618124)

In [8]:
# Per-state Pearson correlation between each crime column (plus 'Total_Crimes')
# and the unemployment rate. Results are kept both as a nested dict
# (state -> crime -> {'r_value', 'p_value'}) and as a flat DataFrame.
state_results = {}

# Group data by state
grouped_data = unemployment_crime.groupby('state_abbr')

# Columns tested for every state
crimes_to_test = crime_columns.tolist() + ['Total_Crimes']

for state, group in grouped_data:
    state_results[state] = {}
    for crime in crimes_to_test:
        if len(group) < 2:
            # pearsonr needs at least two observations; record missing values
            # instead of aborting the whole loop on a sparse state.
            r, p = float('nan'), float('nan')
        else:
            r, p = pearsonr(group[crime], group['Unemployment_Rate'])
        state_results[state][crime] = {'r_value': r, 'p_value': p}

# Flatten the nested dict into one [state, crime, r, p] record per pair
flat_data = [
    [state_key, crime_key, stats['r_value'], stats['p_value']]
    for state_key, crimes in state_results.items()
    for crime_key, stats in crimes.items()
]

# Create a DataFrame from the flat data
state_results_df = pd.DataFrame(flat_data, columns=['State', 'Crime', 'r_value', 'p_value'])

# Save the full table, then show the first 50 rows as the cell output
# (the preview must be the last expression to be displayed).
state_results_df.to_csv(pl.Path('..','clean_data', 'state_crime_unemployment_correlation_results.csv'), index=False)
state_results_df.head(50)



Unnamed: 0,State,Crime,r_value,p_value
0,AK,Aggravated Assault,-0.130217,0.738445
1,AK,All Other Offenses (Except Traffic),0.053327,0.89162
2,AK,Arson,0.094516,0.808882
3,AK,Burglary,-0.200222,0.605493
4,AK,Curfew and Loitering Law Violations,-0.487109,0.183539
5,AK,Disorderly Conduct,0.053758,0.890749
6,AK,Driving Under the Influence,-0.294236,0.442165
7,AK,Drug Abuse Violations - Grand Total,-0.081031,0.835826
8,AK,Drunkenness,-0.13274,0.733523
9,AK,Embezzlement,-0.19903,0.607686


In [6]:
# Read the population table and the national crime counts
population_data = pd.read_csv(pl.Path('..') / 'clean_data' / 'us_population.csv')
crime_data = pd.read_csv(pl.Path('..') / 'clean_data' / 'national_crime_by_year.csv')

# Reshape population to long form: every non-'Date' column name becomes a
# 'state_abbr' value, with the cell value stored under 'Population'
population_data_long = population_data.melt(id_vars=['Date'], var_name='state_abbr', value_name='Population')

# Make 'Date' numeric so it can be matched against crime_data's 'data_year'
population_data_long['Date'] = pd.to_numeric(population_data_long['Date'])

# Left-join population onto the crime rows by state and year, then discard the
# duplicated year column. Crime rows without a matching population keep NaN.
merged_data = pd.merge(
    crime_data,
    population_data_long,
    how='left',
    left_on=['state_abbr', 'data_year'],
    right_on=['state_abbr', 'Date'],
).drop(columns='Date')

# Every column after the first two is treated as a crime count
# (assumes the first two columns are 'state_abbr' and 'data_year')
crime_types = crime_data.columns[2:]
for crime in crime_types:
    # Offenses per 100,000 residents
    per_capita = merged_data[crime].div(merged_data['Population'])
    merged_data[f'{crime}_rate'] = per_capita.mul(100000)

# Preview the result to verify the new *_rate columns
merged_data.head()

# Note: You might want to adjust the path and save the resulting dataframe to a new CSV file or use it for further analysis.
# merged_data.to_csv('path_to_save/modified_crime_data.csv', index=False)

Unnamed: 0,state_abbr,data_year,Aggravated Assault,All Other Offenses (Except Traffic),Arson,Burglary,Curfew and Loitering Law Violations,Disorderly Conduct,Driving Under the Influence,Drug Abuse Violations - Grand Total,...,Prostitution and Commercialized Vice_rate,Rape_rate,Robbery_rate,Simple Assault_rate,"Stolen Property: Buying, Receiving, Possessing_rate",Suspicion_rate,Vagrancy_rate,Vandalism_rate,"Weapons: Carrying, Possessing, Etc._rate","Sex Offenses (Except Rape, and Prostitution and Commercialized Vice)_rate"
0,AL,2004,3926,78939,121,3498,59,4834,13582,18315,...,,,,,,,,,,
1,AL,2005,3389,68569,95,2932,76,4675,11621,15704,...,,,,,,,,,,
2,AL,2006,2971,62179,95,2843,30,3966,10536,14814,...,,,,,,,,,,
3,AL,2007,3807,77383,107,3379,55,5014,12857,17308,...,,,,,,,,,,
4,AL,2008,4019,87077,141,4275,37,5028,15900,17817,...,,,,,,,,,,


In [13]:
# Keep every population row except those whose 'Date' equals 2023
filtered_population_data = population_data.loc[population_data['Date'].ne(2023)]

# Reshape to long form (one row per Date/state pair) and rename 'Date' to
# 'data_year' so the column name matches the crime data
population_data_long_filtered = (
    filtered_population_data
    .melt(id_vars=['Date'], var_name='state_abbr', value_name='Population')
    .rename(columns={'Date': 'data_year'})
)

# Preview the first 20 rows of the reshaped table
population_data_long_filtered.head(20)



Unnamed: 0,data_year,state_abbr,Population
0,2014,AL,4843737
1,2015,AL,4854803
2,2016,AL,4866824
3,2017,AL,4877989
4,2018,AL,4891628
5,2019,AL,4907965
6,2020,AL,5031864
7,2021,AL,5050380
8,2022,AL,5073903
9,2014,AK,737075


In [14]:
# NOTE(review): in the disabled line below, `index=False` sits inside the `pl.Path(...)` call rather than
# in `to_csv(...)`, so as written it is not passed to `to_csv`. A later cell's slice (`columns[2:-2]`)
# mentions an 'Unnamed: 0' column, which is presumably the saved index - confirm before moving the argument.
# population_data_long_filtered.to_csv(pl.Path('..','clean_data', 'population_data_long_formatted.csv', index=False))

In [20]:
# Rebuild the crime + population table restricted to 2014 onwards and save it.
# NOTE(review): this overwrites merged_data.csv, the same file the first cell of
# this notebook loads and then expects to contain 'Unemployment_Rate'. No step in
# this cell adds that column - confirm the intended output file before relying on
# a fresh Restart & Run All.

# Load the national crime data and keep 2014 and later (drops 2004-2013)
new_crime_data = pd.read_csv(pl.Path('..','clean_data', 'national_crime_by_year.csv'))
new_crime_data = new_crime_data[new_crime_data['data_year'] >= 2014]

# Load the long-format population data and apply the same year filter
population_data_long_formatted = pd.read_csv(pl.Path('..','clean_data', 'population_data_long_formatted.csv'))
population_data_long_formatted = population_data_long_formatted[population_data_long_formatted['data_year'] >= 2014]

# Left-join population onto the crime rows; crime rows without a matching
# (state_abbr, data_year) in the population data keep NaN in the added columns
merged_data = pd.merge(new_crime_data, population_data_long_formatted, on=['state_abbr', 'data_year'], how='left')

# Save the filtered and merged data back to a CSV
merged_data.to_csv(pl.Path('..','clean_data', 'merged_data.csv'), index=False)

# Leave the preview as the last expression so the notebook renders it as a
# table instead of wrapped plain text from print()
merged_data.head()

  state_abbr  data_year  Aggravated Assault  \
0         AL       2014                 707   
1         AL       2015                4346   
2         AL       2016                4887   
3         AL       2017                4804   
4         AL       2018                4649   

   All Other Offenses (Except Traffic)  Arson  Burglary  \
0                                12402     30       601   
1                                64046    115      3317   
2                                73292    115      3302   
3                                80304    132      3468   
4                                84007    131      3183   

   Curfew and Loitering Law Violations  Disorderly Conduct  \
0                                    0                 497   
1                                    0                2416   
2                                    0                2346   
3                                    0                2305   
4                                    0              

In [22]:
# Add a '<crime>_rate' column (offenses per 100,000 residents) for every crime
# count column of merged_data, then save the result as crime_rates.csv.

# Select the crime columns by name instead of the positional slice columns[2:-2].
# Because this cell appends columns to merged_data, a positional slice would, on
# a second run, pick up 'Unnamed: 0', 'Population' and the existing *_rate
# columns. Excluding by name keeps the cell safe to re-run.
non_crime_columns = ['state_abbr', 'data_year', 'Unnamed: 0', 'Population']
all_columns = merged_data.columns
is_rate_column = all_columns.str.endswith('_rate')
crime_types = all_columns[~all_columns.isin(non_crime_columns) & ~is_rate_column]

# Calculate and append the crime rates to the DataFrame
for crime in crime_types:
    rate_column_name = f'{crime}_rate'
    merged_data[rate_column_name] = (merged_data[crime] / merged_data['Population']) * 100000

# Save, then leave the preview as the last expression so it is displayed
merged_data.to_csv(pl.Path('..','clean_data', 'crime_rates.csv'), index=False)
merged_data.head()

In [28]:
# Per-state Pearson correlation between every crime-rate column and the
# unemployment rate.

# Load the crime rates data
crime_rates_df = pd.read_csv(pl.Path('..','clean_data', 'crime_rates.csv'))

# Load the unemployment rates data
unemployment_df = pd.read_csv(pl.Path('..','clean_data','unemployment_year_state.csv'))

# Small previews of both inputs, kept as variables for inspection
crime_rates_df_head = crime_rates_df.head()
unemployment_df_head = unemployment_df.head()

# Transform the unemployment data to long format: every non-'State' column name
# becomes a 'data_year' value (cast to int so it can be merged on)
unemployment_long_df = unemployment_df.melt(id_vars=['State'], var_name='data_year', value_name='Unemployment_Rate')
unemployment_long_df['data_year'] = unemployment_long_df['data_year'].astype(int)

# Map state names to abbreviations in the unemployment data.
# Names missing from this mapping become NaN and therefore drop out of the merge.
state_abbreviations = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}
unemployment_long_df['State'] = unemployment_long_df['State'].map(state_abbreviations)

# Merge the datasets on state and year (inner join: unmatched rows are dropped)
merged_df = pd.merge(crime_rates_df, unemployment_long_df, left_on=['state_abbr', 'data_year'], right_on=['State', 'data_year'])

# Drop every row that has a NaN in any column, so the correlations below only
# ever see complete rows (no per-column null check is needed afterwards)
cleaned_merged_df = merged_df.dropna()

# Crime-rate columns are the ones whose name ends in '_rate'
rate_columns = [col for col in crime_rates_df.columns if col.endswith('_rate')]

# One record per (state, crime-rate column)
all_crimes_correlation_results = []

# sort=False keeps the states in order of first appearance
for state, state_data in cleaned_merged_df.groupby('state_abbr', sort=False):
    for crime_rate_column in rate_columns:
        if len(state_data) < 2:
            # pearsonr needs at least two observations; record missing values
            # instead of aborting on a state with too few complete rows.
            r, p = float('nan'), float('nan')
        else:
            # Calculate Pearson correlation coefficient and p-value
            r, p = pearsonr(state_data[crime_rate_column], state_data['Unemployment_Rate'])
        all_crimes_correlation_results.append({
            'State': state,
            'Crime_Type': crime_rate_column,
            'Pearson_r': r,
            'p_value': p
        })

# Convert the results into a DataFrame
all_crimes_correlation_df = pd.DataFrame(all_crimes_correlation_results)

# Display the first 50 rows of the correlation results
all_crimes_correlation_df.head(50)



Unnamed: 0,State,Crime_Type,Pearson_r,p_value
0,AL,Aggravated Assault_rate,-0.233952,0.544604
1,AL,All Other Offenses (Except Traffic)_rate,-0.502144,0.168351
2,AL,Arson_rate,-0.230379,0.550942
3,AL,Burglary_rate,-0.077677,0.842552
4,AL,Curfew and Loitering Law Violations_rate,,
5,AL,Disorderly Conduct_rate,-0.630141,0.068906
6,AL,Driving Under the Influence_rate,-0.140706,0.718044
7,AL,Drug Abuse Violations - Grand Total_rate,-0.774677,0.014229
8,AL,Drunkenness_rate,0.232189,0.547727
9,AL,Embezzlement_rate,0.183867,0.635821


In [29]:
# Persist the per-state, per-crime correlation table (row index omitted)
correlation_csv_path = pl.Path('..') / 'clean_data' / 'all_crimes_correlation_results.csv'
all_crimes_correlation_df.to_csv(correlation_csv_path, index=False)