In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the merged cancer-rate table
df = pd.read_csv('Merged_Cancer_Rates_with_FIPS_Final.csv')

# Encode Urbanicity as a 0/1 indicator: 1 where the label is exactly 'Urban', else 0.
# NOTE(review): missing or unexpected labels also become 0 - confirm that is intended.
df['Urbanicity_Numeric'] = np.where(df['Urbanicity'] == 'Urban', 1, 0)

# Pearson correlation between poverty and death rate within each state.
# Only the two columns the lambda needs are selected before .apply(), so the
# grouping column is not handed to the lambda (pandas >= 2.2 deprecates that).
state_correlations = (
    df.groupby('States')[['Persons in Poverty', 'Death Rate']]
    .apply(lambda group: group['Persons in Poverty'].corr(group['Death Rate']))
)

# Mean of the 0/1 indicator per state, i.e. the share of rows labelled 'Urban'
state_urbanicity = df.groupby('States')['Urbanicity_Numeric'].mean()

# One row per state: its poverty/death-rate correlation and its urban share
state_data = pd.DataFrame({
    'Correlation': state_correlations,
    'Urbanicity': state_urbanicity
})

# Scatter of each state's poverty/death-rate correlation against its urban share
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=state_data, x='Urbanicity', y='Correlation', ax=ax)

ax.set_title('Correlation between Poverty and Cancer Death Rates vs Urbanicity by State')
ax.set_xlabel('Urbanicity (0: Rural, 1: Urban)')
ax.set_ylabel('Correlation (Poverty vs Cancer Death Rates)')

# Label every point with its state (the DataFrame index)
point_labels = zip(state_data.index, state_data['Urbanicity'], state_data['Correlation'])
for state, urban_share, corr_value in point_labels:
    ax.annotate(state, (urban_share, corr_value), xytext=(5, 5), textcoords='offset points')

fig.tight_layout()
plt.show()

# Print the table with the highest correlation first
ranked_state_data = state_data.sort_values('Correlation', ascending=False)
print(ranked_state_data)

In [None]:

from scipy import stats


# Pearson correlation between poverty and death rate within each state.
# Only the two columns the lambda needs are selected before .apply(), so the
# grouping column is not handed to the lambda (pandas >= 2.2 deprecates that).
state_correlations = (
    df.groupby('States')[['Persons in Poverty', 'Death Rate']]
    .apply(lambda group: group['Persons in Poverty'].corr(group['Death Rate']))
)

# Identify potential mediating or confounding variables
potential_variables = [
    'Urbanicity', 'White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct',
    'AI/AN Pct', 'Unemployment Rate', 'Bachelors Degree', 'HS Education',
    'Incidence Rate', 'SES', 'Racial Minority Index'
]


def _most_common(values):
    """Return the most frequent non-missing value, or NaN when there is none.

    ``Series.mode()`` is empty for an all-missing group, so indexing it with
    ``.iloc[0]`` unconditionally would raise ``IndexError``.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Dictionary of state-indexed Series, one entry per column of the final table
state_data = {'Correlation': state_correlations}

# Summarise every candidate variable per state: mean for numeric columns,
# most common value for everything else (object, string, categorical, ...).
by_state = df.groupby('States')
for var in potential_variables:
    if pd.api.types.is_numeric_dtype(df[var]):
        state_data[var] = by_state[var].mean()
    else:
        state_data[var] = by_state[var].agg(_most_common)

# Convert to DataFrame (one row per state)
state_df = pd.DataFrame(state_data)

# Convert categorical variables to numeric.
# Urbanicity becomes 1 where the state's value is exactly 'Urban', else 0.
state_df['Urbanicity'] = np.where(state_df['Urbanicity'] == 'Urban', 1, 0)
# Values that cannot be parsed as numbers become NaN (errors='coerce').
state_df['SES'] = pd.to_numeric(state_df['SES'], errors='coerce')
state_df['Racial Minority Index'] = pd.to_numeric(state_df['Racial Minority Index'], errors='coerce')

print(state_df.head())
# "\n" puts a blank line before the heading; the previous backslash followed by
# a physical line break was a line continuation and printed no blank line.
print("\nData types:")
print(state_df.dtypes)

# Pairwise correlations between all state-level columns, computed once and
# reused for both the ranking and the heatmap below
corr_matrix = state_df.corr()

# How each variable correlates with the per-state poverty/death-rate correlation
correlations = corr_matrix['Correlation'].sort_values(ascending=False)

# "\n" puts a blank line before the heading; the previous backslash followed by
# a physical line break was a line continuation and printed no blank line.
print("\nCorrelations with Poverty-Death Rate Correlation:")
print(correlations)

# Create a heatmap of correlations
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap of State-Level Variables')
plt.tight_layout()
plt.show()

# Hispanic share vs. the per-state poverty/death-rate correlation
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=state_df, x='Hispanic Pct.', y='Correlation', ax=ax)
ax.set_title('Hispanic Population Percentage vs Poverty-Death Rate Correlation')
ax.set_xlabel('Hispanic Population Percentage')
ax.set_ylabel('Poverty-Death Rate Correlation')

# Label every point with its state (the DataFrame index)
point_labels = zip(state_df.index, state_df['Hispanic Pct.'], state_df['Correlation'])
for state, hispanic_pct, corr_value in point_labels:
    ax.annotate(state, (hispanic_pct, corr_value), xytext=(5, 5), textcoords='offset points')

fig.tight_layout()
plt.show()

In [None]:




#Find Partial Correlations

# Function to calculate partial correlation
def partial_corr(x, y, z):
    """Return the partial Pearson correlation of ``x`` and ``y`` controlling for ``z``.

    ``x`` and ``y`` are each regressed on ``z`` with a simple linear
    regression, and the Pearson correlation of the two residual vectors is
    returned.

    Observations where any of the three inputs is NaN or infinite are dropped
    first; otherwise a single missing value would propagate through the
    regressions and make the result NaN.

    Parameters
    ----------
    x, y : array-like of numbers
        The two variables whose association is measured.
    z : array-like of numbers
        The control variable; same length as ``x`` and ``y``.

    Returns
    -------
    float
        The partial correlation coefficient, or NaN when fewer than three
        complete observations remain.
    """
    x_vals = np.asarray(x, dtype=float)
    y_vals = np.asarray(y, dtype=float)
    z_vals = np.asarray(z, dtype=float)

    # Keep only rows where all three variables are usable
    complete = np.isfinite(x_vals) & np.isfinite(y_vals) & np.isfinite(z_vals)
    x_vals, y_vals, z_vals = x_vals[complete], y_vals[complete], z_vals[complete]
    if x_vals.size < 3:
        return float('nan')

    # Residuals of x after removing its linear dependence on z
    slope, intercept, _, _, _ = stats.linregress(z_vals, x_vals)
    x_resid = x_vals - (slope * z_vals + intercept)
    # Residuals of y after removing its linear dependence on z
    slope, intercept, _, _, _ = stats.linregress(z_vals, y_vals)
    y_resid = y_vals - (slope * z_vals + intercept)
    return stats.pearsonr(x_resid, y_resid)[0]

# Partial correlation between the per-state poverty/death-rate correlation and
# every other candidate variable, controlling for Hispanic Pct.
partial_correlations = {
    var: partial_corr(state_df['Correlation'], state_df[var], state_df['Hispanic Pct.'])
    for var in potential_variables
    if var != 'Hispanic Pct.'  # the control variable itself is skipped
}

# Sort and print partial correlations, largest first
partial_corr_series = pd.Series(partial_correlations).sort_values(ascending=False)
print("Partial correlations controlling for Hispanic Pct.:")
print(partial_corr_series)

# Display the first few rows of state_df.
# "\n" puts a blank line before the heading; the previous backslash followed by
# a physical line break was a line continuation and printed no blank line.
print("\nFirst few rows of state_df:")
print(state_df.head())

In [None]:

import statsmodels.api as sm

# Hispanic interactions

# Create interaction terms (element-wise products of the percentage columns)
state_df['Hispanic_Asian_Interaction'] = state_df['Hispanic Pct.'] * state_df['Asian/PI Pct']
state_df['Hispanic_White_Interaction'] = state_df['Hispanic Pct.'] * state_df['White Pct']

# Fit regression model: per-state poverty/death-rate correlation on the three
# percentage columns plus the two interaction terms, with an intercept.
X = sm.add_constant(state_df[['Hispanic Pct.', 'Asian/PI Pct', 'White Pct',
                              'Hispanic_Asian_Interaction', 'Hispanic_White_Interaction']])
y = state_df['Correlation']
# missing='drop' excludes rows with NaN in y or X; the default ('none') does no
# NaN checking, so one undefined value would break the fit.
model = sm.OLS(y, X, missing='drop').fit()

print("Regression Results:")
print(model.summary().tables[1])  # Print coefficient table

# Two side-by-side scatter plots of Hispanic Pct. vs. the per-state correlation,
# each coloured by a different second variable
panels = [
    ('Asian/PI Pct', 'Interaction: Hispanic Pct. and Asian/PI Pct.'),
    ('White Pct', 'Interaction: Hispanic Pct. and White Pct.'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(12, 5))
for ax, (hue_column, panel_title) in zip(axes, panels):
    sns.scatterplot(data=state_df, x='Hispanic Pct.', y='Correlation', hue=hue_column, ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

# Per-state correlation between each racial-composition percentage and the
# death rate (one column per racial group, one row per state)
race_columns = ['White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct']

racial_correlations = {}
for race in race_columns:
    # Only the two columns the lambda needs are selected before .apply(), so the
    # grouping column is not handed to the lambda (pandas >= 2.2 deprecates that).
    racial_correlations[race] = (
        df.groupby('States')[[race, 'Death Rate']]
        .apply(lambda group, col=race: group[col].corr(group['Death Rate']))
    )

racial_corr_df = pd.DataFrame(racial_correlations)
# "\n" puts a blank line before the heading; the previous backslash followed by
# a physical line break was a line continuation and printed no blank line.
print("\nCorrelations between racial percentages and death rates by state:")
print(racial_corr_df.describe())