In [None]:
# Initial EDA

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the updated cancer rates table used by every plot in this cell
cancer_rates = pd.read_csv('Cancer Rates Updated.csv')

# 1-2. Histograms (with a KDE overlay) of the two rate columns.
# Each entry is (column, plot title, x-axis label, output file).
for rate_col, hist_title, hist_xlabel, hist_file in [
    ('Death Rate', 'Distribution of Death Rates',
     'Death Rate (per 100,000)', 'death_rate_distribution.png'),
    ('Incidence Rate', 'Distribution of Incidence Rates',
     'Incidence Rate (per 100,000)', 'incidence_rate_distribution.png'),
]:
    plt.figure(figsize=(12, 6))
    sns.histplot(cancer_rates[rate_col], kde=True)
    plt.title(hist_title)
    plt.xlabel(hist_xlabel)
    plt.ylabel('Count')
    plt.savefig(hist_file)
    plt.close()

# 3. Death rate against incidence rate, one point per row
plt.figure(figsize=(12, 8))
sns.scatterplot(data=cancer_rates, x='Incidence Rate', y='Death Rate', alpha=0.6)
plt.title('Death Rate vs Incidence Rate')
plt.xlabel('Incidence Rate (per 100,000)')
plt.ylabel('Death Rate (per 100,000)')
plt.savefig('death_vs_incidence_scatter.png')
plt.close()

# 4. Spread of death rates within each Urbanicity label
plt.figure(figsize=(10, 6))
sns.boxplot(data=cancer_rates, x='Urbanicity', y='Death Rate')
plt.title('Death Rates by Urbanicity')
plt.xlabel('Urbanicity')
plt.ylabel('Death Rate (per 100,000)')
plt.savefig('death_rate_by_urbanicity.png')
plt.close()

# 5. The ten states whose mean death rate is largest
state_mean_death_rate = cancer_rates.groupby('States')['Death Rate'].mean()
top_10_death_rates = state_mean_death_rate.nlargest(10)
plt.figure(figsize=(12, 6))
top_10_death_rates.plot.bar()
plt.title('Top 10 States with Highest Average Death Rates')
plt.xlabel('State')
plt.ylabel('Average Death Rate (per 100,000)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('top_10_states_death_rates.png')
plt.close()

# 6. Correlations between the current rates and their 5-year trends
rate_and_trend_columns = [
    'Death Rate',
    'Incidence Rate',
    '5 Year Trend in Death Rates',
    '5 Year Trend in Incidence Rate',
]
correlation_matrix = cancer_rates[rate_and_trend_columns].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')
plt.close()

print("EDA graphs have been created and saved as PNG files.")

# NOTE: the section headers below use a real "\n" escape. The previous
# backslash + line-break form is a line continuation inside the string, so the
# intended blank line before each header was silently dropped.

# Descriptive statistics for every numeric column of the table
summary_stats = cancer_rates.describe()
print("\nSummary Statistics:")
print(summary_stats)

# Pairwise correlations between the current rates and their 5-year trends
correlations = cancer_rates[['Death Rate', 'Incidence Rate', '5 Year Trend in Death Rates', '5 Year Trend in Incidence Rate']].corr()
print("\nCorrelation Matrix:")
print(correlations)

# Number of rows per Urbanicity label
urbanicity_counts = cancer_rates['Urbanicity'].value_counts()
print("\nCount of Counties by Urbanicity:")
print(urbanicity_counts)

# The five rows with the largest Death Rate
top_5_death_rates = cancer_rates.nlargest(5, 'Death Rate')[['County', 'States', 'Death Rate']]
print("\nTop 5 Counties with Highest Death Rates:")
print(top_5_death_rates)

# The five rows with the largest Incidence Rate
top_5_incidence_rates = cancer_rates.nlargest(5, 'Incidence Rate')[['County', 'States', 'Incidence Rate']]
print("\nTop 5 Counties with Highest Incidence Rates:")
print(top_5_incidence_rates)









In [None]:
#EDA PART II

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Read the merged dataset
df = pd.read_csv('Merged_Cancer_Rates_with_FIPS_Final.csv')

# Select relevant columns for analysis
numeric_columns = ['Death Rate', 'Incidence Rate', 'White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct',
                   'Bachelors Degree', 'HS Education', 'Unemployment Rate', 'Persons in Poverty', 'SES']

# Coerce every analysis column to numeric. Anything that cannot be parsed
# (e.g. the 'data not available' placeholder in SES) becomes NaN, so no
# separate replace step is needed for SES.
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Report how many values per column are missing after coercion
nan_counts = df[numeric_columns].isna().sum()
print("NaN counts after cleaning:")
print(nan_counts)

# Keep only rows that are complete across all analysis columns
df_clean = df.dropna(subset=numeric_columns)
print(f"\nRows removed: {len(df) - len(df_clean)}")

# Create a correlation matrix
corr_matrix = df_clean[numeric_columns].corr()

# Plot correlation heatmap.
# Saved under its own name: the Initial EDA cell already writes
# 'correlation_heatmap.png', and reusing that name here overwrote it.
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap: Demographics vs Cancer Rates')
plt.tight_layout()
plt.savefig('demographics_correlation_heatmap.png')
plt.close()

# Function to create scatter plots
def create_scatter_plot(x, y, xlabel, ylabel):
    """Scatter two columns of the module-level ``df_clean`` and save a PNG.

    Parameters
    ----------
    x, y : str
        Column names in ``df_clean`` for the horizontal and vertical axes.
    xlabel, ylabel : str
        Axis labels; also combined into the title as '<ylabel> vs <xlabel>'.

    The figure is written to '<y>_vs_<x>.png', where each column name is
    lower-cased and its spaces are replaced by underscores. The figure is
    closed afterwards and nothing is returned.
    """
    y_slug = y.lower().replace(" ", "_")
    x_slug = x.lower().replace(" ", "_")

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df_clean[x], df_clean[y], alpha=0.5)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(f'{ylabel} vs {xlabel}')
    fig.tight_layout()
    fig.savefig(f'{y_slug}_vs_{x_slug}.png')
    plt.close(fig)

# Create scatter plots for some interesting relationships
create_scatter_plot('SES', 'Death Rate', 'Socioeconomic Status', 'Death Rate')
create_scatter_plot('Bachelors Degree', 'Incidence Rate', 'Bachelor\'s Degree (%)', 'Incidence Rate')
create_scatter_plot('Persons in Poverty', 'Death Rate', 'Persons in Poverty (%)', 'Death Rate')

# NOTE: the headers below use a real "\n" escape. The previous backslash +
# line-break form was a string continuation and dropped the blank line.

# Mean and standard deviation of both rates within each Urbanicity label
urbanicity_stats = df_clean.groupby('Urbanicity')[['Death Rate', 'Incidence Rate']].agg(['mean', 'std'])
print("\nCancer Rates by Urbanicity:")
print(urbanicity_stats)

# Two-sample t-test of Death Rate between rows labelled 'Urban' and 'Rural'.
# assumes Urbanicity uses exactly these two labels - TODO confirm against the data
urban_death_rate = df_clean.loc[df_clean['Urbanicity'] == 'Urban', 'Death Rate']
rural_death_rate = df_clean.loc[df_clean['Urbanicity'] == 'Rural', 'Death Rate']
if len(urban_death_rate) < 2 or len(rural_death_rate) < 2:
    # Without at least two observations per group the test is undefined;
    # report that explicitly instead of printing NaN statistics.
    t_stat, p_value = np.nan, np.nan
    print("\nt-test for Death Rate (Urban vs Rural): skipped, "
          f"group sizes are {len(urban_death_rate)} and {len(rural_death_rate)}")
else:
    t_stat, p_value = stats.ttest_ind(urban_death_rate, rural_death_rate)
    print(f"\nt-test for Death Rate (Urban vs Rural): t-statistic = {t_stat:.4f}, p-value = {p_value:.4f}")

# Correlations of every analysis column with the two rates.
# The matrix is computed once and both columns are read from it.
demographic_corr = df_clean[numeric_columns].corr()
correlations_death = demographic_corr['Death Rate'].sort_values(ascending=False)
correlations_incidence = demographic_corr['Incidence Rate'].sort_values(ascending=False)

print("\nTop correlations with Death Rate:")
print(correlations_death)

print("\nTop correlations with Incidence Rate:")
print(correlations_incidence)

# Summary statistics
summary_stats = df_clean[numeric_columns].describe()
print("\nSummary Statistics:")
print(summary_stats)

print("\nEDA analysis complete. Visualizations have been saved as PNG files.")


#Adding Trend lines
# Rebuild the complete-case frame used by the trend-line plots below
df_clean = df.dropna(subset=numeric_columns)

# Function to create scatter plots with trend lines
def create_scatter_plot(x, y, xlabel, ylabel, hue=None):
    """Scatter two ``df_clean`` columns with a fitted trend line and save a PNG.

    Parameters
    ----------
    x, y : str
        Column names in the module-level ``df_clean`` frame.
    xlabel, ylabel : str
        Axis labels; also combined into the title as '<ylabel> vs <xlabel>'.
    hue : str, optional
        Column used to colour the points. When given, a legend titled with
        the column name is drawn.

    The figure is written to '<y>_vs_<x>.png' and then closed. Column names
    are lower-cased and spaces become underscores, as before. Path separators
    are additionally replaced with underscores so that a column such as
    'Asian/PI Pct' no longer produces a path into a non-existent directory.
    File names for columns without a separator are unchanged.
    """
    def _slug(column_name):
        # Lower-case, then make the name safe to use as a single file name part
        return column_name.lower().replace(" ", "_").replace("/", "_").replace("\\", "_")

    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df_clean, x=x, y=y, hue=hue, alpha=0.6)
    # scatter=False: draw only the regression line on top of the points above
    sns.regplot(data=df_clean, x=x, y=y, scatter=False, color='red')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(f'{ylabel} vs {xlabel}')
    if hue:
        plt.legend(title=hue)
    plt.tight_layout()
    plt.savefig(f'{_slug(y)}_vs_{_slug(x)}.png')
    plt.close()

# 'Families Below Poverty' and 'Poverty Below 150%' are not in this cell's
# numeric_columns, so they were never passed through pd.to_numeric. Coerce them
# here (unparseable entries become NaN) so the trend-line fit receives numbers.
extra_poverty_columns = ['Families Below Poverty', 'Poverty Below 150%']
df_clean = df_clean.assign(
    **{col: pd.to_numeric(df_clean[col], errors='coerce') for col in extra_poverty_columns}
)

# Create the requested plots
create_scatter_plot('Bachelors Degree', 'Death Rate', 'Bachelor\'s Degree (%)', 'Death Rate')
create_scatter_plot('HS Education', 'Death Rate', 'High School Education (%)', 'Death Rate')
create_scatter_plot('White Pct', 'Death Rate', 'White Population (%)', 'Death Rate')
create_scatter_plot('Black Pct.', 'Death Rate', 'Black Population (%)', 'Death Rate')
create_scatter_plot('Hispanic Pct.', 'Death Rate', 'Hispanic Population (%)', 'Death Rate')
create_scatter_plot('Families Below Poverty', 'Death Rate', 'Families Below Poverty (%)', 'Death Rate')
create_scatter_plot('Poverty Below 150%', 'Death Rate', 'Population Below 150% Poverty Line (%)', 'Death Rate')
create_scatter_plot('SES', 'Death Rate', 'Socioeconomic Status', 'Death Rate')

# Create box plots for cancer rates vs Urbanicity
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_clean, x='Urbanicity', y='Death Rate')
plt.title('Death Rate vs Urbanicity')
plt.savefig('death_rate_vs_urbanicity_boxplot.png')
plt.close()

plt.figure(figsize=(12, 8))
sns.boxplot(data=df_clean, x='Urbanicity', y='Incidence Rate')
plt.title('Incidence Rate vs Urbanicity')
plt.savefig('incidence_rate_vs_urbanicity_boxplot.png')
plt.close()

print("All plots have been created and saved as PNG files.")




In [None]:
#Poverty, Education, Race and Cancer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the merged dataset
df = pd.read_csv('Merged_Cancer_Rates_with_FIPS_Final.csv')

# Columns that must be numeric for the analysis in this cell
numeric_columns = ['Death Rate', 'Incidence Rate', 'White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct',
                   'Bachelors Degree', 'HS Education', 'Unemployment Rate', 'Persons in Poverty', 'SES', 'Poverty Below 150%', 'Families Below Poverty']

# Unparseable entries become NaN and are removed by the dropna below
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# .copy() makes df_clean an independent frame: new category columns are
# assigned to it further down, which otherwise triggers pandas'
# SettingWithCopyWarning on the dropna result.
df_clean = df.dropna(subset=numeric_columns).copy()

# Create a function to categorize poverty levels
def categorize_poverty(poverty_rate):
    """Bucket a poverty rate into 'Low', 'Medium' or 'High'.

    Rates below 10 are 'Low', rates from 10 up to (not including) 20 are
    'Medium', and everything else is 'High'.
    """
    # Walk the upper bounds in ascending order; the first one exceeded wins
    for upper_bound, label in ((10, 'Low'), (20, 'Medium')):
        if poverty_rate < upper_bound:
            return label
    return 'High'

# Add poverty category to the dataframe.
# assign() returns a new frame instead of writing into the dropna result,
# which avoids pandas' SettingWithCopyWarning.
df_clean = df_clean.assign(
    **{'Poverty Category': df_clean['Persons in Poverty'].apply(categorize_poverty)}
)

# Create a function to categorize education levels
def categorize_education(education_rate):
    """Bucket an education rate into 'Low', 'Medium' or 'High'.

    Rates below 20 are 'Low', rates from 20 up to (not including) 30 are
    'Medium', and everything else is 'High'.
    """
    # Walk the upper bounds in ascending order; the first one exceeded wins
    for upper_bound, label in ((20, 'Low'), (30, 'Medium')):
        if education_rate < upper_bound:
            return label
    return 'High'

# Add education category to the dataframe.
# assign() returns a new frame instead of writing into the dropna result,
# which avoids pandas' SettingWithCopyWarning.
df_clean = df_clean.assign(
    **{'Education Category': df_clean['Bachelors Degree'].apply(categorize_education)}
)

# Create scatter plots for each race, with poverty and education represented
races = ['White Pct', 'Black Pct.', 'Hispanic Pct.']
race_names = ['White', 'Black', 'Hispanic']

# Fixed level order so colours and markers mean the same thing in every panel
# and the legend reads Low -> Medium -> High. Without it seaborn orders the
# levels by first appearance in the data.
category_order = ['Low', 'Medium', 'High']

plt.figure(figsize=(20, 15))

# Three panels laid out on a 2x2 grid (the fourth slot stays empty)
for i, (race, race_name) in enumerate(zip(races, race_names)):
    plt.subplot(2, 2, i+1)
    sns.scatterplot(data=df_clean, x=race, y='Death Rate',
                    hue='Poverty Category', style='Education Category',
                    hue_order=category_order, style_order=category_order,
                    palette='viridis', s=50, alpha=0.7)

    plt.title(f'Death Rate vs {race_name} Population %')
    plt.xlabel(f'{race_name} Population %')
    plt.ylabel('Death Rate')
    # Anchor the legend outside the axes so it does not cover the points
    plt.legend(title='Poverty / Education', bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust layout and save the plot
plt.tight_layout()
plt.savefig('race_poverty_education_death_rate.png', dpi=300, bbox_inches='tight')
plt.close()

# Mean death rate and race shares for each poverty/education combination
summary = df_clean.groupby(['Poverty Category', 'Education Category']).agg({
    'Death Rate': 'mean',
    'White Pct': 'mean',
    'Black Pct.': 'mean',
    'Hispanic Pct.': 'mean'
}).round(2)

print("Summary Table:")
print(summary)

# Save summary to CSV.
# The messages below use a real "\n" escape; the previous backslash +
# line-break form was a string continuation and dropped the blank line.
summary.to_csv('race_poverty_education_summary.csv')
print("\nSummary table saved to 'race_poverty_education_summary.csv'")

print("\nVisualization has been created and saved as 'race_poverty_education_death_rate.png'")



#Improving the charts
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Read the merged dataset
df = pd.read_csv('Merged_Cancer_Rates_with_FIPS_Final.csv')

# Columns that must be numeric for the analysis below
numeric_columns = ['Death Rate', 'Incidence Rate', 'White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct',
                   'Bachelors Degree', 'HS Education', 'Unemployment Rate', 'Persons in Poverty', 'SES', 'Poverty Below 150%', 'Families Below Poverty']

# Unparseable entries become NaN and are removed by the dropna below
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# .copy() so the category columns added below are written to an independent
# frame (avoids pandas' SettingWithCopyWarning on the dropna result).
df_clean = df.dropna(subset=numeric_columns).copy()

# Create categories.
# Left-closed bins with open-ended outer edges reproduce the thresholds of
# categorize_poverty / categorize_education defined earlier in this cell:
#   poverty:   < 10 -> Low, [10, 20) -> Medium, >= 20 -> High
#   education: < 20 -> Low, [20, 30) -> Medium, >= 30 -> High
# The previous right-closed bins starting at 0 put boundary values (10, 20, 30)
# in the lower bucket and left a value of exactly 0 uncategorised.
df_clean['Poverty Category'] = pd.cut(df_clean['Persons in Poverty'], bins=[-np.inf, 10, 20, np.inf], right=False,
                                      labels=['Low Poverty', 'Medium Poverty', 'High Poverty'])
df_clean['Education Category'] = pd.cut(df_clean['Bachelors Degree'], bins=[-np.inf, 20, 30, np.inf], right=False,
                                        labels=['Low Education', 'Medium Education', 'High Education'])

# Function to create heatmap
def create_heatmap(data, title, filename, cbar_label='Percentage'):
    """Draw an annotated heatmap of a poverty-by-education table and save it.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to plot; rows are labelled 'Poverty Level' and columns
        'Education Level' on the axes.
    title : str
        Figure title.
    filename : str
        Output path passed to ``plt.savefig``.
    cbar_label : str, optional
        Colour-bar label. Defaults to 'Percentage', the label that was
        previously hard-coded, so existing calls are unaffected.
    """
    plt.figure(figsize=(12, 8))
    sns.heatmap(data, annot=True, cmap='YlOrRd', fmt='.2f', cbar_kws={'label': cbar_label})
    plt.title(title, fontsize=16)
    plt.xlabel('Education Level', fontsize=12)
    plt.ylabel('Poverty Level', fontsize=12)
    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()

# Create heatmaps for each race
races = ['White Pct', 'Black Pct.', 'Hispanic Pct.']
race_names = ['White', 'Black', 'Hispanic']

# Both grouping keys are categoricals; observed=False is spelled out so every
# poverty/education combination keeps a cell (the behaviour relied on so far)
# and pandas does not warn about the upcoming change of default.
category_keys = ['Poverty Category', 'Education Category']

for race, race_name in zip(races, race_names):
    summary = df_clean.groupby(category_keys, observed=False)[race].mean().unstack()
    create_heatmap(summary, f'{race_name} Population Percentage by Poverty and Education Levels', f'{race_name.lower()}_population_heatmap.png')

# Create death rate heatmap. These cells hold mean death rates, not
# percentages, so the colour bar gets its own label.
death_rate_summary = df_clean.groupby(category_keys, observed=False)['Death Rate'].mean().unstack()
create_heatmap(death_rate_summary, 'Cancer Death Rates by Poverty and Education Levels', 'death_rate_heatmap.png',
               cbar_label='Mean Death Rate')

print("Heatmaps have been created for each race and saved as PNG files.")
print("The death rate heatmap has been updated and saved as 'death_rate_heatmap.png'.")

# Display summary statistics ("\n" restored: the backslash + line-break form
# was a string continuation and dropped the blank line).
print("\nSummary Statistics:")
summary_stats = df_clean.groupby(category_keys, observed=False).agg({
    'Death Rate': 'mean',
    'White Pct': 'mean',
    'Black Pct.': 'mean',
    'Hispanic Pct.': 'mean'
}).round(2)
print(summary_stats)






In [None]:
#State Correlations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the merged county-level table
df = pd.read_csv('Merged_Cancer_Rates_with_FIPS_Final.csv')

# Columns the state-level correlations depend on
numeric_columns = [
    'Death Rate', 'Incidence Rate',
    'White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct',
    'Persons in Poverty',
]

# Coerce them all at once; values that cannot be parsed become NaN
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Keep only rows that have every analysis value and a state label
df_clean = df.dropna(subset=[*numeric_columns, 'States'])

# Function to calculate correlations for a state
def state_correlations(state_df):
    """Correlate race shares, poverty and cancer rates within one state.

    ``state_df`` holds the rows of a single state. The returned Series has,
    for each race column, the entries '<race>_poverty', '<race>_death' and
    '<race>_incidence', followed by 'poverty_death' and 'poverty_incidence'.
    """
    race_columns = ['White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct']
    # (label suffix, column correlated against each race column)
    race_targets = (
        ('poverty', 'Persons in Poverty'),
        ('death', 'Death Rate'),
        ('incidence', 'Incidence Rate'),
    )

    # (result label, first column, second column) in output order
    column_pairs = [
        (f'{race}_{suffix}', race, target)
        for race in race_columns
        for suffix, target in race_targets
    ]
    column_pairs.append(('poverty_death', 'Persons in Poverty', 'Death Rate'))
    column_pairs.append(('poverty_incidence', 'Persons in Poverty', 'Incidence Rate'))

    return pd.Series({
        label: state_df[first].corr(state_df[second])
        for label, first, second in column_pairs
    })

# Group by state and calculate correlations for states with at least 10 counties.
# Only the analysis columns are handed to apply(), so the helper no longer
# operates on the grouping column itself (deprecated in recent pandas).
# NOTE: this rebinds `state_correlations` from the helper function to the
# resulting DataFrame; the name is kept because the statements below read it.
states_with_enough_rows = df_clean.groupby('States').filter(lambda x: len(x) >= 10)
state_correlations = states_with_enough_rows.groupby('States')[numeric_columns].apply(state_correlations)

# Sort states by the poverty/death-rate correlation, largest first.
# NOTE(review): the sort is on the signed value, so the tail holds the most
# negative correlations rather than the ones closest to zero.
state_correlations_sorted = state_correlations.sort_values('poverty_death', ascending=False)

# Display the top 10 states with the strongest poverty-death rate correlation
print("Top 10 states with strongest poverty-death rate correlation:")
print(state_correlations_sorted['poverty_death'].head(10))

# Display the bottom 10 states with the weakest poverty-death rate correlation.
# ("\n" restored below: the backslash + line-break form was a string
# continuation and dropped the blank line.)
print("\nBottom 10 states with weakest poverty-death rate correlation:")
print(state_correlations_sorted['poverty_death'].tail(10))

# Calculate average correlations across all states
avg_correlations = state_correlations.mean()
print("\nAverage correlations across all states:")
print(avg_correlations)

# Create a heatmap of average correlations (a single row, one cell per pair)
plt.figure(figsize=(12, 10))
sns.heatmap(avg_correlations.to_frame().T, annot=True, cmap='coolwarm', center=0)
plt.title('Average Correlations Across All States')
plt.tight_layout()
plt.savefig('average_correlations_heatmap.png')
plt.close()

print("\nVisualization plot has been saved as 'average_correlations_heatmap.png'")

# Correlate the per-state correlation columns with one another
state_level_corr = state_correlations.corr()
print("\nCorrelation between state-level correlations:")
print(state_level_corr['poverty_death'].sort_values(ascending=False))

In [None]:
#States and Urbanicity
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv('Merged_Cancer_Rates_with_FIPS_Final.csv')

# Convert relevant columns to numeric, handling any non-numeric values
numeric_columns = ['Death Rate', 'Incidence Rate', 'White Pct', 'Black Pct.', 'Hispanic Pct.',
                   'Asian/PI Pct', 'AI/AN Pct', 'Persons in Poverty']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Encode the 'Urbanicity' column as integer category codes.
# pandas encodes a missing category as -1, which dropna() does not treat as
# missing; map it to NaN so those rows are really removed below instead of
# entering the correlations as a fake "-1" level.
# NOTE(review): codes follow the sorted order of the labels, so correlations
# with them are only meaningful if that order is meaningful - confirm.
urbanicity_codes = pd.Series(pd.Categorical(df['Urbanicity']).codes, index=df.index)
df['Urbanicity_Encoded'] = urbanicity_codes.astype(float).where(urbanicity_codes >= 0)

# Drop rows with missing values in the columns we're interested in
df_clean = df.dropna(subset=numeric_columns + ['States', 'Urbanicity_Encoded'])

# Function to calculate correlations for a state
def state_correlations(state_df):
    """Correlate race shares, poverty, urbanicity and cancer rates in one state.

    ``state_df`` holds the rows of a single state. The returned Series has,
    for each race column, the entries '<race>_poverty', '<race>_death' and
    '<race>_incidence', followed by 'poverty_death', 'poverty_incidence',
    'urbanicity_death' and 'urbanicity_incidence'.
    """
    race_columns = ['White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct']
    # (label suffix, column correlated against each race column)
    race_targets = (
        ('poverty', 'Persons in Poverty'),
        ('death', 'Death Rate'),
        ('incidence', 'Incidence Rate'),
    )

    # (result label, first column, second column) in output order
    column_pairs = [
        (f'{race}_{suffix}', race, target)
        for race in race_columns
        for suffix, target in race_targets
    ]
    column_pairs.extend([
        ('poverty_death', 'Persons in Poverty', 'Death Rate'),
        ('poverty_incidence', 'Persons in Poverty', 'Incidence Rate'),
        ('urbanicity_death', 'Urbanicity_Encoded', 'Death Rate'),
        ('urbanicity_incidence', 'Urbanicity_Encoded', 'Incidence Rate'),
    ])

    return pd.Series({
        label: state_df[first].corr(state_df[second])
        for label, first, second in column_pairs
    })

# Group by state and calculate correlations for states with at least 10 counties.
# Only the analysis columns are handed to apply(), so the helper no longer
# operates on the grouping column itself (deprecated in recent pandas).
# NOTE: this rebinds `state_correlations` from the helper function to the
# resulting DataFrame; the name is kept because the statements below read it.
correlation_input_columns = numeric_columns + ['Urbanicity_Encoded']
states_with_enough_rows = df_clean.groupby('States').filter(lambda x: len(x) >= 10)
state_correlations = states_with_enough_rows.groupby('States')[correlation_input_columns].apply(state_correlations)

# Sort states by the (signed) poverty/death-rate correlation, largest first
state_correlations_sorted = state_correlations.sort_values('poverty_death', ascending=False)

# "\n" restored in the messages below: the backslash + line-break form was a
# string continuation and dropped the blank line.
print("Data preparation and correlation calculation completed successfully.")
print("\nShape of state_correlations DataFrame:")
print(state_correlations.shape)
print("\nFirst few rows of state_correlations:")
print(state_correlations.head())

# Now let's create our visualizations
plt.figure(figsize=(15, 12))
sns.heatmap(state_correlations_sorted.iloc[:10], annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1)
plt.title('Correlations for Top 10 States (Sorted by Poverty-Death Rate Correlation)')
plt.tight_layout()
plt.savefig('top_10_states_heatmap.png')
plt.close()

plt.figure(figsize=(15, 12))
sns.heatmap(state_correlations_sorted.iloc[-10:], annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1)
plt.title('Correlations for Bottom 10 States (Sorted by Poverty-Death Rate Correlation)')
plt.tight_layout()
plt.savefig('bottom_10_states_heatmap.png')
plt.close()

# One labelled point per state: poverty/death vs urbanicity/death correlation
plt.figure(figsize=(12, 8))
plt.scatter(state_correlations['poverty_death'], state_correlations['urbanicity_death'])
# Iterate labels and values together. The previous `series[i]` lookup used
# positional integers on a Series indexed by state name, which pandas has
# deprecated in favour of label-based lookup.
for state, poverty_corr_value, urbanicity_corr_value in zip(
        state_correlations.index,
        state_correlations['poverty_death'],
        state_correlations['urbanicity_death']):
    plt.annotate(state, (poverty_corr_value, urbanicity_corr_value))
plt.xlabel('Poverty-Death Rate Correlation')
plt.ylabel('Urbanicity-Death Rate Correlation')
plt.title('Poverty vs Urbanicity Correlations with Death Rate by State')
plt.tight_layout()
plt.savefig('poverty_urbanicity_correlations.png')
plt.close()

plt.figure(figsize=(15, 10))
state_correlations_sorted['poverty_death'].plot(kind='bar')
plt.title('Poverty-Death Rate Correlation by State')
plt.xlabel('State')
plt.ylabel('Correlation Coefficient')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('poverty_death_correlation_by_state.png')
plt.close()

print("Visualization plots have been saved as 'top_10_states_heatmap.png', 'bottom_10_states_heatmap.png', 'poverty_urbanicity_correlations.png', and 'poverty_death_correlation_by_state.png'")

# Display summary statistics
print("\nSummary statistics of correlations across all states:")
print(state_correlations.describe())

# Correlate the per-state correlation columns with one another
state_level_corr = state_correlations.corr()
print("\nCorrelation between poverty-death rate and other correlations:")
print(state_level_corr['poverty_death'].sort_values(ascending=False))







In [None]:
# SURVIVAL RATE 

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv('Merged_Cancer_Rates_with_Survival_Rate.csv')

# Correlation between Survival Rate and Poverty.
# NOTE(review): unlike the other cells, no pd.to_numeric coercion is applied
# here; presumably this file is already numeric - confirm.
poverty_corr = df['Survival Rate'].corr(df['Persons in Poverty'])

plt.figure(figsize=(10, 6))
sns.scatterplot(x='Persons in Poverty', y='Survival Rate', data=df)
plt.title(f'Survival Rate vs Poverty (Correlation: {poverty_corr:.2f})')
plt.xlabel('Persons in Poverty (%)')
plt.ylabel('Survival Rate (%)')
plt.savefig('survival_rate_vs_poverty.png')
plt.close()

print(f"Correlation between Survival Rate and Poverty: {poverty_corr:.4f}")

# The headers below use a real "\n" escape. The previous backslash +
# line-break form was a string continuation and dropped the blank line.

# Summary statistics for Survival Rate
print("\nSummary statistics for Survival Rate:")
print(df['Survival Rate'].describe())

# Top 5 counties with highest survival rates
print("\nTop 5 counties with highest survival rates:")
print(df.nlargest(5, 'Survival Rate')[['County', 'States', 'Survival Rate']])

# Bottom 5 counties with lowest survival rates
print("\nBottom 5 counties with lowest survival rates:")
print(df.nsmallest(5, 'Survival Rate')[['County', 'States', 'Survival Rate']])

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

# Load the data
survival_csv_path = 'Merged_Cancer_Rates_with_Survival_Rate.csv'
df = pd.read_csv(survival_csv_path)

# Print file information.
# "File size" now reports the size of the CSV on disk. It previously printed
# df.memory_usage(), i.e. the in-memory footprint of the DataFrame, under
# that label.
print("File Information:")
print(f"Filename: {survival_csv_path}")
print(f"File size: {os.path.getsize(survival_csv_path) / 1024 / 1024:.2f} MB")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

# Calculate average survival rate by state, highest first
state_survival = df.groupby('States')['Survival Rate'].mean().sort_values(ascending=False)

# Top 5 and Bottom 5 states ("\n" restored: the backslash + line-break form
# was a string continuation and dropped the blank line).
print("\nTop 5 States by Average Survival Rate:")
print(state_survival.head())
print("\nBottom 5 States by Average Survival Rate:")
print(state_survival.tail())

# Correlation between Survival Rate and Education (Bachelor's Degree)
education_corr = df['Survival Rate'].corr(df['Bachelors Degree'])

# Scatter plot for Survival Rate vs Education
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Bachelors Degree', y='Survival Rate', data=df)
plt.title(f'Survival Rate vs Education (Correlation: {education_corr:.2f})')
plt.xlabel('Bachelor\'s Degree (%)')
plt.ylabel('Survival Rate (%)')
plt.savefig('survival_rate_vs_education.png')
plt.close()

print(f"\nCorrelation between Survival Rate and Education (Bachelor's Degree): {education_corr:.4f}")

# Correlation between Survival Rate and Racial Demographics.
# The self-correlation entry is dropped so only the race columns remain.
race_columns = ['White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct']
race_correlations = df[race_columns + ['Survival Rate']].corr()['Survival Rate'].drop('Survival Rate')

print("\nCorrelation between Survival Rate and Racial Demographics:")
print(race_correlations)

# Bar plot for racial correlations
plt.figure(figsize=(10, 6))
race_correlations.plot(kind='bar')
plt.title('Correlation between Survival Rate and Race Percentages')
plt.xlabel('Race')
plt.ylabel('Correlation Coefficient')
plt.savefig('survival_rate_race_correlation.png')
plt.close()

print("\nVisualization plots have been saved as 'survival_rate_vs_education.png' and 'survival_rate_race_correlation.png'")


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv('Merged_Cancer_Rates_with_Survival_Rate.csv')

# 1. Chart for Survival Rate vs HS Graduate
hs_corr = df['Survival Rate'].corr(df['HS Education'])

plt.figure(figsize=(10, 6))
sns.scatterplot(x='HS Education', y='Survival Rate', data=df)
plt.title(f'Survival Rate vs High School Education (Correlation: {hs_corr:.2f})')
plt.xlabel('High School Graduate (%)')
plt.ylabel('Survival Rate (%)')
plt.savefig('survival_rate_vs_hs_education.png')
plt.close()

print(f"Correlation between Survival Rate and High School Education: {hs_corr:.4f}")

# 2. Top 5 and Bottom 5 States by Average Survival Rate
state_survival = df.groupby('States')['Survival Rate'].mean().sort_values(ascending=False)
top_5_states = state_survival.head()
bottom_5_states = state_survival.tail()


def _plot_state_survival_bars(state_means, title, filename):
    """Save a bar chart of average survival rate per state with value labels.

    ``state_means`` is a Series indexed by state name. The y-axis is fixed to
    50-75, as in the original charts.
    """
    plt.figure(figsize=(12, 6))
    # The states are also passed as `hue`: seaborn deprecates `palette`
    # without `hue`. dodge=False keeps each bar centred on its tick.
    ax = sns.barplot(x=state_means.index, y=state_means.values,
                     hue=state_means.index, palette='viridis', dodge=False)
    # Some seaborn versions add a legend for the redundant hue; remove it so
    # the chart looks the same as before.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    plt.title(title)
    plt.xlabel('State')
    plt.ylabel('Average Survival Rate (%)')
    plt.ylim(50, 75)  # Adjust y-axis for better visualization
    # Write each bar's value just above it
    for i, v in enumerate(state_means.values):
        ax.text(i, v, f'{v:.2f}', ha='center', va='bottom')
    plt.savefig(filename)
    plt.close()


_plot_state_survival_bars(top_5_states, 'Top 5 States by Average Survival Rate',
                          'top_5_states_survival_rate.png')
_plot_state_survival_bars(bottom_5_states, 'Bottom 5 States by Average Survival Rate',
                          'bottom_5_states_survival_rate.png')

# "\n" restored below: the backslash + line-break form was a string
# continuation and dropped the blank line.
print("\nTop 5 States by Average Survival Rate:")
print(top_5_states)
print("\nBottom 5 States by Average Survival Rate:")
print(bottom_5_states)

print("\nVisualization plots have been saved as 'survival_rate_vs_hs_education.png', 'top_5_states_survival_rate.png', and 'bottom_5_states_survival_rate.png'")

In [None]:
#Survival PROXY
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Read the merged county-level table
df = pd.read_csv('Merged_Cancer_Rates_with_FIPS_Final.csv')

# Columns needed for the survival-proxy analysis
numeric_columns = [
    'Death Rate', 'Incidence Rate',
    'White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct',
    'Persons in Poverty', 'Bachelors Degree',
]

# Coerce them all at once; values that cannot be parsed become NaN
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Survival proxy: incidence rate minus death rate
df['Survival_Proxy'] = df['Incidence Rate'].sub(df['Death Rate'])

# Function to calculate correlations for a state
def state_correlations(state_df):
    """Correlate each socioeconomic/race column with the survival proxy.

    ``state_df`` holds the rows of a single state. The returned Series has
    one '<column>_survival' entry per factor column, in the order listed.
    """
    proxy = state_df['Survival_Proxy']
    factor_columns = (
        'Persons in Poverty', 'Bachelors Degree',
        'White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct',
    )
    return pd.Series({
        f'{column}_survival': state_df[column].corr(proxy)
        for column in factor_columns
    })

# Group by state and calculate correlations for states with at least 10 counties.
# Only the columns the helper reads are handed to apply(), so it no longer
# operates on the grouping column itself (deprecated in recent pandas).
# NOTE: this rebinds `state_correlations` from the helper function to the
# resulting DataFrame; the name is kept because the statements below read it.
survival_factor_columns = ['Persons in Poverty', 'Bachelors Degree', 'White Pct', 'Black Pct.',
                           'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct', 'Survival_Proxy']
states_with_enough_rows = df.groupby('States').filter(lambda x: len(x) >= 10)
state_correlations = states_with_enough_rows.groupby('States')[survival_factor_columns].apply(state_correlations)

# Calculate average survival proxy for each state, highest first
state_avg_survival = df.groupby('States')['Survival_Proxy'].mean().sort_values(ascending=False)

# Create a heatmap of correlations
plt.figure(figsize=(12, 10))
sns.heatmap(state_correlations, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1)
plt.title('Correlations between Survival Proxy and Socioeconomic Factors by State')
plt.tight_layout()
plt.savefig('survival_correlations_heatmap.png')
plt.close()

# Create a bar plot of average survival proxy by state
plt.figure(figsize=(15, 10))
state_avg_survival.plot(kind='bar')
plt.title('Average Survival Proxy by State')
plt.xlabel('State')
plt.ylabel('Survival Proxy (Incidence Rate - Death Rate)')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('avg_survival_by_state.png')
plt.close()

# Scatter plot of poverty vs survival proxy.
# plt.scatter pairs x and y by position, so the poverty means (which come out
# of groupby ordered by state name) are reindexed to the value-sorted order of
# state_avg_survival. Previously the two were in different orders, so each
# point combined one state's poverty with another state's survival proxy.
state_avg_poverty = df.groupby('States')['Persons in Poverty'].mean().reindex(state_avg_survival.index)
plt.figure(figsize=(12, 8))
plt.scatter(state_avg_poverty, state_avg_survival)
for state, poverty_value, survival_value in zip(state_avg_survival.index, state_avg_poverty, state_avg_survival):
    plt.annotate(state, (poverty_value, survival_value))
plt.xlabel('Average Poverty Rate')
plt.ylabel('Average Survival Proxy')
plt.title('Poverty Rate vs Survival Proxy by State')
plt.tight_layout()
plt.savefig('poverty_vs_survival.png')
plt.close()

print("Visualization plots have been saved as 'survival_correlations_heatmap.png', 'avg_survival_by_state.png', and 'poverty_vs_survival.png'")

# Display summary statistics ("\n" restored in the headers below: the
# backslash + line-break form was a string continuation and dropped the
# blank line).
print("\nSummary statistics of survival proxy across all counties:")
print(df['Survival_Proxy'].describe())

print("\nTop 10 states with highest average survival proxy:")
print(state_avg_survival.head(10))

print("\nBottom 10 states with lowest average survival proxy:")
print(state_avg_survival.tail(10))

print("\nCorrelations between survival proxy and socioeconomic factors (averaged across all states):")
print(state_correlations.mean().sort_values(ascending=False))

# Calculate overall correlations (not state-specific)
overall_correlations = df[['Survival_Proxy', 'Persons in Poverty', 'Bachelors Degree', 'White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct']].corr()['Survival_Proxy'].sort_values(ascending=False)

print("\nOverall correlations between survival proxy and socioeconomic factors (across all counties):")
print(overall_correlations)


#State Correlations of Survival Proxy and Poverty/Race/Education
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import mixedlm
import statsmodels.api as sm

# Load the data
df = pd.read_csv('Merged_Cancer_Rates_with_FIPS_Final.csv')

# Convert relevant columns to numeric, handling any non-numeric values
numeric_columns = ['Death Rate', 'Incidence Rate', 'White Pct', 'Black Pct.', 'Hispanic Pct.',
                   'Asian/PI Pct', 'AI/AN Pct', 'Persons in Poverty', 'Bachelors Degree']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Calculate the survival proxy
df['Survival_Proxy'] = df['Incidence Rate'] - df['Death Rate']

# Standardize the predictor variables (predictor -> scaled column, same order)
predictor_columns = ['Persons in Poverty', 'Bachelors Degree', 'White Pct', 'Black Pct.', 'Hispanic Pct.', 'Asian/PI Pct', 'AI/AN Pct']
scaled_columns = ['Poverty_Scaled', 'Education_Scaled', 'White_Scaled', 'Black_Scaled', 'Hispanic_Scaled', 'Asian_PI_Scaled', 'AI_AN_Scaled']
scaler = StandardScaler()
df[scaled_columns] = scaler.fit_transform(df[predictor_columns])

# The coercion above can leave NaN in the outcome and predictors, and the
# mixed model's formula interface does not drop missing rows on its own.
# Fit it on complete cases only (including a known state for the grouping).
model_df = df.dropna(subset=scaled_columns + ['Survival_Proxy', 'States'])

# Fit a mixed-effects model with the state as the grouping variable
model = mixedlm("Survival_Proxy ~ Poverty_Scaled + Education_Scaled + White_Scaled + Black_Scaled + Hispanic_Scaled + Asian_PI_Scaled + AI_AN_Scaled", groups=model_df["States"], data=model_df)
results = model.fit()

print("Mixed-effects model summary:")
print(results.summary())

# Calculate state-level effects (one entry per state)
state_effects = results.random_effects

# Create a dataframe of state effects, largest first
state_effects_df = pd.DataFrame.from_dict(state_effects, orient='index')
state_effects_df.columns = ['State_Effect']
state_effects_df = state_effects_df.sort_values('State_Effect', ascending=False)

# Plot state effects
plt.figure(figsize=(12, 8))
state_effects_df['State_Effect'].plot(kind='bar')
plt.title('State-Level Effects on Survival Proxy')
plt.xlabel('State')
plt.ylabel('Effect on Survival Proxy')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('state_effects.png')
plt.close()

# "\n" restored in the headers below: the backslash + line-break form was a
# string continuation and dropped the blank line.
print("\nTop 5 states with positive effects on survival proxy:")
print(state_effects_df.head())

print("\nBottom 5 states with negative effects on survival proxy:")
print(state_effects_df.tail())

# Interaction effects: OLS with a poverty x education interaction term.
# Fitted on df as before; the OLS formula interface drops rows with missing
# values itself.
interaction_model = sm.OLS.from_formula("Survival_Proxy ~ Poverty_Scaled * Education_Scaled + White_Scaled + Black_Scaled + Hispanic_Scaled + Asian_PI_Scaled + AI_AN_Scaled", data=df)
interaction_results = interaction_model.fit()

print("\nInteraction model summary:")
print(interaction_results.summary())

# Visualize interaction between poverty and education over the observed range
poverty_range = np.linspace(df['Poverty_Scaled'].min(), df['Poverty_Scaled'].max(), 100)
education_range = np.linspace(df['Education_Scaled'].min(), df['Education_Scaled'].max(), 100)

# Predicted surface from the intercept, the two main effects and their
# interaction. The race terms are left out, i.e. held at 0, which is their
# mean after standardisation.
X, Y = np.meshgrid(poverty_range, education_range)
Z = interaction_results.params['Intercept'] + \
    interaction_results.params['Poverty_Scaled'] * X + \
    interaction_results.params['Education_Scaled'] * Y + \
    interaction_results.params['Poverty_Scaled:Education_Scaled'] * X * Y

plt.figure(figsize=(10, 8))
contour = plt.contourf(X, Y, Z, cmap='viridis')
plt.colorbar(contour)
plt.title('Interaction Effect of Poverty and Education on Survival Proxy')
plt.xlabel('Poverty (Scaled)')
plt.ylabel('Education (Scaled)')
plt.savefig('poverty_education_interaction.png')
plt.close()

print("Visualization plots have been saved as 'state_effects.png' and 'poverty_education_interaction.png'")

# Calculate correlations between state effects and state-level averages of predictors
state_avg = df.groupby('States')[predictor_columns].mean()
state_effects_df = state_effects_df.join(state_avg)

correlations = state_effects_df.corr()['State_Effect'].sort_values(ascending=False)

print("\nCorrelations between state effects and state-level averages of predictors:")
print(correlations)