In [None]:
import pandas as pd
import numpy as np

# Load the merged cancer-rate dataset (path is relative to the notebook's working directory)
df = pd.read_csv('Merged_Cancer_Rates_with_Survival_Rate.csv')

# Candidate predictors and the regression target
features = ['Persons in Poverty', 'Families Below Poverty', 'Poverty Below 150%', 'SES',
            'Bachelors Degree', 'HS Education', 'Unemployment Rate', 'Black Pct.',
            'White Pct', 'Hispanic Pct.', 'Racial Minority Index', 'Urbanicity']
target = 'Death Rate'
columns_of_interest = features + [target]

# Report values that are present but cannot be parsed as numbers.
# Cells that are missing in the raw data are excluded on purpose: they are not
# "non-numeric" text, and they are counted separately in the missing-value
# summary at the end of this cell.
for column in columns_of_interest:
    unparseable = pd.to_numeric(df[column], errors='coerce').isna() & df[column].notna()
    non_numeric = df.loc[unparseable, column].unique()
    if len(non_numeric) > 0:
        print(f"Non-numeric values in {column}: {non_numeric}")

# Column dtypes and non-null counts (info() writes straight to stdout)
print("\nDataset Info:")
df[columns_of_interest].info()

# Preview of the first rows
print("\nFirst few rows of the dataset:")
print(df[columns_of_interest].head())

# Per-column count of missing cells
print("\nMissing values in each column:")
print(df[columns_of_interest].isnull().sum())

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load the data
df = pd.read_csv('Merged_Cancer_Rates_with_Survival_Rate.csv')

# Select relevant features for regression
features = ['Persons in Poverty', 'Families Below Poverty', 'Poverty Below 150%', 'SES',
            'Bachelors Degree', 'HS Education', 'Unemployment Rate', 'Black Pct.',
            'White Pct', 'Hispanic Pct.', 'Racial Minority Index', 'Urbanicity']
target = 'Death Rate'

# Clean the data: entries that cannot be parsed as numbers become NaN, so the
# dropna below removes those rows
df['SES'] = pd.to_numeric(df['SES'], errors='coerce')
df['Racial Minority Index'] = pd.to_numeric(df['Racial Minority Index'], errors='coerce')

# Encode Urbanicity as integer codes, but only for rows where it is present.
# Encoding the raw column directly would either give missing entries a code of
# their own (so dropna could no longer remove them) or fail outright, depending
# on the scikit-learn version. Missing entries stay NaN here instead.
# NOTE(review): the resulting codes are treated as an ordinal numeric predictor
# by the regression - confirm that ordering is meaningful for Urbanicity.
le = LabelEncoder()
urbanicity_known = df['Urbanicity'].notna()
urbanicity_codes = pd.Series(np.nan, index=df.index, dtype='float64')
urbanicity_codes.loc[urbanicity_known] = le.fit_transform(df.loc[urbanicity_known, 'Urbanicity'])
# Keep the integer dtype the encoder produces when nothing was missing
df['Urbanicity'] = urbanicity_codes.astype('int64') if urbanicity_known.all() else urbanicity_codes

# Remove rows with NaN values in any modelled column
df_clean = df.dropna(subset=features + [target])

# Prepare the design matrix and response
X = df_clean[features]
y = df_clean[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training rows, then predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line
print(
    "Model Performance:",
    f"Mean Squared Error: {mse:.4f}",
    f"R-squared Score: {r2:.4f}",
    sep="\n",
)

# Rank the fitted coefficients by absolute size.
# NOTE(review): if the model was fit on unscaled predictors, these magnitudes
# depend on each feature's units and are not directly comparable.
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values('Coefficient', key=abs, ascending=False)
print("\nFeature Coefficients:")
print(coef_df)

# Bar chart of the coefficients, written to disk and closed so it does not
# linger as the "current" pyplot figure
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Coefficient', y='Feature', data=coef_df, ax=ax)
ax.set_title('Feature Importance in Predicting Death Rate')
fig.tight_layout()
fig.savefig('feature_importance_death_rate.png')
plt.close(fig)

# Correlation heatmap of all predictors plus the target
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(df_clean[features + [target]].corr(), annot=True, cmap='coolwarm',
            vmin=-1, vmax=1, center=0, fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap of Features and Death Rate')
fig.tight_layout()
fig.savefig('correlation_heatmap_death_rate.png')
plt.close(fig)

print("\nFeature importance plot and correlation heatmap have been saved as 'feature_importance_death_rate.png' and 'correlation_heatmap_death_rate.png'")

# Residual plot: held-out residuals (actual - predicted) against the predictions
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=y_test - y_pred, ax=ax)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Death Rate')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
fig.tight_layout()
fig.savefig('residual_plot_death_rate.png')
plt.close(fig)

print("Residual plot has been saved as 'residual_plot_death_rate.png'")

# Summary statistics
print("\nSummary Statistics:")
print(df_clean[features + [target]].describe())

# Calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

# variance_inflation_factor uses the matrix exactly as given and does not add
# an intercept itself. The regression being diagnosed has an intercept, so a
# constant column is appended; without it the auxiliary regressions are forced
# through the origin and the VIFs are inflated. The constant goes last, so
# indices 0..X.shape[1]-1 still address the original feature columns.
X_vif = X.assign(const=1.0).to_numpy(dtype=float)

# Calculate VIF for each feature (the constant's own VIF is not reported)
vif_data = pd.DataFrame()
vif_data["Feature"] = features
vif_data["VIF"] = [variance_inflation_factor(X_vif, i) for i in range(X.shape[1])]

print("\nVariance Inflation Factors:")
print(vif_data.sort_values('VIF', ascending=False))

In [None]:
# Addressing multicollinearity (MC)

#Add VIF library
from statsmodels.stats.outliers_influence import variance_inflation_factor


def _vif_table(frame, columns):
    """Return a DataFrame of variance inflation factors for `columns` of `frame`.

    A constant column is appended to the design matrix before the VIFs are
    computed, because variance_inflation_factor does not add an intercept
    itself; without one the VIFs are inflated. The constant's own VIF is not
    reported.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source data; the selected columns must be convertible to float.
    columns : list of str
        Predictor columns to evaluate.

    Returns
    -------
    pandas.DataFrame
        Columns "Feature" and "VIF", sorted by VIF in descending order.
    """
    # The constant is appended last, so positions 0..len(columns)-1 are the predictors
    design = frame[columns].assign(const=1.0).to_numpy(dtype=float)
    table = pd.DataFrame({
        "Feature": columns,
        "VIF": [variance_inflation_factor(design, i) for i in range(len(columns))],
    })
    return table.sort_values('VIF', ascending=False)


# Correlation matrix of the predictors only
corr_matrix = df_clean[features].corr()

# Plot correlation heatmap
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0, ax=ax)
ax.set_title('Correlation Heatmap of Features')
fig.tight_layout()
fig.savefig('correlation_heatmap_features.png')
plt.close(fig)

print("Correlation heatmap has been saved as 'correlation_heatmap_features.png'")

# Calculate VIF for each feature
X = df_clean[features]
vif_data = _vif_table(df_clean, features)

print("\nInitial Variance Inflation Factors:")
print(vif_data)

# Select features based on correlation and VIF
selected_features = ['SES', 'Bachelors Degree', 'Unemployment Rate', 'Black Pct.', 'Hispanic Pct.', 'Urbanicity']

# Recalculate VIF for selected features
X_selected = df_clean[selected_features]
vif_data_selected = _vif_table(df_clean, selected_features)

print("\nUpdated Variance Inflation Factors:")
print(vif_data_selected)

# Restrict the design matrix to the reduced predictor set; the response is unchanged
X, y = df_clean[selected_features], df_clean[target]

# Same 80/20 split settings and seed as before
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows, then predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model on the held-out rows
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nUpdated Model Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Rank the fitted coefficients by absolute size
coef_df = pd.DataFrame({'Feature': selected_features, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values('Coefficient', key=abs, ascending=False)
print("\nUpdated Feature Coefficients:")
print(coef_df)

# Bar chart of the coefficients, written to disk and closed explicitly
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Coefficient', y='Feature', data=coef_df, ax=ax)
ax.set_title('Feature Importance in Predicting Death Rate (Updated)')
fig.tight_layout()
fig.savefig('feature_importance_death_rate_updated.png')
plt.close(fig)

print("\nUpdated feature importance plot has been saved as 'feature_importance_death_rate_updated.png'")

In [None]:
#Interaction Effects
from sklearn.preprocessing import StandardScaler



# Select features
features = ['SES', 'Bachelors Degree', 'Unemployment Rate', 'Black Pct.', 'Hispanic Pct.', 'Urbanicity']
target = 'Death Rate'

# Remove rows with NaN values. The explicit .copy() makes df_clean an
# independent frame, so adding columns below does not raise
# SettingWithCopyWarning or risk writing into a view of df.
# NOTE(review): this uses `df` as left by an earlier cell; the products below
# presumably need SES and Urbanicity to be numeric already - confirm.
df_clean = df.dropna(subset=features + [target]).copy()

# Create interaction terms (element-wise products with SES)
df_clean['SES_x_Bachelors'] = df_clean['SES'] * df_clean['Bachelors Degree']
df_clean['SES_x_Unemployment'] = df_clean['SES'] * df_clean['Unemployment Rate']
df_clean['SES_x_Urbanicity'] = df_clean['SES'] * df_clean['Urbanicity']

# Update features list: main effects first, then the interaction columns
features_with_interactions = features + ['SES_x_Bachelors', 'SES_x_Unemployment', 'SES_x_Urbanicity']

# Prepare the data
X = df_clean[features_with_interactions]
y = df_clean[target]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise using statistics learned from the training rows only, then apply
# the same transform to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit on the scaled training rows and predict the scaled held-out rows
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Score the held-out predictions
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line
print(
    "Model Performance with Interactions:",
    f"Mean Squared Error: {mse:.4f}",
    f"R-squared Score: {r2:.4f}",
    sep="\n",
)

# Rank the fitted coefficients (main effects and interactions) by absolute size
coef_df = pd.DataFrame({'Feature': features_with_interactions, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values('Coefficient', key=abs, ascending=False)
print("\nFeature Coefficients (including interactions):")
print(coef_df)

# Bar chart of the coefficients, written to disk and closed explicitly
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Coefficient', y='Feature', data=coef_df, ax=ax)
ax.set_title('Feature Importance in Predicting Death Rate (with Interactions)')
fig.tight_layout()
fig.savefig('feature_importance_with_interactions.png')
plt.close(fig)

print("\nFeature importance plot with interactions has been saved as 'feature_importance_with_interactions.png'")

# Visualize interaction effects: one panel per moderator, each plotting
# Death Rate against SES and colouring the points by the moderator
fig, axs = plt.subplots(1, 3, figsize=(20, 6))
for panel, hue_column in zip(axs, ['Bachelors Degree', 'Unemployment Rate', 'Urbanicity']):
    sns.scatterplot(x='SES', y='Death Rate', hue=hue_column, data=df_clean, ax=panel)
    panel.set_title(f'Interaction: SES and {hue_column}')

fig.tight_layout()
fig.savefig('interaction_effects_visualization.png')
plt.close(fig)

print("\nInteraction effects visualization has been saved as 'interaction_effects_visualization.png'")

# Calculate and print the change in R-squared from adding the interaction terms.
# The baseline (main effects only) is refit here on this cell's own train/test
# split instead of hard-coding a number copied from an earlier run: that number
# goes stale whenever the data, the split, or the set of rows surviving dropna
# changes. Scaling is skipped for the baseline because a linear regression with
# an intercept predicts the same values whether or not its inputs are
# standardised.
baseline_model = LinearRegression().fit(X_train[features], y_train)
baseline_r2 = r2_score(y_test, baseline_model.predict(X_test[features]))
print(f"\nChange in R-squared: {r2 - baseline_r2:.4f}")