# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the merged cancer-rate / survival-rate dataset
df = pd.read_csv('Merged_Cancer_Rates_with_Survival_Rate.csv')

# Candidate predictors and the regression target
features = ['Persons in Poverty', 'Families Below Poverty', 'Poverty Below 150%', 'SES',
            'Bachelors Degree', 'HS Education', 'Unemployment Rate', 'Black Pct.',
            'White Pct', 'Hispanic Pct.', 'Racial Minority Index', 'Urbanicity']
target = 'Survival Rate'

# Clean the data: values that cannot be parsed as numbers become NaN so the
# dropna() below removes those rows.
df['SES'] = pd.to_numeric(df['SES'], errors='coerce')
df['Racial Minority Index'] = pd.to_numeric(df['Racial Minority Index'], errors='coerce')

# Encode Urbanicity as integer category codes.
# Only non-missing entries are passed to the encoder: feeding NaN to
# LabelEncoder either fails or turns "missing" into a category of its own
# (depending on the scikit-learn version), and in both cases the dropna()
# below would no longer be able to filter those rows out.
le = LabelEncoder()
urbanicity_present = df['Urbanicity'].notna()
urbanicity_codes = pd.Series(np.nan, index=df.index, dtype=float)
urbanicity_codes.loc[urbanicity_present] = le.fit_transform(
    df.loc[urbanicity_present, 'Urbanicity']
)
df['Urbanicity'] = urbanicity_codes

# Remove rows with NaN in any feature or in the target
df_clean = df.dropna(subset=features + [target])

# Pairwise correlations between every candidate feature and the target
corr_matrix = df_clean.loc[:, [*features, target]].corr()

# Render the matrix as an annotated heatmap on a fixed [-1, 1] colour scale
# and write it to disk; the figure is closed afterwards so it is not kept open.
heatmap_path = 'correlation_heatmap_survival_rate.png'
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap of Features and Survival Rate')
heatmap_fig.tight_layout()
heatmap_fig.savefig(heatmap_path)
plt.close(heatmap_fig)

print(f"Correlation heatmap has been saved as '{heatmap_path}'")

def _vif_table(frame):
    """Return the variance inflation factor of each column of *frame*.

    ``variance_inflation_factor`` regresses one column of the matrix it is
    given on the remaining columns and does not add an intercept on its own.
    A column of ones is therefore prepended so each auxiliary regression has
    an intercept; without it the VIFs are computed from an uncentred R^2 and
    come out inflated. The constant column itself is not reported.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric predictor columns with no missing values.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``VIF``, sorted by ``VIF`` in descending order.
    """
    design = np.column_stack([np.ones(len(frame)), frame.to_numpy(dtype=float)])
    vifs = [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(1, design.shape[1])  # index 0 is the intercept
    ]
    table = pd.DataFrame({"Feature": list(frame.columns), "VIF": vifs})
    return table.sort_values('VIF', ascending=False)


# Calculate VIF for each candidate feature
X = df_clean[features]
vif_data = _vif_table(X)

print("\nInitial Variance Inflation Factors:")
print(vif_data)

# Select features based on correlation and VIF (hand-picked subset)
selected_features = ['SES', 'Bachelors Degree', 'Unemployment Rate', 'Black Pct.', 'Hispanic Pct.', 'Urbanicity']

# Recalculate VIF for the selected features only
X_selected = df_clean[selected_features]
vif_data_selected = _vif_table(X_selected)

print("\nUpdated Variance Inflation Factors:")
print(vif_data_selected)

# Prepare the design matrix and target from the selected features
X = df_clean[selected_features]
y = df_clean[target]

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test-set statistics are used.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit an ordinary least squares model on the standardised training data
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test_scaled)

# Evaluate the model on the held-out split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# The leading "\n" separates this section from the previous output.
print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tabulate the fitted coefficients, largest absolute value first.
# The model was fitted on standardised features, so the coefficients are on
# a common scale.
coef_df = pd.DataFrame({'Feature': selected_features, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values('Coefficient', key=abs, ascending=False)
print("\nFeature Coefficients:")
print(coef_df)

# Visualize the signed coefficients as a horizontal bar chart and save it
plt.figure(figsize=(10, 6))
sns.barplot(x='Coefficient', y='Feature', data=coef_df)
plt.title('Feature Importance in Predicting Survival Rate')
plt.tight_layout()
plt.savefig('feature_importance_survival_rate.png')
plt.close()

print("\nFeature importance plot has been saved as 'feature_importance_survival_rate.png'")

# Residual plot: held-out residuals (actual minus predicted) against the
# predicted value, with a dashed zero line for reference.
residual_path = 'residual_plot_survival_rate.png'
residuals = y_test - y_pred

residual_fig, residual_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals, ax=residual_ax)
residual_ax.axhline(y=0, color='r', linestyle='--')
residual_ax.set_xlabel('Predicted Survival Rate')
residual_ax.set_ylabel('Residuals')
residual_ax.set_title('Residual Plot')
residual_fig.tight_layout()
residual_fig.savefig(residual_path)
plt.close(residual_fig)

print(f"Residual plot has been saved as '{residual_path}'")