# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load the data
df = pd.read_csv('Merged_Cancer_Rates_with_Survival_Rate.csv')

# Select relevant features for regression
features = ['Persons in Poverty', 'Families Below Poverty', 'Poverty Below 150%', 'SES',
            'Bachelors Degree', 'HS Education', 'Unemployment Rate', 'Black Pct.',
            'White Pct', 'Hispanic Pct.', 'Racial Minority Index', 'Urbanicity']
targets = ['Death Rate', 'Survival Rate']

# Clean the data: anything in these two columns that cannot be parsed as a
# number becomes NaN, so the dropna below removes the affected rows.
df['SES'] = pd.to_numeric(df['SES'], errors='coerce')
df['Racial Minority Index'] = pd.to_numeric(df['Racial Minority Index'], errors='coerce')

# Encode Urbanicity as integer labels, but only where a value is present.
# Fitting the encoder on the raw column would (depending on the scikit-learn
# version) either fail on missing values or give them a label of their own,
# in which case dropna could no longer remove rows with unknown Urbanicity.
le = LabelEncoder()
urbanicity_known = df['Urbanicity'].notna().to_numpy()
urbanicity_codes = np.full(len(df), np.nan)
urbanicity_codes[urbanicity_known] = le.fit_transform(
    df['Urbanicity'].to_numpy()[urbanicity_known]
)
# The column is float-valued from here on so missing entries can stay NaN.
df['Urbanicity'] = urbanicity_codes

# Remove rows with NaN values in any feature or target column
df_clean = df.dropna(subset=features + targets)

# Function to train and evaluate Random Forest model
def train_evaluate_rf(X, y, target_name):
    """Train a random-forest regressor for one target and report its performance.

    The data is split 80/20 into train and test sets (fixed seed 42), the
    features are standardized with statistics learned from the training split
    only, and a 100-tree ``RandomForestRegressor`` is fitted. Test-set MSE and
    R-squared are printed, and a bar chart of the feature importances is
    written to ``feature_importance_rf_<target>.png`` in the current working
    directory, where ``<target>`` is ``target_name`` lower-cased with spaces
    replaced by underscores.

    Args:
        X: Feature matrix. When it exposes ``.columns`` (e.g. a DataFrame),
            those labels name the features in the importance table; otherwise
            the module-level ``features`` list is used.
        y: Target values aligned with the rows of ``X``.
        target_name: Human-readable target name used in the printed output,
            the plot title and the output file name.

    Returns:
        A ``(rf_model, feature_importance)`` tuple: the fitted regressor and a
        DataFrame with ``Feature`` and ``Importance`` columns sorted by
        descending importance.
    """
    # Label importances with the columns actually passed in, so the table
    # cannot drift out of step with X if the caller uses a different subset.
    feature_names = list(X.columns) if hasattr(X, 'columns') else list(features)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the features; the scaler is fitted on the training split only so
    # no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create and train the model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = rf_model.predict(X_test_scaled)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\nRandom Forest Model Performance for {target_name}:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R-squared Score: {r2:.4f}")

    # Feature importance, most important first
    feature_importance = pd.DataFrame(
        {'Feature': feature_names, 'Importance': rf_model.feature_importances_}
    ).sort_values('Importance', ascending=False)

    # Visualize feature importance; the figure is closed even if plotting or
    # saving fails so repeated calls do not accumulate open figures.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)
        ax.set_title(f'Feature Importance in Predicting {target_name} (Random Forest)')
        fig.tight_layout()
        fig.savefig(f'feature_importance_rf_{target_name.lower().replace(" ", "_")}.png')
    finally:
        plt.close(fig)

    print(f"\nFeature importance plot for {target_name} has been saved.")

    return rf_model, feature_importance

# Train and evaluate models for both targets; the same feature matrix is
# reused and only the target column changes between runs.
X = df_clean[features]

for target in targets:
    y = df_clean[target]
    model, importance = train_evaluate_rf(X, y, target)
    # "\n" restores the blank separator line: a backslash followed by a real
    # line break inside the literal is a line continuation and prints nothing.
    print(f"\nTop 5 important features for {target}:")
    print(importance.head())

# Correlation heatmap over every feature and both targets
corr_matrix = df_clean[features + targets].corr()
heatmap_fig = plt.figure(figsize=(14, 12))
try:
    # Colour scale is pinned to [-1, 1] and centred on 0.
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
    plt.title('Correlation Heatmap of Features, Death Rate, and Survival Rate')
    plt.tight_layout()
    plt.savefig('correlation_heatmap_all_variables.png')
finally:
    # Close the figure even if rendering or saving fails.
    plt.close(heatmap_fig)

print("\nCorrelation heatmap has been saved as 'correlation_heatmap_all_variables.png'")