In [1]:
# forecast.ipynb - Solar Panel Adoption Forecasting

# Cell 1: Import libraries and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, roc_auc_score
import shap
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Libraries loaded successfully!")

Libraries loaded successfully!


In [11]:
# Cell 2: Load the dataset and trained models
df_full = pd.read_csv("/home/elsherif/Desktop/Thesis/ViewPython/data/CleanupDataSet/final_model_ev_updated.csv")

# Create log-transformed target
df_full['panel_area_log'] = np.log1p(df_full['panel_area_m2'])
df_full['has_solar'] = (df_full['panel_area_log'] > 0).astype(int)

print(f"Dataset loaded: {len(df_full)} rows")
print(f"Years available: {df_full['year'].min()} to {df_full['year'].max()}")
print(f"\nDataset info:")
df_full.info()


Dataset loaded: 12479 rows
Years available: 2006 to 2024

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12479 entries, 0 to 12478
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   tile                         12479 non-null  object 
 1   total_rooftops               12479 non-null  int64  
 2   rooftops_without_solar       12479 non-null  int64  
 3   square_meters_with_solar_m2  12479 non-null  float64
 4   panel_area_m2                12479 non-null  float64
 5   tile_centroid_lat            12479 non-null  float64
 6   tile_centroid_lon            12479 non-null  float64
 7   district_number              12479 non-null  int64  
 8   year                         12479 non-null  int64  
 9   Unemployment_Rate            12479 non-null  float64
 10  Average_Age                  12479 non-null  float64
 11  Elderly_Population           12479 non-null  float64
 12  Yo

In [13]:

# Cell 3: Define features and prepare data
feature_cols = [
    'year',
    'total_rooftops',
    'Unemployment_Rate',
    'Average_Age',
    'Elderly_Population',
    'Young_Population',
    'Total_Population',
    'tile_encoded',
        "employed",
        "pv_price",
        'panel_area_lag1'
        ,'ev_points_164m'
]

# Drop rows with missing values
df_model = df_full.dropna(subset=feature_cols + ['panel_area_log'])
print(f"After dropping NaN: {len(df_model)} rows")

After dropping NaN: 12479 rows


In [4]:
# # Cell 4: Load or train Stage 1 model (Classification)
# # Sort data
# df_model = df_model.sort_values("year")

# # Split data
# test_year = df_model["year"].max()
# val_year = df_model[df_model["year"] < test_year]["year"].max()
# train_df = df_model[df_model["year"] < val_year]
# val_df = df_model[df_model["year"] == val_year]
# test_df = df_model[df_model["year"] == test_year]

# # Prepare features
# X_train_1 = train_df[feature_cols]
# y_train_1 = train_df["has_solar"]
# X_val_1 = val_df[feature_cols]
# y_val_1 = val_df["has_solar"]
# X_test_1 = test_df[feature_cols]
# y_test_1 = test_df["has_solar"]

# # Train Stage 1 classifier
# print("Training Stage 1 (Classification)...")
# clf = lgb.LGBMClassifier(
#     n_estimators=5000,
#     learning_rate=0.05,
#     num_leaves=32,
#     class_weight="balanced",
#     random_state=42
# )

# clf.fit(
#     X_train_1, y_train_1,
#     eval_set=[(X_val_1, y_val_1)],
#     eval_metric="auc",
#     callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
# )

# # Evaluate
# p_solar_test = clf.predict_proba(X_test_1)[:, 1]
# test_auc = roc_auc_score(y_test_1, p_solar_test)
# print(f"âœ“ Stage 1 TEST ROC-AUC: {test_auc:.3f}")

In [14]:
import lightgbm as lgb

# 1. Load the model from JSON
# We load it as a Booster object, which is the native LightGBM engine
clf = lgb.Booster(model_file="lgb_model_1Stage_lag_ev.txt")

In [6]:
# # Train Stage 2 (Regression) with selected features only
# train_pos = train_df[train_df['has_solar'] == 1]

# X_train_2 = train_pos[feature_cols]
# y_train_2 = train_pos['panel_area_log']

# # Train XGBoost regressor
# model_stage2 = xgb.XGBRegressor(
#     n_estimators=500,
#     learning_rate=0.05,
#     max_depth=6,
#     random_state=42
# )

# model_stage2.fit(X_train_2, y_train_2)

# # Save the new model
# model_stage2.save_model("stage2_xgb_8features.json")
# print("âœ“ Stage 2 model saved with 8 features")


# # Test it
# X_test_2 = test_df[feature_cols]
# y_pred_log = model_stage2.predict(X_test_2)
# y_pred_size = np.expm1(y_pred_log)
# # y_pred_final = p_solar * y_pred_size
# # y_test_real = np.expm1(test_df['panel_area_log'].values)

# # r2 = r2_score(y_test_real, y_pred_final)
# # mae = mean_absolute_error(y_test_real, y_pred_final)
# # print(f"2-Stage Model RÂ²: {r2:.3f}, MAE: {mae:.3f}")

In [15]:
import joblib

# Load your pre-trained Random Forest model
model_stage2 = joblib.load("random_forest_model_lag_ev.joblib")
print("âœ“ Stage 2 RFR model loaded")


âœ“ Stage 2 RFR model loaded


In [None]:

# --- Function to create forecast features ---
def create_forecast_features(df, forecast_year, feature_cols):
    """
    Create features for forecasting by extrapolating demographic/economic trends
    Handles non-consecutive years and updates lag based on last known value
    """
    latest_year = df['year'].max()
    latest_data = df[df['year'] == latest_year].copy()
    
    forecast_data = latest_data.copy()
    forecast_data['year'] = forecast_year
    
    for tile in latest_data['tile'].unique():
        tile_history = df[df['tile'] == tile].sort_values('year')
        
        if len(tile_history) < 2:
            continue
            
        years = tile_history['year'].values.astype(float)
        
        # Forecast demographic features using linear trends, including EV infrastructure
        for col in ['Average_Age', 'Elderly_Population', 'Young_Population',
                    'Total_Population', 'Unemployment_Rate', 'pv_price', 'ev_points_164m']:
            if col in tile_history.columns:
                values = tile_history[col].values.astype(float)
                if len(values) > 1 and not np.all(np.isnan(values)):
                    valid_mask = ~np.isnan(values)
                    valid_years = years[valid_mask]
                    valid_values = values[valid_mask]
                    
                    if len(valid_years) > 1:
                        slope, intercept, _, _, _ = stats.linregress(valid_years, valid_values)
                        forecast_value = slope * forecast_year + intercept
                        
                        # Apply reasonable bounds
                        if col == 'Unemployment_Rate':
                            forecast_value = np.clip(forecast_value, 0, 20)
                        elif col == 'Average_Age':
                            forecast_value = np.clip(forecast_value, 20, 60)
                        else:
                            forecast_value = max(0, forecast_value)
                        
                        forecast_data.loc[forecast_data['tile'] == tile, col] = forecast_value
        
        # --- Correct lag computation ---
        # Always take the last known panel_area_m2 (historical or previously forecasted)
        latest_panel_area = tile_history['panel_area_m2'].iloc[-1]
        forecast_data.loc[forecast_data['tile'] == tile, 'panel_area_lag1'] = latest_panel_area
    
    return forecast_data

print("âœ“ Forecast feature function defined (lag updates correctly)")


âœ“ Forecast feature function defined (lag updates correctly)


In [17]:
def predict_future(clf, model_stage2, forecast_data, feature_cols):
    """
    Make 2-stage predictions for future data
    """
    X_forecast = forecast_data[feature_cols]
    
    # Stage 1: Predict probability of solar adoption
    p_solar = clf.predict(X_forecast)
    
    # Stage 2: Predict panel area (log scale)
    y_pred_log = model_stage2.predict(X_forecast)
    y_pred_size = np.expm1(y_pred_log)
    
    # Combine both stages (probability-aware weighting)
    y_pred_final = np.where(
        p_solar > 0.4,
        (p_solar + 0.3) * y_pred_size,      # High confidence boost
        np.where(
            p_solar < 0.3,
            0.0,                            # Low confidence â†’ no adoption
            p_solar * y_pred_size           # Medium confidence
        )
    )
  
    
    # Add predictions to dataframe
    forecast_data['predicted_adoption_prob'] = p_solar
    forecast_data['predicted_panel_area_m2'] = y_pred_final
    forecast_data['predicted_panel_area_raw'] = y_pred_size
    
    return forecast_data


In [18]:
# Cell 8: Generate multi-year forecasts (2025-2030)
print("=" * 60)
print("GENERATING MULTI-YEAR FORECASTS")
print("=" * 60)
df_current = df_full.copy()
forecast_years = [2025, 2026, 2027, 2028, 2029, 2030]
forecasts_all = []
prev_panel_area = None  # store previous year's predictions by tile

for year in forecast_years:
    print(f"Forecasting {year}...", end=" ")
    
    # Build features (starts from latest data)
    forecast_data = create_forecast_features(df_current, year, feature_cols)
    
    # Before predicting, inject lag values from the prior forecast year when available
    if prev_panel_area is not None:
        mapped_lag = forecast_data['tile'].map(prev_panel_area)
        forecast_data['panel_area_lag1'] = mapped_lag.fillna(forecast_data['panel_area_lag1'])
    
    # Predict outcomes
    forecast_pred = predict_future(clf, model_stage2, forecast_data, feature_cols)
    forecasts_all.append(forecast_pred)
    
    # Persist this year's results for the next loop iteration
    prev_panel_area = dict(zip(forecast_pred['tile'], forecast_pred['predicted_panel_area_m2']))
    
    print(f"âœ“ {len(forecast_pred)} tiles")
    

# Combine all forecasts
df_forecasts = pd.concat(forecasts_all, ignore_index=True)
print(f"\nâœ“ Total forecast rows: {len(df_forecasts)}")




GENERATING MULTI-YEAR FORECASTS
Forecasting 2025... âœ“ 1564 tiles
Forecasting 2026... âœ“ 1564 tiles
Forecasting 2027... âœ“ 1564 tiles
Forecasting 2028... âœ“ 1564 tiles
Forecasting 2029... âœ“ 1564 tiles
Forecasting 2030... âœ“ 1564 tiles

âœ“ Total forecast rows: 9384


In [None]:
# Cell 9: Aggregate forecasts by year
yearly_summary = df_forecasts.groupby('year').agg({
    'predicted_panel_area_m2': 'sum',
    'predicted_adoption_prob': 'mean',
    'tile': 'count'
}).reset_index()

yearly_summary.columns = ['year', 'total_panel_area_m2', 'avg_adoption_prob', 'num_tiles']

print("\n" + "=" * 60)
print("MULTI-YEAR FORECAST SUMMARY")
print("=" * 60)
print(yearly_summary.to_string(index=False))

# Calculate growth rate
growth_rate = ((yearly_summary.iloc[-1]['total_panel_area_m2'] / 
                yearly_summary.iloc[0]['total_panel_area_m2']) - 1) * 100
print(f"\nðŸ“ˆ Projected growth from {forecast_years[0]} to {forecast_years[-1]}: {growth_rate:.1f}%")


MULTI-YEAR FORECAST SUMMARY
 year  total_panel_area_m2  avg_adoption_prob  num_tiles
 2025        709087.729642           0.827331       1564
 2026        730698.452397           0.818807       1564
 2027        753556.741326           0.811574       1564
 2028        763322.903068           0.812224       1564
 2029        762250.673629           0.808855       1564
 2030        764441.910451           0.808249       1564

ðŸ“ˆ Projected growth from 2025 to 2030: 7.8%


In [None]:
# Cell 10: EV infrastructure outlook per year/tile
ev_summary = df_forecasts.groupby('year').agg({
    'ev_points_164m': ['sum', 'mean'],
    'predicted_panel_area_m2': 'sum'
}).reset_index()
ev_summary.columns = [
    'year',
    'total_ev_points',
    'avg_ev_points',
    'total_predicted_panel_area_m2'
]

print("\n" + "=" * 60)
print("EV CHARGING INFRASTRUCTURE & SOLAR PANEL OUTLOOK")
print("=" * 60)
print(ev_summary.to_string(index=False))

# Preview EV stats per tile/year to feed the app
per_tile_ev = df_forecasts[['year', 'tile', 'district_number', 'ev_points_164m', 'predicted_panel_area_m2']]
print("\nSample tile-level EV summary:")
print(per_tile_ev.head(10).to_string(index=False))


In [None]:
# ===========================
# Combine historical + forecast for summary
# ===========================
# Historical data (up to last observed year)
historical_summary = df_full.copy()
historical_summary['predicted_panel_area_m2'] = historical_summary['panel_area_m2']
historical_summary['predicted_panel_area_p10'] = historical_summary['panel_area_m2']
historical_summary['predicted_panel_area_p90'] = historical_summary['panel_area_m2']
historical_summary['predicted_adoption_prob'] = np.nan  # unknown for historical

# Combine historical + forecast
df_all_years = pd.concat([historical_summary, df_forecasts], ignore_index=True)

# Aggregate by year, including panel_area_lag1
yearly_summary = df_all_years.groupby('year').agg({
    'predicted_panel_area_m2': 'sum',    # total panel area
    'predicted_panel_area_p10': 'sum',
    'predicted_panel_area_p90': 'sum',
    'panel_area_lag1': 'sum',           # average lag
    'predicted_adoption_prob': 'mean',   # NaN for historical years
    'tile': 'count'                       # number of tiles
}).reset_index()

# Rename columns
yearly_summary.columns = [
    'year',
    'total_panel_area_m2',
    'total_panel_area_p10',
    'total_panel_area_p90',
    'avg_panel_area_lag1',
    'avg_adoption_prob',
    'num_tiles'
 ]

# Print summary
print("\n" + "=" * 60)
print("MULTI-YEAR FORECAST SUMMARY")
print("=" * 60)
print(yearly_summary.to_string(index=False))



MULTI-YEAR FORECAST SUMMARY
 year  total_panel_area_m2  total_panel_area_p10  total_panel_area_p90  avg_panel_area_lag1  avg_adoption_prob  num_tiles
 2006        297383.481000            297383.481            297383.481        267691.406000                NaN       1563
 2009        568881.732000            568881.732            568881.732        297472.406000                NaN       1564
 2012        550567.832000            550567.832            550567.832        568881.732000                NaN       1564
 2015        431982.977000            431982.977            431982.977        543426.584000                NaN       1532
 2018        565953.291000            565953.291            565953.291        439124.225000                NaN       1564
 2020        545613.070000            545613.070            545613.070        565953.291000                NaN       1564
 2022        663773.115000            663773.115            663773.115        545613.070000                NaN       

In [None]:
# Cell 12: District-level forecasts for 2027
forecast_2027 = df_forecasts[df_forecasts['year'] == 2027]

district_forecast_2027 = forecast_2027.groupby('district_number').agg({
    'predicted_panel_area_m2': 'sum',
    'predicted_panel_area_p10': 'sum',
    'predicted_panel_area_p90': 'sum',
    'predicted_adoption_prob': 'mean',
    'Total_Population': 'mean',
    'Average_Age': 'mean',
    'Unemployment_Rate': 'mean'
}).reset_index()

district_forecast_2027 = district_forecast_2027.sort_values(
    'predicted_panel_area_m2', ascending=False
)

print("\n" + "=" * 60)
print("TOP 10 DISTRICTS FOR SOLAR ADOPTION IN 2027")
print("=" * 60)
print(district_forecast_2027.head(10).to_string(index=False))

KeyError: "Column(s) ['predicted_panel_area_p10', 'predicted_panel_area_p90'] do not exist"

In [None]:
# Cell 14: Scenario Analysis (What-If Analysis)
def scenario_analysis(clf, model_stage2, base_data, feature_cols, changes, quantile_models=None):
    """
    Perform what-if analysis by modifying features
    """
    scenario_data = base_data.copy()
    
    for feature, change_type, value in changes:
        if feature in scenario_data.columns:
            if change_type == 'add':
                scenario_data[feature] = scenario_data[feature] + value
            elif change_type == 'multiply':
                scenario_data[feature] = scenario_data[feature] * value
    
    scenario_pred = predict_future(
        clf, model_stage2, scenario_data, feature_cols, quantile_models=quantile_models
    )
    return scenario_pred

# Define scenarios for 2027
forecast_2027_base = create_forecast_features(df_full, 2027, feature_cols)

scenarios = {
    'Baseline (2027)': [],
    'Economic Boom': [('Unemployment_Rate', 'add', -1.5)],
    'Economic Recession': [('Unemployment_Rate', 'add', 2.0)],
    'Aging Population': [('Average_Age', 'add', 3), ('Elderly_Population', 'multiply', 1.15)],
    'Youth Growth': [('Average_Age', 'add', -2), ('Young_Population', 'multiply', 1.1)],
    'Population Decline': [('Total_Population', 'multiply', 0.9)],
}

print("\n" + "=" * 60)
print("SCENARIO ANALYSIS FOR 2027")
print("=" * 60)

scenario_results = []
baseline_area = None
baseline_area_p10 = None
baseline_area_p90 = None

for scenario_name, changes in scenarios.items():
    if scenario_name == 'Baseline (2027)':
        result = predict_future(
            clf, model_stage2, forecast_2027_base.copy(), feature_cols, quantile_models=quantile_models
        )
    else:
        result = scenario_analysis(
            clf, model_stage2, forecast_2027_base.copy(), feature_cols, changes, quantile_models=quantile_models
        )
    
    total_area = result['predicted_panel_area_m2'].sum()
    total_area_p10 = result['predicted_panel_area_p10'].sum() if 'predicted_panel_area_p10' in result else np.nan
    total_area_p90 = result['predicted_panel_area_p90'].sum() if 'predicted_panel_area_p90' in result else np.nan
    avg_prob = result['predicted_adoption_prob'].mean()
    
    if baseline_area is None:
        baseline_area = total_area
        baseline_area_p10 = total_area_p10
        baseline_area_p90 = total_area_p90
        change_pct = 0
        change_pct_low = 0
        change_pct_high = 0
    else:
        change_pct = ((total_area / baseline_area) - 1) * 100
        change_pct_low = ((total_area_p10 / baseline_area_p10) - 1) * 100 if baseline_area_p10 is not None else np.nan
        change_pct_high = ((total_area_p90 / baseline_area_p90) - 1) * 100 if baseline_area_p90 is not None else np.nan
    
    scenario_results.append({
        'Scenario': scenario_name,
        'Total Solar Area (mÂ²)': f"{total_area:,.0f}",
        'Total Solar Area p10 (mÂ²)': f"{total_area_p10:,.0f}",
        'Total Solar Area p90 (mÂ²)': f"{total_area_p90:,.0f}",
        'Avg Adoption Prob': f"{avg_prob:.3f}",
        'Median Change (%)': f"{change_pct:+.2f}%",
        'Range Change (%)': f"{change_pct_low:+.2f}% â†’ {change_pct_high:+.2f}%"
    })

scenario_df = pd.DataFrame(scenario_results)
print(scenario_df.to_string(index=False))


In [None]:
# Cell 15: Visualize scenario analysis
scenario_values = []
scenario_names = []
scenario_colors = []

for s in scenario_results:
    scenario_names.append(s['Scenario'])
    scenario_values.append(float(s['Total Solar Area (mÂ²)'].replace(',', '')))
    if s['Scenario'] == 'Baseline (2027)':
        scenario_colors.append('#2E86AB')
    else:
        scenario_colors.append('#95A3A4')

fig, ax = plt.subplots(figsize=(12, 7))
bars = ax.bar(range(len(scenario_names)), scenario_values, color=scenario_colors)
ax.set_xticks(range(len(scenario_names)))
ax.set_xticklabels(scenario_names, rotation=45, ha='right', fontsize=11)
ax.set_ylabel('Total Solar Panel Area (mÂ²)', fontsize=13, fontweight='bold')
ax.set_title('Scenario Analysis: Impact on Solar Adoption (2027)', 
             fontsize=16, fontweight='bold', pad=20)
ax.tick_params(labelsize=11)
ax.grid(True, axis='y', alpha=0.3)

# Add value labels
for i, (bar, val) in enumerate(zip(bars, scenario_values)):
    ax.text(bar.get_x() + bar.get_width()/2, val, 
            f'{val:,.0f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig('/home/elsherif/Desktop/Thesis/ViewPython/scenario_analysis_2027.png', dpi=300, bbox_inches='tight')
plt.show()

print("âœ“ Visualization saved as scenario_analysis_2027.png")