In [3]:
# forecast.ipynb - Solar Panel Adoption Forecasting

# Cell 1: Import libraries and configure plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, roc_auc_score
import shap
import warnings
# Suppress every Python warning from here on so cell outputs stay short.
# NOTE(review): this is a blanket filter — it also hides deprecation and
# dtype warnings raised by the libraries imported above; consider narrowing it.
warnings.filterwarnings('ignore')

# Set plotting style: matplotlib style sheet first, then the seaborn colour
# palette (kept in this order because the palette is applied after the style).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Libraries loaded successfully!")

Libraries loaded successfully!


In [4]:
# Cell 2: Load the dataset and derive the two modelling targets
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_full = pd.read_csv(
    "/home/elsherif/Desktop/Thesis/ViewPython/data/CleanupDataSet/final_model.csv"
)

# panel_area_log: log(1 + panel area), the regression target.
# has_solar: 1 where that log-area is strictly positive, else 0 (classification target).
df_full = df_full.assign(
    panel_area_log=lambda frame: np.log1p(frame['panel_area_m2']),
    has_solar=lambda frame: (frame['panel_area_log'] > 0).astype(int),
)

# Quick sanity summary: size, year coverage, column dtypes
print(f"Dataset loaded: {len(df_full)} rows")
print(f"Years available: {df_full['year'].min()} to {df_full['year'].max()}")
print("\nDataset info:")
df_full.info()


Dataset loaded: 12479 rows
Years available: 2006 to 2024

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12479 entries, 0 to 12478
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   tile                         12479 non-null  object 
 1   total_rooftops               12479 non-null  int64  
 2   rooftops_without_solar       12479 non-null  int64  
 3   square_meters_with_solar_m2  12479 non-null  float64
 4   panel_area_m2                12479 non-null  float64
 5   district_number              12479 non-null  int64  
 6   year                         12479 non-null  int64  
 7   Unemployment_Rate            12479 non-null  float64
 8   Average_Age                  12479 non-null  float64
 9   Elderly_Population           12479 non-null  float64
 10  Young_Population             12479 non-null  float64
 11  Total_Population             12479 non-null  int64  
 12  em

In [5]:

# Cell 3: Choose the model inputs and keep only fully observed rows
# The order of this list is the column order handed to the models, so keep it stable.
feature_cols = [
    # building stock
    'total_rooftops',
    # socio-demographic indicators
    'Unemployment_Rate',
    'Average_Age',
    'Elderly_Population',
    'Young_Population',
    'Total_Population',
    # tile identity and previous panel area
    'tile_encoded',
    'panel_area_lag1',
]

# A row is usable only if every feature and the log-area target are present
df_model = df_full.dropna(
    subset=[*feature_cols, 'panel_area_log'],
)
print(f"After dropping NaN: {len(df_model)} rows")

After dropping NaN: 12479 rows


In [6]:
# Cell 4: Train the Stage 1 model (classification: has_solar yes/no)
# Order rows chronologically before the year-based split
df_model = df_model.sort_values("year")

# Temporal hold-out: latest year -> test, the year before it -> validation,
# every earlier year -> training
test_year = df_model["year"].max()
val_year = df_model.loc[df_model["year"] < test_year, "year"].max()
train_df = df_model[df_model["year"].lt(val_year)]
val_df = df_model[df_model["year"].eq(val_year)]
test_df = df_model[df_model["year"].eq(test_year)]

# Feature matrix / binary target for each split
X_train_1, y_train_1 = train_df[feature_cols], train_df["has_solar"]
X_val_1, y_val_1 = val_df[feature_cols], val_df["has_solar"]
X_test_1, y_test_1 = test_df[feature_cols], test_df["has_solar"]

# Fit the classifier; n_estimators is only an upper bound because training
# stops once validation AUC has not improved for 100 rounds
print("Training Stage 1 (Classification)...")
clf = lgb.LGBMClassifier(
    random_state=42,
    class_weight="balanced",
    num_leaves=32,
    learning_rate=0.05,
    n_estimators=5000,
)

clf.fit(
    X_train_1,
    y_train_1,
    eval_metric="auc",
    eval_set=[(X_val_1, y_val_1)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=False),
    ],
)

# Score the held-out test year using the positive-class probability
p_solar_test = clf.predict_proba(X_test_1)[:, 1]
test_auc = roc_auc_score(y_test_1, p_solar_test)
print(f"✓ Stage 1 TEST ROC-AUC: {test_auc:.3f}")

Training Stage 1 (Classification)...
[LightGBM] [Info] Number of positive: 7760, number of negative: 1591
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1172
[LightGBM] [Info] Number of data points in the train set: 9351, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
✓ Stage 1 TEST ROC-AUC: 0.996


In [7]:
# Cell 5: Load Stage 2 model (Regression)
# The regressor is not trained in this notebook; it is restored from a JSON
# model file written elsewhere.
# NOTE(review): predict_future() feeds this model the same feature_cols as the
# Stage 1 classifier — confirm the saved model was trained on those columns.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
print("Loading Stage 2 (Regression)...")
model_stage2 = xgb.XGBRegressor()
model_stage2.load_model("/home/elsherif/Desktop/Thesis/ViewPython/Training/stage2_xgb.json")
print("✓ Stage 2 model loaded successfully")

Loading Stage 2 (Regression)...
✓ Stage 2 model loaded successfully


In [8]:
# Cell 6: Function to create forecast features
def create_forecast_features(df, forecast_year, feature_cols):
    """
    Build one feature row per tile for ``forecast_year``.

    Starts from the rows of the most recent year in ``df``, relabels them with
    ``forecast_year`` and then overwrites:

    * each demographic/economic column with a per-tile linear trend
      (least-squares fit of value on year) evaluated at ``forecast_year`` and
      clipped to a plausible range;
    * ``panel_area_lag1`` with the tile's most recent ``panel_area_m2``.

    Every other column is carried forward unchanged from the latest year.

    Parameters
    ----------
    df : pandas.DataFrame
        Historical data; must contain ``tile``, ``year`` and ``panel_area_m2``.
        Trend columns that are absent are simply skipped.
    forecast_year : int
        Year at which trends are evaluated; also written to the ``year`` column.
    feature_cols : list of str
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) with one row per latest-year row.

    Notes
    -----
    * The lag is refreshed for every tile, including tiles with a single
      history row (these are too short for a trend but still have a latest area).
    * Trends are fitted on non-missing observations only; a tile/column with
      fewer than two distinct usable years keeps its latest-year value.
    * NOTE(review): the lag is written in raw m2 — confirm that matches the
      scale of ``panel_area_lag1`` in the training data.
    """
    # (lower, upper) bounds per extrapolated column; None means unbounded
    trend_bounds = {
        'Average_Age': (20, 60),
        'Elderly_Population': (0, None),
        'Young_Population': (0, None),
        'Total_Population': (0, None),
        'Unemployment_Rate': (0, 20),
    }
    trend_cols = [col for col in trend_bounds if col in df.columns]

    latest_year = df['year'].max()
    forecast_data = df[df['year'] == latest_year].copy()
    forecast_data['year'] = forecast_year
    forecast_tiles = set(forecast_data['tile'])

    trend_by_col = {col: {} for col in trend_cols}  # col -> {tile: forecast}
    lag_by_tile = {}                                # tile -> latest panel area

    # One pass over the history instead of a full-frame scan per tile
    for tile, tile_history in df.groupby('tile', sort=False):
        if tile not in forecast_tiles:
            continue

        tile_history = tile_history.sort_values('year')
        lag_by_tile[tile] = tile_history['panel_area_m2'].iloc[-1]

        years = tile_history['year'].to_numpy(dtype=float)
        for col in trend_cols:
            values = tile_history[col].to_numpy(dtype=float)
            usable = ~(np.isnan(values) | np.isnan(years))
            # linregress needs at least two distinct x values
            if np.unique(years[usable]).size < 2:
                continue

            fit = stats.linregress(years[usable], values[usable])
            lower, upper = trend_bounds[col]
            projected = fit.slope * forecast_year + fit.intercept
            trend_by_col[col][tile] = float(np.clip(projected, lower, upper))

    tile_keys = forecast_data['tile']

    for col, per_tile in trend_by_col.items():
        if not per_tile:
            continue
        has_trend = tile_keys.isin(list(per_tile))
        # Cast first: forecasts are floats even when the source column is int
        forecast_data[col] = (
            forecast_data[col].astype(float).where(~has_trend, tile_keys.map(per_tile))
        )

    if lag_by_tile:
        has_lag = tile_keys.isin(list(lag_by_tile))
        refreshed_lag = tile_keys.map(lag_by_tile)
        if 'panel_area_lag1' in forecast_data.columns:
            refreshed_lag = forecast_data['panel_area_lag1'].where(~has_lag, refreshed_lag)
        forecast_data['panel_area_lag1'] = refreshed_lag

    return forecast_data

print("✓ Forecast feature function defined")

✓ Forecast feature function defined


In [9]:
# Cell 7: Function to make predictions
def predict_future(clf, model_stage2, forecast_data, feature_cols):
    """
    Make 2-stage predictions for future data.

    Stage 1 (``clf``) gives the probability that a row has solar at all;
    Stage 2 (``model_stage2``) gives the panel area on a log1p scale, which is
    mapped back with ``expm1``. The final estimate is their product.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba``; column 1 is read as the
        positive-class probability.
    model_stage2 : object
        Fitted regressor exposing ``predict``; its output is treated as
        log(1 + area).
    forecast_data : pandas.DataFrame
        Rows to score; must contain every column in ``feature_cols``.
    feature_cols : list of str
        Columns passed to both models, in this order.

    Returns
    -------
    pandas.DataFrame
        A copy of ``forecast_data`` (the input is left untouched) with three
        extra columns: ``predicted_adoption_prob``, ``predicted_panel_area_m2``
        (probability x size) and ``predicted_panel_area_raw`` (size only).
    """
    X_forecast = forecast_data[feature_cols]

    # Stage 1: probability of solar adoption
    p_solar = clf.predict_proba(X_forecast)[:, 1]

    # Stage 2: panel area, predicted on the log1p scale
    y_pred_log = model_stage2.predict(X_forecast)
    # A negative log-prediction would map to a negative area; floor it at zero
    y_pred_size = np.clip(np.expm1(y_pred_log), 0, None)

    # assign() returns a new frame, so the caller's data is not mutated
    return forecast_data.assign(
        predicted_adoption_prob=p_solar,
        predicted_panel_area_m2=p_solar * y_pred_size,
        predicted_panel_area_raw=y_pred_size,
    )

print("✓ Prediction function defined")

✓ Prediction function defined


In [None]:
# Cell 8: Generate multi-year forecasts (2025-2030)
print("=" * 60)
print("GENERATING MULTI-YEAR FORECASTS")
print("=" * 60)

# Must be consecutive and ascending, starting right after the last observed
# year, because each year's lag feature is taken from the year before it.
forecast_years = [2025, 2026, 2027, 2028, 2029, 2030]
forecasts_all = []

# tile -> predicted panel area (m2) of the previously forecast year.
# None for the first year, whose lag is the last observed panel area.
prev_area_by_tile = None

for year in forecast_years:
    print(f"Forecasting {year}...", end=" ")
    forecast_data = create_forecast_features(df_full, year, feature_cols)

    # create_forecast_features always sets panel_area_lag1 to the last OBSERVED
    # area, which is only a true one-year lag for the first forecast year.
    # For later years roll the lag forward from the previous year's prediction.
    # NOTE(review): uses the combined estimate (probability x size) as the lag,
    # in raw m2 — confirm this matches how panel_area_lag1 was built for training.
    if prev_area_by_tile is not None:
        rolled_lag = forecast_data['tile'].map(prev_area_by_tile)
        forecast_data['panel_area_lag1'] = rolled_lag.fillna(forecast_data['panel_area_lag1'])

    forecast_pred = predict_future(clf, model_stage2, forecast_data, feature_cols)
    forecasts_all.append(forecast_pred)

    prev_area_by_tile = dict(
        zip(forecast_pred['tile'], forecast_pred['predicted_panel_area_m2'])
    )
    print(f"✓ {len(forecast_pred)} tiles")

# Combine all forecasts
df_forecasts = pd.concat(forecasts_all, ignore_index=True)
print(f"\n✓ Total forecast rows: {len(df_forecasts)}")

GENERATING MULTI-YEAR FORECASTS
Forecasting 2025... 