In [2]:
import pandas as pd

# Load the datasets
load_data = pd.read_csv('../Load_history_final.csv')
temp_data = pd.read_csv('../Temp_history_final.csv')

# Function to create a datetime column from year, month, and day
def create_datetime(df):
    # Ensure year, month, and day are integers and create a datetime column
    df['datetime'] = pd.to_datetime(df[['year', 'month', 'day']].astype(int))
    return df

# Apply the function to both datasets
load_data = create_datetime(load_data)
temp_data = create_datetime(temp_data)

# Exclude data from June 2008
load_data = load_data[~((load_data['datetime'].dt.month == 6) & (load_data['datetime'].dt.year == 2008))]
temp_data = temp_data[~((temp_data['datetime'].dt.month == 6) & (temp_data['datetime'].dt.year == 2008))]

# Melting the data to convert hours columns into rows
def melt_data(df, id_vars, value_name):
    return df.melt(id_vars=id_vars, var_name='hour', value_vars=[f'h{i}' for i in range(1, 25)], value_name=value_name)

load_data_melt = melt_data(load_data, ['zone_id', 'datetime'], 'load')
temp_data_melt = melt_data(temp_data, ['station_id', 'datetime'], 'temperature')

# Merge the datasets on datetime and hour
combined_data = pd.merge(load_data_melt, temp_data_melt, on=['datetime', 'hour'])

# Calculate correlations and determine the best station for each zone
zone_station_correlation = combined_data.groupby(['zone_id', 'station_id']).apply(lambda df: df['load'].corr(df['temperature']))

# Reset index so we can access the correlation values
correlation_data = zone_station_correlation.reset_index(name='correlation')

# Handle NaN correlations that can occur if there is no variance in temperature or load
correlation_data = correlation_data.dropna(subset=['correlation'])

# Determine the best station for each zone based on the highest correlation
def get_best_station(group):
    return group.loc[group['correlation'].idxmax()]

best_stations = correlation_data.groupby('zone_id').apply(get_best_station)

# Output results
print("Best station for each load zone based on maximum correlation:")
print(best_stations[['zone_id', 'station_id', 'correlation']])

# Optionally, save the merged dataset for further analysis
combined_data.to_csv('combined_data.csv', index=False)


Best station for each load zone based on maximum correlation:
         zone_id  station_id  correlation
zone_id                                  
1            1.0         8.0    -0.072835
2            2.0        10.0    -0.153490
3            3.0         1.0    -0.119071
4            4.0         8.0    -0.206846
5            5.0         8.0     0.011229
6            6.0         1.0    -0.119579
7            7.0         1.0    -0.126853
8            8.0         6.0    -0.013531
9            9.0         1.0    -0.186813
10          10.0         8.0    -0.215592
11          11.0         1.0    -0.424392
12          12.0        10.0    -0.031053
13          13.0         8.0    -0.129365
14          14.0         1.0    -0.302143
15          15.0         3.0    -0.091166
16          16.0        10.0    -0.095418
17          17.0         1.0    -0.119998
18          18.0         1.0    -0.385365
19          19.0         2.0    -0.061400
20          20.0         8.0    -0.205292


In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, r2_score

# Load the dataset
combined_data = pd.read_csv('combined_data.csv')
combined_data['datetime'] = pd.to_datetime(combined_data['datetime'])
combined_data['hour'] = pd.to_numeric(combined_data['hour'].str.extract('(\d+)')[0])

# Zone to station mapping based on best correlation
zone_station_map = {
    1: 8, 2: 10, 3: 1, 4: 8, 5: 8, 6: 1, 7: 1, 8: 6, 
    9: 1, 10: 8, 11: 1, 12: 10, 13: 8, 14: 1, 15: 3, 
    16: 10, 17: 1, 18: 1, 19: 2, 20: 8
}

# Define hyperparameters for grid search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 4, 6]
}

# Store models and their scores for each zone
models = {}
model_scores = {}

for zone, station in zone_station_map.items():
    # Filter data for this zone and station
    zone_data = combined_data[(combined_data['zone_id'] == zone) & (combined_data['station_id'] == station)]
    
    # Apply specific filter for zone 15
    if zone == 15:
        zone_data = zone_data[zone_data['load'] < 200000]  # Exclude load values above 200,000 for zone 15
    
    # Features and target
    X = zone_data[['temperature', 'hour']]
    y = zone_data['load']
    
    # Split data into training/validation (80%) and test (20%) sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Further split the training/validation data into training (80%) and validation (20%) sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)
    
    # Initialize Grid Search with cross-validation
    grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring=make_scorer(r2_score))
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_

    # Evaluate the best model on the training, validation, and test sets
    train_score = best_model.score(X_train, y_train)
    val_score = best_model.score(X_val, y_val)
    test_score = best_model.score(X_test, y_test)

    # Store the best model and scores
    models[zone] = best_model
    model_scores[zone] = {'Training Score': train_score, 'Validation Score': val_score, 'Test Score': test_score}
    
    # Print model performance and best parameters for each zone
    print(f'Zone {zone}: Best Parameters = {grid_search.best_params_}')
    print(f'Zone {zone}: Training Score = {train_score:.4f}, Validation Score = {val_score:.4f}, Test Score = {test_score:.4f}')

# Calculate weighted average scores
general_train_score = total_train_score / total_samples_train
general_val_score = total_val_score / total_samples_val
general_test_score = total_test_score / total_samples_test

# Output the general scores
print(f'General Training Score: {general_train_score:.4f}')
print(f'General Validation Score: {general_val_score:.4f}')
print(f'General Test Score: {general_test_score:.4f}')

Zone 1: Best Parameters = {'max_depth': 10, 'min_samples_split': 6, 'n_estimators': 150}
Zone 1: Training Score = 0.7523, Validation Score = 0.7278, Test Score = 0.7280
Zone 2: Best Parameters = {'max_depth': 10, 'min_samples_split': 6, 'n_estimators': 150}
Zone 2: Training Score = 0.7754, Validation Score = 0.7562, Test Score = 0.7582
Zone 3: Best Parameters = {'max_depth': 10, 'min_samples_split': 6, 'n_estimators': 150}
Zone 3: Training Score = 0.7207, Validation Score = 0.6853, Test Score = 0.6966
Zone 4: Best Parameters = {'max_depth': 10, 'min_samples_split': 6, 'n_estimators': 150}
Zone 4: Training Score = 0.8160, Validation Score = 0.7911, Test Score = 0.7964
Zone 5: Best Parameters = {'max_depth': 10, 'min_samples_split': 6, 'n_estimators': 150}
Zone 5: Training Score = 0.7587, Validation Score = 0.7334, Test Score = 0.7341
Zone 6: Best Parameters = {'max_depth': 10, 'min_samples_split': 6, 'n_estimators': 50}
Zone 6: Training Score = 0.7210, Validation Score = 0.6855, Test Sc

KeyboardInterrupt: 

In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load the dataset
combined_data = pd.read_csv('combined_data.csv')
combined_data['datetime'] = pd.to_datetime(combined_data['datetime'])
combined_data['hour'] = pd.to_numeric(combined_data['hour'].str.extract('(\d+)')[0])

# Zone to station mapping based on best correlation
zone_station_map = {
    1: 8, 2: 10, 3: 1, 4: 8, 5: 8, 6: 1, 7: 1, 8: 6, 
    9: 1, 10: 8, 11: 1, 12: 10, 13: 8, 14: 1, 15: 3, 
    16: 10, 17: 1, 18: 1, 19: 2, 20: 8
}

# Initialize accumulators for weighted average calculations
total_train_score = 0
total_val_score = 0
total_test_score = 0
total_samples_train = 0
total_samples_val = 0
total_samples_test = 0

for zone, station in zone_station_map.items():
    # Filter data for this zone and station
    zone_data = combined_data[(combined_data['zone_id'] == zone) & (combined_data['station_id'] == station)]

    # Apply specific filter for zone 15
    if zone == 15:
        zone_data = zone_data[zone_data['load'] < 200000]  # Exclude load values above 200,000 for zone 15
    
    # Features and target
    X = zone_data[['temperature', 'hour']]
    y = zone_data['load']
    
    # Split data into training/validation (80%) and test (20%) sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Further split the training/validation data into training (80%) and validation (20%) sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # Note: 0.25 * 0.8 = 0.2
    
    # Initialize and train the Random Forest Regressor
    model = RandomForestRegressor(n_estimators=150, max_depth = 10, min_samples_split = 6, random_state=42)
    model.fit(X_train, y_train)
    
    # Calculate scores
    train_score = model.score(X_train, y_train)
    val_score = model.score(X_val, y_val)
    test_score = model.score(X_test, y_test)
    
    # Accumulate scores weighted by number of samples
    total_train_score += train_score * len(y_train)
    total_val_score += val_score * len(y_val)
    total_test_score += test_score * len(y_test)
    total_samples_train += len(y_train)
    total_samples_val += len(y_val)
    total_samples_test += len(y_test)

# Calculate weighted average scores
general_train_score = total_train_score / total_samples_train
general_val_score = total_val_score / total_samples_val
general_test_score = total_test_score / total_samples_test

# Output the general scores
print(f'General Training Score: {general_train_score:.4f}')
print(f'General Validation Score: {general_val_score:.4f}')
print(f'General Test Score: {general_test_score:.4f}')


General Training Score: 0.7248
General Validation Score: 0.6969
General Test Score: 0.7024


In [5]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score

# Load the dataset
combined_data = pd.read_csv('combined_data.csv')
combined_data['datetime'] = pd.to_datetime(combined_data['datetime'])
combined_data['hour'] = pd.to_numeric(combined_data['hour'].str.extract('(\d+)')[0])

# Zone to station mapping based on best correlation
zone_station_map = {
    1: 8, 2: 10, 3: 1, 4: 8, 5: 8, 6: 1, 7: 1, 8: 6, 
    9: 1, 10: 8, 11: 1, 12: 10, 13: 8, 14: 1, 15: 3, 
    16: 10, 17: 1, 18: 1, 19: 2, 20: 8
}

# Define the parameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 8],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Initialize accumulators for weighted average calculations
total_train_score = 0
total_val_score = 0
total_test_score = 0
total_samples_train = 0
total_samples_val = 0
total_samples_test = 0

for zone, station in zone_station_map.items():
    # Filter data for this zone and station
    zone_data = combined_data[(combined_data['zone_id'] == zone) & (combined_data['station_id'] == station)]

    # Apply specific filter for zone 15
    if zone == 15:
        zone_data = zone_data[zone_data['load'] < 200000]  # Exclude load values above 200,000 for zone 15
    
    # Features and target
    X = zone_data[['temperature', 'hour']]
    y = zone_data['load']
    
    # Split data into training/validation (80%) and test (20%) sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Further split the training/validation data into training (80%) and validation (20%) sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)
    
    # Set up GridSearchCV to find the best parameters
    grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=3, scoring='r2')
    grid_search.fit(X_train, y_train)

    # Retrieve the best estimator
    best_model = grid_search.best_estimator_
    
    # Calculate scores
    train_score = best_model.score(X_train, y_train)
    val_score = best_model.score(X_val, y_val)
    test_score = best_model.score(X_test, y_test)
    
    # Accumulate scores weighted by number of samples
    total_train_score += train_score * len(y_train)
    total_val_score += val_score * len(y_val)
    total_test_score += test_score * len(y_test)
    total_samples_train += len(y_train)
    total_samples_val += len(y_val)
    total_samples_test += len(y_test)

    # Output best parameters and scores for each zone
    print(f'Zone {zone}: Best Parameters = {grid_search.best_params_}')
    print(f'Zone {zone}: Training Score = {train_score:.4f}, Validation Score = {val_score:.4f}, Test Score = {test_score:.4f}')

# Calculate weighted average scores
general_train_score = total_train_score / total_samples_train
general_val_score = total_val_score / total_samples_val
general_test_score = total_test_score / total_samples_test

# Output the general scores
print(f'General Training Score: {general_train_score:.4f}')
print(f'General Validation Score: {general_val_score:.4f}')
print(f'General Test Score: {general_test_score:.4f}')


Zone 1: Best Parameters = {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150}
Zone 1: Training Score = 0.7461, Validation Score = 0.7359, Test Score = 0.7351
Zone 2: Best Parameters = {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150}
Zone 2: Training Score = 0.7691, Validation Score = 0.7616, Test Score = 0.7650
Zone 3: Best Parameters = {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150}
Zone 3: Training Score = 0.7160, Validation Score = 0.6948, Test Score = 0.7030
Zone 4: Best Parameters = {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150}
Zone 4: Training Score = 0.8111, Validation Score = 0.8003, Test Score = 0.8035
Zone 5: Best Parameters = {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150}
Zone 5: Training Score = 0.7515, Validation Score = 0.7425, Test Score = 0.7411
Zone 6: Best Parameters = {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150}
Zone 6: Training Score = 0.7166, Validation Score = 0.6957, Test Score = 0.7033
Zone

In [4]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load the dataset
combined_data = pd.read_csv('combined_data.csv')
combined_data['datetime'] = pd.to_datetime(combined_data['datetime'])
combined_data['hour'] = pd.to_numeric(combined_data['hour'].str.extract('(\d+)')[0])

# Zone to station mapping based on best correlation
zone_station_map = {
    1: 8, 2: 10, 3: 1, 4: 8, 5: 8, 6: 1, 7: 1, 8: 6, 
    9: 1, 10: 8, 11: 1, 12: 10, 13: 8, 14: 1, 15: 3, 
    16: 10, 17: 1, 18: 1, 19: 2, 20: 8
}

# Initialize accumulators for weighted average calculations
total_train_score = 0
total_val_score = 0
total_test_score = 0
total_samples_train = 0
total_samples_val = 0
total_samples_test = 0

for zone, station in zone_station_map.items():
    # Filter data for this zone and station
    zone_data = combined_data[(combined_data['zone_id'] == zone) & (combined_data['station_id'] == station)]

    # Apply specific filter for zone 15
    if zone == 15:
        zone_data = zone_data[zone_data['load'] < 200000]  # Exclude load values above 200,000 for zone 15
    
    # Features and target
    X = zone_data[['temperature', 'hour']]
    y = zone_data['load']
    
    # Split data into training/validation (80%) and test (20%) sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Further split the training/validation data into training (80%) and validation (20%) sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)
    
    # Initialize and train the Gradient Boosting Regressor
    model = GradientBoostingRegressor(learning_rate = 0.2, max_depth = 3, n_estimators = 150, random_state=42)
    model.fit(X_train, y_train)
    
    # Calculate scores
    train_score = model.score(X_train, y_train)
    val_score = model.score(X_val, y_val)
    test_score = model.score(X_test, y_test)
    
    # Accumulate scores weighted by number of samples
    total_train_score += train_score * len(y_train)
    total_val_score += val_score * len(y_val)
    total_test_score += test_score * len(y_test)
    total_samples_train += len(y_train)
    total_samples_val += len(y_val)
    total_samples_test += len(y_test)

# Calculate weighted average scores
general_train_score = total_train_score / total_samples_train
general_val_score = total_val_score / total_samples_val
general_test_score = total_test_score / total_samples_test

# Output the general scores
print(f'General Training Score: {general_train_score:.4f}')
print(f'General Validation Score: {general_val_score:.4f}')
print(f'General Test Score: {general_test_score:.4f}')


General Training Score: 0.7178
General Validation Score: 0.7047
General Test Score: 0.7095


In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings(action='ignore', category=UserWarning)
                       
# Load the dataset
combined_data = pd.read_csv('combined_data.csv')
combined_data['datetime'] = pd.to_datetime(combined_data['datetime'])
combined_data['hour'] = pd.to_numeric(combined_data['hour'].str.extract('(\d+)')[0])

# Simulate loading temperature data
temp_data = pd.read_csv('../Temp_history_final.csv')

# Prepare the DataFrame to store all predictions
all_predictions = []

# Zone to station mapping based on best correlation
zone_station_map = {
    1: 8, 2: 10, 3: 1, 4: 8, 5: 8, 6: 1, 7: 1, 8: 6, 
    9: 1, 10: 8, 11: 1, 12: 10, 13: 8, 14: 1, 15: 3, 
    16: 10, 17: 1, 18: 1, 19: 2, 20: 8
}

# Train a model for each zone using its specific station's historical data
for zone, station in zone_station_map.items():
    # Filter data for this zone and station
    zone_data = combined_data[(combined_data['zone_id'] == zone) & (combined_data['station_id'] == station)]
    X = zone_data[['temperature', 'hour']]
    y = zone_data['load']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = GradientBoostingRegressor(learning_rate=0.2, max_depth=3, n_estimators=150, random_state=42)
    model.fit(X_train, y_train)

    # Predict for specific days using temperature forecasts
    for day in range(1, 8):  # June 1 to June 7
        for hour in range(1, 25):  # 24 hours, adjusting index for correct hour labels
            hour_col = 'h' + str(hour)
            if hour_col in temp_data.columns and not temp_data[temp_data['station_id'] == station][hour_col].isnull().all():
                hourly_temp = temp_data.loc[(temp_data['station_id'] == station) & (temp_data['day'] == day), hour_col].values
                if hourly_temp.size > 0:
                    # Prepare the feature array for prediction
                    features = np.array([[hourly_temp[0], hour - 1]])  # Adjust hour for zero-indexing in the model
                    prediction = model.predict(features.reshape(1, -2))
                    # Store the prediction
                    all_predictions.append({
                        'zone_id': zone,
                        'year': 2008,
                        'month': 6,
                        'day': day,
                        'hour': hour,  # Use the correct hour for output
                        'predicted_load': prediction[0]
                    })

# Convert predictions list to DataFrame
predictions_df = pd.DataFrame(all_predictions)

# Pivot DataFrame to wide format to match required output
pivot_df = predictions_df.pivot_table(index=['zone_id', 'year', 'month', 'day'], columns='hour', values='predicted_load', aggfunc='first').reset_index()
pivot_df.columns = [f'h{col}' if isinstance(col, int) else col for col in pivot_df.columns]

# Save to CSV
pivot_df.to_csv('Load_prediction.csv', index=False)

warnings.filterwarnings(action='default', category=UserWarning)