# Pipeline

**Loading of datasets**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore, iqr
from sklearn.model_selection import train_test_split
import numpy as np
import missingno as msno 
# Show every row and every column when a DataFrame is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Load the raw weather and energy tables.
data_weather = pd.read_csv("spanish-cities-energy-consumption/weather_features.csv")
data_generation = pd.read_csv("spanish-cities-energy-consumption/energy_dataset.csv")

# Parse the timestamp columns into timezone-aware UTC datetimes.
# NOTE(review): the ISO strings presumably carry UTC offsets that change with
# daylight saving time (confirm against the CSV files). Without utc=True,
# pandas cannot build one datetime64 column from mixed offsets: it falls back
# to object dtype or raises, depending on the pandas version.
data_weather['dt_iso'] = pd.to_datetime(data_weather['dt_iso'], utc=True)
data_generation['time'] = pd.to_datetime(data_generation['time'], utc=True)


**Data Exploration**

In [None]:
# Preview the first rows of the weather table.
data_weather.head()

In [None]:
# Preview the first rows of the energy table.
data_generation.head()

In [None]:
# Summary statistics of the weather table.
data_weather.describe()

In [None]:
# Summary statistics of the energy table.
data_generation.describe()


In [6]:
# Drop eight generation/forecast columns from the energy table, in place.
data_generation.drop(
    labels=[
        "generation fossil coal-derived gas",
        "generation hydro pumped storage aggregated",
        "generation fossil oil shale",
        "generation fossil peat",
        "generation geothermal",
        "generation marine",
        "generation wind offshore",
        "forecast wind offshore eday ahead",
    ],
    axis="columns",
    inplace=True,
)

In [None]:
# Report the dtype of every column of the weather table.
for column_name, column_dtype in zip(data_weather.columns, data_weather.dtypes):
    print(f"Column '{column_name}' has data type: {column_dtype}")

In [None]:
# Report the dtype of every column of the energy table.
for column_name, column_dtype in zip(data_generation.columns, data_generation.dtypes):
    print(f"Column '{column_name}' has data type: {column_dtype}")

In [None]:
# True if the weather table contains at least one missing value.
print(data_weather.isna().to_numpy().any())

In [None]:
# True if the energy table contains at least one missing value.
print(data_generation.isna().to_numpy().any())

In [11]:
# Split the weather table into a numeric part and a categorical part.
# 'weather_id' is selected as a numeric column, but it is moved to the
# categorical frame; the timestamp column 'dt_iso' belongs to neither.
weather_num = data_weather.select_dtypes(include='number').drop(columns='weather_id')
weather_cat = data_weather.select_dtypes(exclude='number').drop(columns='dt_iso')
weather_cat['weather_id'] = data_weather['weather_id']

# Numeric columns of the energy table.
generation_num = data_generation.select_dtypes(include='number')

In [None]:
# Collect the distinct values of each categorical weather column so that
# unexpected categories can be spotted by eye.
unique_values_all = {}
for name in weather_cat.columns:
    unique_values_all[name] = weather_cat[name].unique()

print(unique_values_all)


In [13]:
def visualData(data, time):
    """Plot a histogram and a value-over-time scatter for every column.

    One figure with two side-by-side panels is created and shown per column;
    the column name is printed before its figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns to plot.
    time : array-like
        X values for the scatter panel, one per row of ``data``.

    Returns
    -------
    None
    """
    for col in data.columns:
        print(col)
        plt.figure(figsize=(12, 5))

        # Left panel: distribution of the raw values.
        plt.subplot(1, 2, 1)
        plt.hist(data[col], bins=100, edgecolor='k', alpha=0.7)
        plt.title(f'Histogram of values {col}')
        plt.xlabel('value')
        plt.ylabel('Frequency')
        # No legend here: nothing in this panel carries a label, so a legend
        # call would only emit a "no artists with labels" warning.

        # Right panel: raw values against time.
        plt.subplot(1, 2, 2)
        plt.scatter(time, data[col], alpha=0.7)
        plt.title(f'Scatter Plot of values over time {col}')
        plt.xlabel('time')
        plt.ylabel('value')
        plt.grid(True)

        plt.tight_layout()
        plt.show()

In [None]:
# Histogram and time scatter for every numeric weather column.
visualData(data=weather_num, time=data_weather['dt_iso'])

In [None]:
# Histogram and time scatter for every numeric energy column.
visualData(data=generation_num, time=data_generation['time'])

In [16]:
# Column groups used to pick the outlier-detection method further down:
# the "normal" lists are passed to Zscore, the "skewed" lists to IQR.

# Weather dataset
normal_weather_columns = [
    "temp",
    'temp_min',
    'temp_max',
    'pressure',
]
skewed_weather_columns = [
    'humidity',
    'wind_speed',
    'wind_deg',
    'rain_1h',
    'rain_3h',
    'snow_3h',
    'clouds_all',
]

# Energy dataset
normal_energy_columns = [
    'generation biomass',
    'generation fossil hard coal',
    'generation fossil oil',
    'generation other',
    'generation other renewable',
    'total load forecast',
    'total load actual',
    'price day ahead',
    'price actual',
]
skewed_energy_columns = [
    'generation fossil brown coal/lignite',
    'generation fossil gas',
    'generation hydro pumped storage consumption',
    'generation hydro run-of-river and poundage',
    'generation hydro water reservoir',
    'generation nuclear',
    'generation solar',
    'generation waste',
    'generation wind onshore',
    'forecast solar day ahead',
    'forecast wind onshore day ahead',
]

**Missing Values**

In [None]:
# Count the missing entries in each column of the energy table.
missing_values_sum = data_generation.isna().sum()
missing_values_sum

In [None]:


# Energy columns whose gaps are filled with the column median.
columns_to_median_impute = [
    'generation fossil brown coal/lignite',
    'generation fossil gas',
    'generation hydro pumped storage consumption',
    'generation hydro run-of-river and poundage',
    'generation hydro water reservoir',
    'generation nuclear',
    'generation solar',
    'generation waste',
    'generation wind onshore',
]

# Energy columns whose gaps are filled with the column mean.
columns_to_mean_impute = [
    'generation biomass',
    'generation fossil hard coal',
    'generation fossil oil',
    'generation other',
    'generation other renewable',
    'total load actual',
]

# fillna aligns the statistics Series with the frame by column label.
generation_num[columns_to_median_impute] = generation_num[columns_to_median_impute].fillna(
    generation_num[columns_to_median_impute].median(numeric_only=True)
)
generation_num[columns_to_mean_impute] = generation_num[columns_to_mean_impute].fillna(
    generation_num[columns_to_mean_impute].mean(numeric_only=True)
)

# Verify the result: per-column counts first, then the grand total.
print(generation_num.isna().sum())

print("Number of missing values now is:", generation_num.isna().sum().sum())

**Outliers**

In [19]:
def Zscore(data, threshold=3):
    """Flag values whose z-score magnitude exceeds ``threshold``.

    Z-scores are computed per column (axis 0) with the population standard
    deviation (``ddof=0``). For pandas input the arithmetic is done with
    pandas itself, so the result always keeps the input's index and column
    labels, and NaN entries are skipped when computing mean and standard
    deviation instead of turning the whole column into NaN.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series or array-like
        Numeric values to score.
    threshold : float, default 3
        A value is an outlier when ``abs(z) > threshold``.

    Returns
    -------
    outliers : same type as ``z_scores``
        Boolean mask, True where the value is an outlier. NaN z-scores are
        never flagged.
    z_scores : same container type as ``data`` (ndarray for array-likes)
        The z-score of every value.
    """
    if not isinstance(data, (pd.DataFrame, pd.Series)):
        # Lists / tuples / arrays: fall back to plain NumPy arithmetic.
        data = np.asarray(data, dtype=float)

    z_scores = (data - data.mean(axis=0)) / data.std(axis=0, ddof=0)
    # Comparisons against NaN are False, so missing values are not flagged.
    outliers = np.abs(z_scores) > threshold
    return outliers, z_scores

# Z-score outlier masks and scores for the columns listed in the
# normal_*_columns groups of each dataset.
weather_outliers, weather_zscores = Zscore(weather_num[normal_weather_columns])
generation_outliers, generation_zscore = Zscore(generation_num[normal_energy_columns])

In [20]:
def visualZscore(z_scores, outliers, threshold=3):
    """Plot the z-score distribution and per-row z-scores of every column.

    One figure with two panels is created and shown per column: a log-scaled
    histogram of the z-scores, and a scatter of z-score against row position
    in which outliers are drawn in red and annotated with their score.

    Parameters
    ----------
    z_scores : pandas.DataFrame
        Z-scores, one column per feature.
    outliers : pandas.DataFrame
        Boolean mask with the same columns as ``z_scores``; True marks an
        outlier.
    threshold : float, default 3
        Position of the dashed +/- threshold guide lines. It only affects
        the guide lines; which points count as outliers comes from
        ``outliers``.

    Returns
    -------
    None
    """
    for col in z_scores.columns:
        scores = z_scores[col]
        flagged = outliers[col]

        plt.figure(figsize=(12, 5))

        # Left panel: histogram of the z-scores with the threshold lines.
        plt.subplot(1, 2, 1)
        plt.hist(scores, bins=100, edgecolor='k', alpha=0.7)
        plt.axvline(threshold, color='r', linestyle='dashed', linewidth=1.5, label=f'Outlier Threshold (+{threshold})')
        plt.axvline(-threshold, color='r', linestyle='dashed', linewidth=1.5, label=f'Outlier Threshold (-{threshold})')
        plt.title(f'Histogram of Z-scores {col}')
        plt.xlabel('Z-score')
        plt.ylabel('Frequency')
        plt.yscale('log')
        plt.legend()

        # Right panel: z-score of every row, outliers coloured red.
        plt.subplot(1, 2, 2)
        plt.scatter(range(len(scores)), scores, c=['red' if is_outlier else 'blue' for is_outlier in flagged], alpha=0.7)
        plt.axhline(threshold, color='r', linestyle='dashed', linewidth=1.5)
        plt.axhline(-threshold, color='r', linestyle='dashed', linewidth=1.5)
        plt.title(f'Scatter Plot of Z-scores {col}')
        plt.xlabel('Index')
        plt.ylabel('Z-score')
        plt.grid(True)

        # Annotate each outlier with its score. The mask must be this
        # column's mask: iterating the DataFrame itself yields column names,
        # which are always truthy and would label the wrong points.
        for i, (score, is_outlier) in enumerate(zip(scores, flagged)):
            if is_outlier:
                plt.text(i, score, f'{score:.2f}', fontsize=9, color='red', ha='left', va='bottom')

        plt.tight_layout()
        plt.show()

In [21]:


def IQR(df):
    """Find outliers of every column with the 1.5 * IQR rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to inspect.

    Returns
    -------
    outliers_dict : dict
        Maps each column name to the rows of ``df`` whose value in that
        column lies strictly below the lower or strictly above the upper
        bound.
    lower_bounds : pandas.Series
        ``Q1 - 1.5 * (Q3 - Q1)`` per column.
    upper_bounds : pandas.Series
        ``Q3 + 1.5 * (Q3 - Q1)`` per column.
    """
    stats = df.describe()
    first_quartile, third_quartile = stats.loc['25%'], stats.loc['75%']
    spread = third_quartile - first_quartile

    lower_bounds = first_quartile - 1.5 * spread
    upper_bounds = third_quartile + 1.5 * spread

    # One entry per column: the full rows that fall outside that column's bounds.
    outliers_dict = {
        feature: df[(df[feature] < lower_bounds[feature]) | (df[feature] > upper_bounds[feature])]
        for feature in df.columns
    }

    return outliers_dict, lower_bounds, upper_bounds

# IQR outlier rows and lower/upper bounds for the columns listed in the
# skewed_*_columns groups of each dataset.
weather_iqr, weather_lower_bound, weather_upper_bound = IQR(weather_num[skewed_weather_columns])
generation_iqr, generation_lower_bound, generation_upper_bound = IQR(generation_num[skewed_energy_columns])



In [22]:
def visualIQR(data, lower_bound, upper_bound):
    """Plot every column against its IQR bounds.

    One figure with two panels is created and shown per column: a log-scaled
    histogram with the bounds as vertical lines, and a scatter of value
    against row position with the bounds as horizontal lines and the
    out-of-bounds points redrawn in red.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns to plot.
    lower_bound, upper_bound : mapping of column name -> number
        Bounds per column, indexable by the column names of ``data``.

    Returns
    -------
    None
    """
    for col in data.columns:
        values = data[col]
        low = lower_bound[col]
        high = upper_bound[col]
        low_label = f'Lower Bound ({low})'
        high_label = f'Upper Bound ({high})'
        # True for every row strictly outside [low, high].
        outside = (values < low) | (values > high)

        plt.figure(figsize=(12, 5))

        # Left panel: histogram with the two bounds.
        plt.subplot(1, 2, 1)
        plt.hist(values, bins=100, color='lightblue', edgecolor='black', alpha=0.7)
        plt.axvline(x=low, color='r', linestyle='--', label=low_label)
        plt.axvline(x=high, color='g', linestyle='--', label=high_label)
        plt.xlabel('Value')
        plt.ylabel('Frequency')
        plt.yscale('log')
        plt.title(f'Histogram with IQR Boundaries {col}')
        plt.legend()

        # Right panel: value per row position, with the bounds.
        plt.subplot(1, 2, 2)
        plt.scatter(range(len(values)), values, label='Data Points', color='b')
        plt.axhline(y=low, color='r', linestyle='--', label=low_label)
        plt.axhline(y=high, color='g', linestyle='--', label=high_label)

        # Redraw the out-of-bounds points in red at their row positions.
        outliers = data[outside]
        plt.scatter(np.where(outside)[0], outliers[col], color='r', label='Outliers')

        plt.xlabel('Index')
        plt.ylabel('Value')
        plt.title(f'Scatter Plot with IQR Boundaries and Outliers {col}')
        plt.legend()

        plt.tight_layout()
        plt.show()

In [None]:
# IQR plots for the weather columns listed in skewed_weather_columns.
visualIQR(data=weather_num[skewed_weather_columns], lower_bound=weather_lower_bound, upper_bound=weather_upper_bound)

In [None]:
# IQR plots for the energy columns listed in skewed_energy_columns.
visualIQR(data=generation_num[skewed_energy_columns], lower_bound=generation_lower_bound, upper_bound=generation_upper_bound)

In [None]:
# Z-score plots for the weather columns scored above.
visualZscore(z_scores=weather_zscores, outliers=weather_outliers)

In [None]:
# Z-score plots for the energy columns scored above.
visualZscore(z_scores=generation_zscore, outliers=generation_outliers)

**Handling of outliers**

In [27]:
# Per-column outlier strategy: cap (clip to bounds), remove (drop the rows)
# or transform (log1p).

# Weather dataset
cap_weather_columns = [
    "temp",
    'temp_min',
    'temp_max',
    'pressure',
    'humidity',
    'wind_speed',
    'rain_1h',
    'snow_3h',
]
remove_weather_columns = []
transform_weather_columns = ['rain_3h']

# cap rain at a different threshold

# Energy dataset
normal_energy_columns = [
    'generation biomass',
    'generation fossil hard coal',
    'generation fossil oil',
    'generation other',
    'generation other renewable',
    'total load forecast',
    'total load actual',
    'price day ahead',
    'price actual',
]
skewed_energy_columns = [
    'generation fossil brown coal/lignite',
    'generation fossil gas',
    'generation hydro pumped storage consumption',
    'generation hydro run-of-river and poundage',
    'generation hydro water reservoir',
    'generation nuclear',
    'generation solar',
    'generation waste',
    'generation wind onshore',
    'forecast solar day ahead',
    'forecast wind onshore day ahead',
]

cap_energy_columns = [
    'generation fossil oil',
    'generation nuclear',
    'generation other renewable',
    'generation waste',
    'generation wind onshore',
]
remove_energy_columns = ['generation biomass']
transform_energy_columns = [
    'generation fossil gas',
    'generation hydro pumped storage consumption',
    'generation hydro water reservoir',
]

In [None]:
# NOTE: this cell modifies weather_num / generation_num / data_generation in
# place and is not idempotent. Running it twice applies log1p twice, and
# after the index reset the labels taken from generation_outliers no longer
# refer to the same rows.

# --- Weather: capping ---
for col in cap_weather_columns:
    if col in skewed_weather_columns:
        # Skewed columns are clipped to their IQR bounds, except rain_1h
        # whose upper cap is fixed at 3.
        lower_bound = weather_lower_bound[col]
        upper_bound = 3 if col == 'rain_1h' else weather_upper_bound[col]
    else:
        # Remaining columns are clipped to mean +/- 3 standard deviations.
        mean = np.mean(weather_num[col])
        std_dev = np.std(weather_num[col])
        lower_bound = mean - (3 * std_dev)
        upper_bound = mean + (3 * std_dev)
    weather_num[col] = np.clip(weather_num[col], lower_bound, upper_bound)

# --- Weather: log-transform ---
for col in transform_weather_columns:
    weather_num[col] = np.log1p(weather_num[col])

# --- Energy: capping ---
for col in cap_energy_columns:
    if col in skewed_energy_columns:
        lower_bound = generation_lower_bound[col]
        upper_bound = generation_upper_bound[col]
    else:
        mean = np.mean(generation_num[col])
        std_dev = np.std(generation_num[col])
        lower_bound = mean - (3 * std_dev)
        upper_bound = mean + (3 * std_dev)
    generation_num[col] = np.clip(generation_num[col], lower_bound, upper_bound)

# --- Energy: log-transform ---
for col in transform_energy_columns:
    generation_num[col] = np.log1p(generation_num[col])

# --- Energy: remove ---
# Drop every row flagged as a z-score outlier in any of the columns listed in
# remove_energy_columns (the list drives the removal instead of a hard-coded
# column name).
true_indexes = generation_outliers.index[generation_outliers[remove_energy_columns].any(axis=1)]

print(true_indexes)

# Remove the same rows from both energy frames, then renumber both so that
# they stay aligned row for row.
generation_num.drop(true_indexes, inplace=True)
data_generation.drop(true_indexes, inplace=True)
generation_num.reset_index(drop=True, inplace=True)
data_generation.reset_index(drop=True, inplace=True)

In [None]:
# Re-plot the numeric energy columns after the outlier handling above.
visualData(data=generation_num, time=data_generation['time'])

In [None]:
# One-hot encode every categorical weather column and store the indicator
# columns as integers, then preview the result.
df_encoded = pd.get_dummies(weather_cat, columns=weather_cat.columns).astype(int)
df_encoded.head()

In [None]:
# 'wind_deg' is the target; all other numeric weather columns are features.
y = weather_num['wind_deg']
X = weather_num.drop(columns='wind_deg')

# Hold out 20 % of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)