# Pipeline

**Loading of datasets**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore, iqr
from sklearn.model_selection import train_test_split
import numpy as np
import missingno as msno 
# Lift pandas' display limits so frames are shown without truncation.
pd.set_option('display.max_rows', None, 'display.max_columns', None)

# Read the two raw CSV tables used throughout the notebook.
data_weather = pd.read_csv(
    filepath_or_buffer="spanish-cities-energy-consumption/weather_features.csv"
)
data_generation = pd.read_csv(
    filepath_or_buffer="spanish-cities-energy-consumption/energy_dataset.csv"
)


**Data Exploration**

In [None]:
# Preview the first rows of the weather table.
data_weather.head()

In [None]:
# Preview the first rows of the generation table.
data_generation.head()

In [None]:
# Descriptive statistics for the weather table.
data_weather.describe()

In [None]:
# Descriptive statistics for the generation table.
data_generation.describe()


In [6]:
# Remove generation sources that are not used in the rest of the notebook.
# NOTE(review): presumably these columns are empty or constant in this
# dataset — confirm against the describe() output for the generation table.
data_generation.drop(
    labels=[
        "generation fossil coal-derived gas",
        "generation fossil oil shale",
        "generation fossil peat",
        "generation geothermal",
        "generation marine",
        "generation wind offshore",
    ],
    axis="columns",
    inplace=True,
)

In [None]:
# List every weather column next to its pandas dtype.
for col, dtype in zip(data_weather.columns, data_weather.dtypes):
    print(f"Column '{col}' has data type: {dtype}")

In [None]:
# List every generation column next to its pandas dtype.
for col, dtype in zip(data_generation.columns, data_generation.dtypes):
    print(f"Column '{col}' has data type: {dtype}")

In [None]:
# Prints True when the weather table holds at least one missing value.
print(data_weather.isna().to_numpy().any())

In [None]:
# Prints True when the generation table holds at least one missing value.
print(data_generation.isna().to_numpy().any())

In [11]:
# Separate the numeric columns from the non-numeric (categorical) ones.
# 'dt_iso' is left out of the categorical weather features.
# NOTE(review): presumably because it is a timestamp, not a category — confirm.
generation_num = data_generation.select_dtypes(include='number')
weather_num = data_weather.select_dtypes(include='number')
weather_cat = data_weather.select_dtypes(exclude='number').drop(columns='dt_iso')

In [None]:
# Collect the distinct values of every categorical weather feature so that
# unexpected categories can be spotted by eye.
unique_values_all = {
    name: values.unique() for name, values in weather_cat.items()
}

print(unique_values_all)


**Missing Values**

In [None]:
# Number of missing values in each column of the generation table.
missing_values_sum = data_generation.isna().sum()
missing_values_sum

In [None]:

# Discard two columns that are not carried into the imputation step.
# NOTE(review): presumably both are entirely empty — confirm against the
# missing-value counts of the generation table.
generation_num.drop(
    labels=[
        "generation hydro pumped storage aggregated",
        "forecast wind offshore eday ahead",
    ],
    axis="columns",
    inplace=True,
)

# Columns whose gaps are filled with the median of the same column.
columns_to_median_impute = [
    'total load actual',
    'generation biomass',
    'generation fossil brown coal/lignite',
    'generation fossil gas',
    'generation fossil hard coal',
    'generation fossil oil',
    'generation hydro pumped storage consumption',
    'generation hydro run-of-river and poundage',
    'generation hydro water reservoir',
    'generation nuclear',
    'generation other',
    'generation other renewable',
    'generation solar',
    'generation waste',
    'generation wind onshore',
]

# fillna aligns the median Series on column names, so every NaN is replaced
# by the median of its own column.
generation_num[columns_to_median_impute] = generation_num[
    columns_to_median_impute
].fillna(generation_num[columns_to_median_impute].median())

# Report what is still missing, per column and in total.
print(generation_num.isna().sum())

print("Number of missing values now is:", generation_num.isna().sum().sum())

**Outliers**

In [15]:
def Zscore(data, threshold=3):
    """Flag values lying more than ``threshold`` standard deviations from the mean.

    Z-scores are computed with ``scipy.stats.zscore`` using its default
    arguments.

    Args:
        data: Numeric observations (DataFrame, Series or array-like).
        threshold: Absolute z-score above which a value counts as an
            outlier. Defaults to 3, the value that was previously
            hard-coded.

    Returns:
        A ``(outliers, z_scores)`` tuple. ``z_scores`` holds the z-score of
        every value and ``outliers`` is the boolean mask
        ``|z_scores| > threshold``. For pandas input both keep the index
        and column labels of ``data``.
    """
    z_scores = zscore(data)

    # Depending on the SciPy version, zscore may hand back a bare ndarray
    # even for pandas input. Restore the labels so callers can keep
    # addressing the result by column name.
    if isinstance(data, pd.DataFrame) and not isinstance(z_scores, pd.DataFrame):
        z_scores = pd.DataFrame(z_scores, index=data.index, columns=data.columns)
    elif isinstance(data, pd.Series) and not isinstance(z_scores, pd.Series):
        z_scores = pd.Series(z_scores, index=data.index, name=data.name)

    # NaN z-scores compare False on both sides and are never flagged.
    outliers = (z_scores > threshold) | (z_scores < -threshold)
    return outliers, z_scores

# Compute the outlier mask and the z-scores of the numeric weather and
# generation features.
weather_outliers, weather_zscores = Zscore(weather_num)
generation_outliers, generation_zscore = Zscore(generation_num)

In [16]:
def visualZscore(z_scores, outliers, threshold=3):
    """Show a z-score histogram and a z-score scatter plot for every column.

    One figure is displayed per column of ``z_scores``. The left panel is a
    log-scaled histogram with dashed lines at +/- ``threshold``. The right
    panel plots each z-score against its position, colours the flagged
    points red and annotates them with their value.

    Args:
        z_scores: DataFrame of z-scores, one column per feature.
        outliers: Boolean DataFrame with the same columns as ``z_scores``;
            a true entry marks the matching z-score as an outlier.
        threshold: Position of the dashed reference lines. It only affects
            the drawing; which points are flagged is decided by
            ``outliers``.
    """
    for col in z_scores.columns:
        col_scores = z_scores[col]
        col_outliers = outliers[col]

        # Plot Histogram of Z-scores
        plt.figure(figsize=(12, 5))

        # Histogram plot
        plt.subplot(1, 2, 1)
        plt.hist(col_scores, bins=100, edgecolor='k', alpha=0.7)
        plt.axvline(threshold, color='r', linestyle='dashed', linewidth=1.5, label=f'Outlier Threshold (+{threshold})')
        plt.axvline(-threshold, color='r', linestyle='dashed', linewidth=1.5, label=f'Outlier Threshold (-{threshold})')
        plt.title(f'Histogram of Z-scores {col}')
        plt.xlabel('Z-score')
        plt.ylabel('Frequency')
        plt.yscale('log')
        plt.legend()

        # Scatter Plot with Z-scores
        plt.subplot(1, 2, 2)
        plt.scatter(
            range(len(col_scores)),
            col_scores,
            c=['red' if flagged else 'blue' for flagged in col_outliers],
            alpha=0.7,
        )
        plt.axhline(threshold, color='r', linestyle='dashed', linewidth=1.5)
        plt.axhline(-threshold, color='r', linestyle='dashed', linewidth=1.5)
        plt.title(f'Scatter Plot of Z-scores {col}')
        plt.xlabel('Index')
        plt.ylabel('Z-score')
        plt.grid(True)

        # Highlight outliers. The mask of the current column must be used:
        # iterating the whole ``outliers`` frame yields its column names,
        # which are always truthy and do not line up with the rows.
        for i, (score, is_outlier) in enumerate(zip(col_scores, col_outliers)):
            if is_outlier:
                plt.text(i, score, f'{score:.2f}', fontsize=9, color='red', ha='left', va='bottom')

        plt.tight_layout()
        plt.show()

In [17]:


def IQR(df, whisker=1.5):
    """Find, per numeric column, the rows that fall outside the IQR fences.

    For every numeric column the fences are ``Q1 - whisker * (Q3 - Q1)`` and
    ``Q3 + whisker * (Q3 - Q1)``. Non-numeric columns have no quartiles and
    are skipped instead of raising ``KeyError``.

    Args:
        df: DataFrame to inspect.
        whisker: Multiple of the interquartile range used for the fences.
            Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        Dict mapping each numeric column name to the rows of ``df`` (all
        columns kept) whose value in that column lies outside the fences.
    """
    numeric = df.select_dtypes(include='number')
    if numeric.shape[1] == 0:
        return {}

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    spread = q3 - q1

    lower_bounds = q1 - whisker * spread
    upper_bounds = q3 + whisker * spread

    return {
        feature: df[
            (df[feature] < lower_bounds[feature])
            | (df[feature] > upper_bounds[feature])
        ]
        for feature in numeric.columns
    }


In [None]:
import seaborn as sns
# Draw one box plot per numeric generation column, each in its own figure.
for col in generation_num:
    sns.boxplot(data=generation_num[col])
    plt.show()

In [None]:
# Plot the z-score histogram and scatter for every numeric weather column.
visualZscore(z_scores=weather_zscores, outliers=weather_outliers)

In [None]:
# Plot the z-score histogram and scatter for every numeric generation column.
visualZscore(z_scores=generation_zscore, outliers=generation_outliers)