#### Load Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# Read the three *_PV.txt files and the Meteo_Terre_Sainte.txt file
# (all comma-separated) into one DataFrame each.
dpt_1_2_pv = pd.read_csv("TwInSolar_consolidated_data-main/Dpt_1_2_PV.txt", sep=",")
enerpos_pv = pd.read_csv("TwInSolar_consolidated_data-main/ENERPOS_PV.txt", sep=",")
esiroi_pv = pd.read_csv("TwInSolar_consolidated_data-main/ESIROI_PV.txt", sep=",")
meteo = pd.read_csv("TwInSolar_consolidated_data-main/Meteo_Terre_Sainte.txt", sep=",")


#### Convert Timestamp

In [17]:
# Parse the 'datetime' column of every loaded frame into pandas datetimes.
# The column is replaced in place on each frame, so the four names keep
# pointing at the same (now converted) objects.
for frame in (dpt_1_2_pv, enerpos_pv, esiroi_pv, meteo):
    frame['datetime'] = pd.to_datetime(frame['datetime'])


#### Handle Missing Values

In [18]:
# Linear interpolation: fill missing values in dpt_1_2_pv from the
# neighbouring rows (method='linear' treats rows as equally spaced).
# NOTE(review): only dpt_1_2_pv is interpolated here; enerpos_pv, esiroi_pv
# and meteo are left as loaded -- confirm that this is intentional.
dpt_1_2_pv = dpt_1_2_pv.interpolate(method='linear')


#### Remove or Correct Outliers

In [19]:
# Remove outliers from the Dpt 1_2 PV data: keep only rows whose Prod_kW lies
# within 3 standard deviations of the column mean. Rows where Prod_kW is NaN
# are dropped as well, because the comparison evaluates to False for NaN.
# Mean and std are taken from the frame as it is when this cell runs, so
# re-running the cell filters again against the already-filtered data.
prod_kw = dpt_1_2_pv['Prod_kW']
within_3_sigma = (prod_kw - prod_kw.mean()).abs() <= 3 * prod_kw.std()

# .copy() turns the filtered result into an independent frame. Without it,
# later column assignments on dpt_1_2_pv write into a boolean-mask slice and
# make pandas emit SettingWithCopyWarning.
dpt_1_2_pv = dpt_1_2_pv[within_3_sigma].copy()


#### Normalize Data

In [20]:
from sklearn.preprocessing import MinMaxScaler

# Rescale Prod_kW of the Dpt 1_2 frame to the [0, 1] range (MinMaxScaler default).
# The fitted scaler stays available as `scaler`.
scaler = MinMaxScaler()
scaled_prod_kw = scaler.fit_transform(dpt_1_2_pv[['Prod_kW']])

# fit_transform returns an (n_rows, 1) array; ravel() flattens it to one value
# per row. .assign() builds a new frame instead of writing into dpt_1_2_pv,
# which at this point is the result of a boolean-mask filter; assigning a
# column onto such a slice raises pandas' SettingWithCopyWarning.
dpt_1_2_pv = dpt_1_2_pv.assign(Prod_kW=scaled_prod_kw.ravel())


#### Align Temporal Data and Merge

In [21]:
# Join each PV frame with the meteo frame on the shared 'datetime' column.
# how='inner' is the pandas default, spelled out here: only timestamps that
# appear in both frames survive the merge.
merged_dpt_1_2 = dpt_1_2_pv.merge(meteo, how='inner', on='datetime')
merged_enerpos = enerpos_pv.merge(meteo, how='inner', on='datetime')
merged_esiroi = esiroi_pv.merge(meteo, how='inner', on='datetime')


## Data Validation

##### Check for Missing Values Post-Cleaning



In [22]:
# Print the per-column count of missing values for each merged frame,
# in the order Dpt 1_2, Enerpos, ESIROI.
for merged_frame in (merged_dpt_1_2, merged_enerpos, merged_esiroi):
    null_counts = merged_frame.isna().sum()
    print(null_counts)


datetime    0
Prod_kW     0
GHI         0
BNI         0
DHI         0
DBT         0
RH          0
Ws10        0
Wd10        0
Patmo       0
Rainfall    0
dtype: int64
datetime    0
Prod_kW     0
GHI         0
BNI         0
DHI         0
DBT         0
RH          0
Ws10        0
Wd10        0
Patmo       0
Rainfall    0
dtype: int64
datetime    0
Prod_kW     0
GHI         0
BNI         0
DHI         0
DBT         0
RH          0
Ws10        0
Wd10        0
Patmo       0
Rainfall    0
dtype: int64


#### Visualize Data to Identify Anomalies

In [2]:
import matplotlib.pyplot as plt

# Time series of Dpt 1_2 production from the merged frame.
# Prod_kW of dpt_1_2_pv is min-max scaled before the merge, so the values
# plotted here are unitless numbers in [0, 1], not kW; the y-label says so.
fig, ax = plt.subplots()
ax.plot(merged_dpt_1_2['datetime'], merged_dpt_1_2['Prod_kW'])
ax.set_title('Dpt 1_2 PV Production Over Time')
ax.set_xlabel('Datetime')
ax.set_ylabel('PV Production (min-max normalized)')
plt.show()

# One boxplot of Prod_kW per PV frame, each in its own figure.
# Note that dpt_1_2_pv holds the scaled values at this point, while
# enerpos_pv and esiroi_pv still hold the values as loaded.
for site_label, pv_frame in (
    ('Dpt_1_2', dpt_1_2_pv),
    ('Enerpos', enerpos_pv),
    ('ESIROI', esiroi_pv),
):
    fig, ax = plt.subplots()
    sns.boxplot(data=pv_frame[['Prod_kW']], ax=ax)
    ax.set_title(f'Boxplot for {site_label} PV Production')
    plt.show()





NameError: name 'merged_dpt_1_2' is not defined

#### Statistical Summary

In [24]:
# Print descriptive statistics (count, mean, std, quartiles, min/max) for
# each merged frame, in the order Dpt 1_2, Enerpos, ESIROI.
for merged_frame in (merged_dpt_1_2, merged_enerpos, merged_esiroi):
    summary = merged_frame.describe()
    print(summary)


             Prod_kW            GHI            BNI            DHI  \
count  105120.000000  105120.000000  105120.000000  105120.000000   
mean        0.204017     236.880172     223.856274      81.696537   
std         0.282640     328.751888     328.659286     122.230949   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000300       0.000000       0.000000       0.000000   
50%         0.000300       3.899850       0.168500       3.557952   
75%         0.403529     451.667500     518.752500     124.222500   
max         1.000000    1305.900000    1200.894352    1046.900000   

                 DBT             RH           Ws10           Wd10  \
count  105120.000000  105120.000000  105120.000000  105120.000000   
mean       23.778520      74.261499       2.311256     122.588027   
std         3.585584      10.171373       1.937093      58.672895   
min        14.297000       0.084948       0.000000       0.082400   
25%        20.954000      67.0750

#### Saving Cleaned Data

In [None]:
import os

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (no error if it is already there).
os.makedirs("cleaned_data", exist_ok=True)

# Write each merged frame to its own CSV, without the row index.
for merged_frame, csv_path in (
    (merged_dpt_1_2, "cleaned_data/cleaned_merged_dpt_1_2.csv"),
    (merged_enerpos, "cleaned_data/cleaned_merged_enerpos.csv"),
    (merged_esiroi, "cleaned_data/cleaned_merged_esiroi.csv"),
):
    merged_frame.to_csv(csv_path, index=False)