In [5]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_regression

# Data visualization libraries
import plotly
import matplotlib.pylab as pl
import matplotlib as m
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib import pyplot as plt
from IPython.display import SVG

from scipy import stats

In [92]:
# Path to the solar sensor readings, relative to the notebook's working directory.
# A forward slash is used on purpose: the previous "Datasets\solar..." literal
# contained the invalid escape sequence "\s" and only resolved on Windows.
file_path = "Datasets/solar_sensor_data.csv"
sensor_df = pd.read_csv(file_path)

In [93]:
# Path to the weather sensor readings (despite the variable name, this is not a
# test split). A forward slash is used on purpose: the previous
# "Datasets\weather..." literal contained the invalid escape sequence "\w" and
# only resolved on Windows.
test_file_path = "Datasets/weather_sensor_data.csv"
weather_df = pd.read_csv(test_file_path)

In [94]:
# Preview the sensor readings loaded from the CSV (rendered via the notebook's rich display).
sensor_df

Unnamed: 0,LOCATION,DATE_TIME,SENSOR_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
0,A,15-05-2020 00:00,sensor 1,0.0,0.0,0.0,6259559.0
1,A,15-05-2020 00:00,sensor 2,0.0,0.0,0.0,6183645.0
2,A,15-05-2020 00:00,sensor 3,0.0,0.0,0.0,6987759.0
3,A,15-05-2020 00:00,sensor 5,0.0,0.0,0.0,7602960.0
4,A,15-05-2020 00:00,sensor 12,0.0,0.0,0.0,7158964.0
...,...,...,...,...,...,...,...
136471,B,2020-06-17 23:45:00,sensor 33,0.0,0.0,4157.0,520758.0
136472,B,2020-06-17 23:45:00,sensor 35,0.0,0.0,3931.0,121131356.0
136473,B,2020-06-17 23:45:00,sensor 38,0.0,0.0,4322.0,2427691.0
136474,B,2020-06-17 23:45:00,sensor 40,0.0,0.0,4218.0,106896394.0


In [95]:
# Preview the weather readings loaded from the CSV (rendered via the notebook's rich display).
weather_df

Unnamed: 0,LOCATION,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,A,2020-05-15 00:00:00,25.184316,22.857507,0.0
1,A,2020-05-15 00:15:00,25.084589,22.761668,0.0
2,A,2020-05-15 00:30:00,24.935753,22.592306,0.0
3,A,2020-05-15 00:45:00,24.846130,22.360852,0.0
4,A,2020-05-15 01:00:00,24.621525,22.165423,0.0
...,...,...,...,...,...
6436,B,2020-06-17 22:45:00,23.511703,22.856201,0.0
6437,B,2020-06-17 23:00:00,23.482282,22.744190,0.0
6438,B,2020-06-17 23:15:00,23.354743,22.492245,0.0
6439,B,2020-06-17 23:30:00,23.291048,22.373909,0.0


In [96]:
def customDescription(df: pd.DataFrame, numeric_only: bool = False) -> pd.DataFrame:
    """Build a one-row-per-column summary of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise. It is not modified.
    numeric_only : bool, default False
        When True, only the numeric columns of ``df`` are summarised.

    Returns
    -------
    pd.DataFrame
        Indexed by the column names of ``df`` with the columns ``type``
        (dtype), ``count`` (non-null values), ``nunique`` (distinct values)
        and ``null`` (missing values). When ``df`` has at least one numeric
        column and at least one row, ``mean``, ``min`` and ``max`` are added:
        filled for numeric columns and NaN for every other column.
    """
    if numeric_only:
        df = df.select_dtypes(include=np.number)

    desc = pd.DataFrame(index=df.columns.to_list())
    desc['type'] = df.dtypes
    desc['count'] = df.count()
    desc['nunique'] = df.nunique()
    desc['null'] = df.isnull().sum()

    # Describe the numeric columns only. Describing the whole frame (or the
    # non-numeric remainder) lets datetime columns contribute their own
    # mean/min/max/quantile columns, which previously ended up concatenated
    # next to the numeric ones as duplicate or stray columns.
    numeric_df = df.select_dtypes(include=np.number)
    if not numeric_df.empty:
        numeric_stats = numeric_df.describe().T
        for stat in ('mean', 'min', 'max'):
            # Series assignment aligns on the index (column names), so
            # non-numeric rows are left as NaN.
            desc[stat] = numeric_stats[stat]

    return desc

In [97]:
# Per-column summary of the sensor data: dtype, non-null count, distinct values, nulls,
# plus mean/min/max for the numeric columns.
customDescription(sensor_df)

Unnamed: 0,type,count,nunique,null,mean,min,max
LOCATION,object,136476,2,0,,,
DATE_TIME,object,136476,6417,0,,,
SENSOR_ID,object,136476,44,0,,,
DC_POWER,float64,136476,63581,0,1708.541,0.0,14471.12
AC_POWER,float64,136476,62872,0,274.8035,0.0,1410.95
DAILY_YIELD,float64,136476,59249,0,3295.434,0.0,9873.0
TOTAL_YIELD,float64,136476,70381,0,330382100.0,0.0,2247916000.0


In [98]:
# Per-column summary of the weather data: dtype, non-null count, distinct values, nulls,
# plus mean/min/max for the numeric columns.
customDescription(weather_df)

Unnamed: 0,type,count,nunique,null,mean,min,max
LOCATION,object,6441,2,0,,,
DATE_TIME,object,6441,3262,0,,,
AMBIENT_TEMPERATURE,float64,6441,6441,0,26.815672,20.398505,39.181638
MODULE_TEMPERATURE,float64,6441,6441,0,31.941762,18.140415,66.635953
IRRADIATION,float64,6441,3620,0,0.230551,0.0,1.221652


In [99]:
# Convert DATE_TIME to '%Y-%m-%d %H:%M:%S' only where LOCATION is "A", whose rows
# are stored as '%d-%m-%Y %H:%M' in the raw file.
# Rows that already match the target format are skipped so this cell can be
# re-run safely; any other value is still parsed with the strict source format
# and raises if it does not match.
is_location_a = sensor_df['LOCATION'] == "A"
already_converted = sensor_df['DATE_TIME'].str.match(
    r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$', na=False
)
needs_conversion = is_location_a & ~already_converted

if needs_conversion.any():
    sensor_df.loc[needs_conversion, 'DATE_TIME'] = pd.to_datetime(
        sensor_df.loc[needs_conversion, 'DATE_TIME'],
        format='%d-%m-%Y %H:%M'
    ).dt.strftime('%Y-%m-%d %H:%M:%S')

In [100]:
# Attach the weather readings to every sensor row, matching on location and timestamp.
# A left join keeps all sensor rows; rows without a matching weather reading get NaN
# in the weather columns.
merge_keys = ["LOCATION", "DATE_TIME"]
merged_df = sensor_df.merge(weather_df, how="left", on=merge_keys)

In [101]:
# One frame per location, each with a fresh 0-based index.
location_col = merged_df['LOCATION']
df_A = merged_df.loc[location_col.eq("A")].reset_index(drop=True)
df_B = merged_df.loc[location_col.eq("B")].reset_index(drop=True)

In [102]:
# Preview the merged sensor + weather rows for location A.
df_A

Unnamed: 0,LOCATION,DATE_TIME,SENSOR_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,A,2020-05-15 00:00:00,sensor 1,0.0,0.0,0.000,6259559.0,25.184316,22.857507,0.0
1,A,2020-05-15 00:00:00,sensor 2,0.0,0.0,0.000,6183645.0,25.184316,22.857507,0.0
2,A,2020-05-15 00:00:00,sensor 3,0.0,0.0,0.000,6987759.0,25.184316,22.857507,0.0
3,A,2020-05-15 00:00:00,sensor 5,0.0,0.0,0.000,7602960.0,25.184316,22.857507,0.0
4,A,2020-05-15 00:00:00,sensor 12,0.0,0.0,0.000,7158964.0,25.184316,22.857507,0.0
...,...,...,...,...,...,...,...,...,...,...
68773,A,2020-06-17 23:45:00,sensor 37,0.0,0.0,5967.000,7287002.0,21.909288,20.427972,0.0
68774,A,2020-06-17 23:45:00,sensor 39,0.0,0.0,5147.625,7028601.0,21.909288,20.427972,0.0
68775,A,2020-06-17 23:45:00,sensor 42,0.0,0.0,5819.000,7251204.0,21.909288,20.427972,0.0
68776,A,2020-06-17 23:45:00,sensor 43,0.0,0.0,5817.000,6583369.0,21.909288,20.427972,0.0


In [103]:
# Preview the merged sensor + weather rows for location B.
df_B

Unnamed: 0,LOCATION,DATE_TIME,SENSOR_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,B,2020-05-15 00:00:00,sensor 4,0.0,0.0,9425.000000,2.429011e+06,27.004764,25.060789,0.0
1,B,2020-05-15 00:00:00,sensor 6,0.0,0.0,0.000000,1.215279e+09,27.004764,25.060789,0.0
2,B,2020-05-15 00:00:00,sensor 7,0.0,0.0,3075.333333,2.247720e+09,27.004764,25.060789,0.0
3,B,2020-05-15 00:00:00,sensor 8,0.0,0.0,269.933333,1.704250e+06,27.004764,25.060789,0.0
4,B,2020-05-15 00:00:00,sensor 9,0.0,0.0,3177.000000,1.994153e+07,27.004764,25.060789,0.0
...,...,...,...,...,...,...,...,...,...,...
67693,B,2020-06-17 23:45:00,sensor 33,0.0,0.0,4157.000000,5.207580e+05,23.202871,22.535908,0.0
67694,B,2020-06-17 23:45:00,sensor 35,0.0,0.0,3931.000000,1.211314e+08,23.202871,22.535908,0.0
67695,B,2020-06-17 23:45:00,sensor 38,0.0,0.0,4322.000000,2.427691e+06,23.202871,22.535908,0.0
67696,B,2020-06-17 23:45:00,sensor 40,0.0,0.0,4218.000000,1.068964e+08,23.202871,22.535908,0.0


In [104]:
# Per-column summary for location A after the merge. A non-zero `null` count on the
# weather columns means some sensor rows had no matching weather reading (left join).
customDescription(df_A)

Unnamed: 0,type,count,nunique,null,mean,min,max
LOCATION,object,68778,1,0,,,
DATE_TIME,object,68778,3158,0,,,
SENSOR_ID,object,68778,22,0,,,
DC_POWER,float64,68778,32909,0,3147.426,0.0,14471.12
AC_POWER,float64,68778,32686,0,307.8028,0.0,1410.95
DAILY_YIELD,float64,68778,29900,0,3295.969,0.0,9163.0
TOTAL_YIELD,float64,68778,37267,0,6978712.0,6183645.0,7846821.0
AMBIENT_TEMPERATURE,float64,68774,3157,4,25.55852,20.3985,35.25249
MODULE_TEMPERATURE,float64,68774,3157,4,31.245,18.14042,65.54571
IRRADIATION,float64,68774,1755,4,0.232305,0.0,1.221652


In [105]:
# Per-column summary for location B after the merge. A non-zero `null` count on the
# weather columns would mean some sensor rows had no matching weather reading (left join).
customDescription(df_B)

Unnamed: 0,type,count,nunique,null,mean,min,max
LOCATION,object,67698,1,0,,,
DATE_TIME,object,67698,3259,0,,,
SENSOR_ID,object,67698,22,0,,,
DC_POWER,float64,67698,30799,0,246.702,0.0,1420.933
AC_POWER,float64,67698,30744,0,241.2778,0.0,1385.42
DAILY_YIELD,float64,67698,30432,0,3294.89,0.0,9873.0
TOTAL_YIELD,float64,67698,33114,0,658944800.0,0.0,2247916000.0
AMBIENT_TEMPERATURE,float64,67698,3259,0,27.98676,20.942385,39.18164
MODULE_TEMPERATURE,float64,67698,3259,0,32.60723,20.265123,66.63595
IRRADIATION,float64,67698,1863,0,0.2292042,0.0,1.098766
