In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import toolz as tz
import os 
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV files under it can be read.
drive.mount('/content/drive')
from plotly.subplots import make_subplots
import plotly.graph_objects as go


Mounted at /content/drive


# PM2: Data - Solar Data -  Anitha, Emily, Mercy, Daniel, James


# Reading Data

In [3]:
# Directory on the mounted Drive that holds the solar CSV files.
# The trailing slash matters: file names are appended to it with plain `+`.
files_dir = "/content/drive/MyDrive/solar_data/"

print(os.getcwd())  # working directory before the switch
os.chdir(files_dir)

filename1 = files_dir + "Plant_1_Generation_Data.csv"
print(filename1)
print(os.getcwd())  # confirm the switch took effect

/content
/content
/content/drive/MyDrive/solar_data/Plant_1_Generation_Data.csv
/content/drive/MyDrive/solar_data


In [4]:
# Remaining inputs: Plant 2 generation data, then the weather sensor data of both plants.
filename2 = f"{files_dir}Plant_2_Generation_Data.csv"
filename3 = f"{files_dir}Plant_1_Weather_Sensor_Data.csv"
filename4 = f"{files_dir}Plant_2_Weather_Sensor_Data.csv"

We pass the `infer_datetime_format` argument to `read_csv` so that the `DATE_TIME` column is parsed with the right date-time format.

In [None]:
# Load the generation data of both plants, parsing DATE_TIME while reading.
df_plant1 = pd.read_csv(
    filename1,
    parse_dates=['DATE_TIME'],
    infer_datetime_format=True,
)
df_plant2 = pd.read_csv(
    filename2,
    parse_dates=['DATE_TIME'],
    infer_datetime_format=True,
)
df_plant1.info()

In [None]:
# Column dtypes and non-null counts for Plant 2's generation data.
df_plant2.info()

Month wise readings from Plant 1 and Plant 2 using the groupby method reveal 33,112 readings for Plant 1 in May compared to 31,838 for Plant 2. For June, we find 35,666 readings for Plant 1 and 35,860 Plant 2. Looking at the dataframes' info we can see that there is a different number of data points in each set. **This might imply missing values.**

In [None]:
# Number of readings per calendar month, Plant 1 first and then Plant 2.
for plant_df in (df_plant1, df_plant2):
    print(plant_df.groupby(plant_df['DATE_TIME'].dt.to_period('M')).size())

In [None]:
# Cross-check: the per-month reading counts should add up to each frame's row count.
# The totals are computed from the data instead of being copied from printed output,
# so they cannot go stale when the input files change.
plant1_monthly = df_plant1.groupby(df_plant1['DATE_TIME'].dt.to_period('M')).size()
plant2_monthly = df_plant2.groupby(df_plant2['DATE_TIME'].dt.to_period('M')).size()
print("Plant 1 data points: ", plant1_monthly.sum())
print("Plant 2 data points: ", plant2_monthly.sum())
print("Plant 1 Shape:", df_plant1.shape)
print("Plant 2 Shape:", df_plant2.shape)

In [None]:
# Converting df date time column to datetime datatype.
# pd.to_datetime returns a new Series and leaves its input untouched, so the
# result has to be assigned back for the conversion to take effect.
df_plant1['DATE_TIME'] = pd.to_datetime(df_plant1['DATE_TIME'])
df_plant2['DATE_TIME'] = pd.to_datetime(df_plant2['DATE_TIME'])

Looking at the top of the dataframe gives an idea of the features and their datatypes

In [None]:
# First rows of Plant 1's generation data.
df_plant1.head()

We can see seven features. Source key and plant ID look like categorical features, while DC power, AC power, daily yield and total yield are numeric features.


We also concatenate the weather sensor data of the two plants and take an initial look.

In [None]:
# Load the weather sensor data of both plants, parsing DATE_TIME while reading.
df_weather1 = pd.read_csv(filename3, parse_dates=['DATE_TIME'], infer_datetime_format=True)
df_weather2 = pd.read_csv(filename4, parse_dates=['DATE_TIME'], infer_datetime_format=True)
# ignore_index=True gives the stacked frame a unique 0..n-1 index; without it the
# row labels of the second file repeat those of the first.
df_weather12 = pd.concat([df_weather1, df_weather2], axis=0, ignore_index=True)
df_weather12.info()

In [None]:
# First rows of the combined weather sensor data.
df_weather12.head()

An initial glance at the data shows six features. Source key and plant id look like categorical features, while ambient temperature, module temperature and irradiation are numeric features. 


# Data Analysis & Feature Engineering
**List the features that are categorical.**

*   Plant ID
*   Source Key

**List the features that are numeric.**

*   Date time
*   AC power 
* DC power
* Daily yield 
* Total yield 
* Ambient temperature 
* Module temperature 
* Irradiation

**The feature that will be our label.**

* Daily yield 
* Performance

To further aid with our exploration of the data, we separate out the date and time into new Date and Time columns.

In [None]:
# Split DATE_TIME into two helper columns on each plant frame:
#   DATE - the calendar day as a datetime64 value (time part dropped)
#   TIME - the time of day as datetime.time objects
timeinfoplant1 = df_plant1['DATE_TIME'].dt.time
dateinfoplant1 = df_plant1['DATE_TIME'].dt.date
timeinfoplant2 = df_plant2['DATE_TIME'].dt.time
dateinfoplant2 = df_plant2['DATE_TIME'].dt.date

# pd.to_datetime turns the array of datetime.date objects back into datetime64,
# so DATE supports the .dt accessor used by later cells.
df_plant1['DATE'] = pd.to_datetime(dateinfoplant1.values)
df_plant1['TIME'] = timeinfoplant1
df_plant2['DATE'] = pd.to_datetime(dateinfoplant2.values)
df_plant2['TIME'] = timeinfoplant2

df_plant1.info()
df_plant2.info()

# Immediate issues with the dataset.
**Missing values**
*   During our initial exploration of the data we noticed that the two plants have unequal numbers of readings. We notice that plant 2 has some readings with fewer than 22 inverters, and that plant 1 has missing data for May 20 and 28.

**Duplicate data**
* None noticed

**Misalignment of values and columns**
* None noticed



In [None]:
#Define a function to look for values with NaN
def check_for_missing_values(df):
  """Report whether a DataFrame contains any missing value.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to scan.

  Returns
  -------
  bool
      True if at least one cell is missing according to ``DataFrame.isna``,
      False otherwise (including for an empty frame).
  """
  # isna() yields a boolean frame; reduce it over both axes in one step.
  return bool(df.isna().to_numpy().any())

# Report, per frame, whether any NaN is present.
print("NaN values for Plant 1:", check_for_missing_values(df_plant1))
print("NaN values for Plant 2:", check_for_missing_values(df_plant2))
print("NaN values for Weather data:", check_for_missing_values(df_weather12))

In [None]:
#Define a function to look for duplicated rows
def check_for_duplicate_values(df):
  """Report whether a DataFrame contains any fully duplicated row.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to scan.

  Returns
  -------
  bool
      True if ``DataFrame.duplicated`` flags at least one row, False otherwise
      (including for an empty frame).
  """
  return bool(df.duplicated().any())
  
# Report, per frame, whether any row is an exact duplicate of an earlier one.
# The labels say "Duplicate rows" because that is what the helper checks.
print("Duplicate rows for Plant 1:", check_for_duplicate_values(df_plant1))
print("Duplicate rows for Plant 2:", check_for_duplicate_values(df_plant2))
print("Duplicate rows for Weather data:", check_for_duplicate_values(df_weather12))

We also group each plant's readings by timestamp to get the number of inverters reporting. Some timestamps show fewer than 22 inverter readings. This could be because an inverter was shut down, and it might also explain our missing data points.


In [None]:
# Non-null value count of every column per timestamp, for each plant.
ds_plant1_grp_dt = df_plant1.groupby("DATE_TIME")
ds_plant2_grp_dt = df_plant2.groupby("DATE_TIME")
ds_plant1_agg_dt = ds_plant1_grp_dt.count().reset_index()
ds_plant2_agg_dt = ds_plant2_grp_dt.count().reset_index()
ds_plant1_agg_dt


In [None]:
# Per-timestamp counts for Plant 2 (computed in the previous cell).
ds_plant2_agg_dt

In [None]:
# Target column for the later modelling work.
label = "DAILY_YIELD"

# Columns treated as numeric in the plots below.
numerical_features = np.array([
    "DATE_TIME", "DC_POWER", "AC_POWER", "DAILY_YIELD", "TOTAL_YIELD",
    "DATE", "TIME",
])
# Every remaining column of df_plant1 that is neither listed above nor the label.
non_numerical_features = np.array([
    column
    for column in df_plant1.columns
    if not (column in numerical_features or column == label)
])
print(numerical_features)
print(non_numerical_features)

Misaligned data: the check for missing values also indicates that there are no misaligned samples, as pandas interprets ",," as "NaN, NaN,".

# Univariate Analysis

Below we plot histograms and box-and-whisker plots for each feature in the dataset.

**Features with a Gaussian distribution**


*   AC_POWER
*   DC_POWER
* IRRADIATION
* MODULE_TEMPERATURE
* AMBIENT_TEMPERATURE

**Features which show outliers**

* Some outliers in AC_POWER, for example we see a value zero for some time at noon. This might be because the inverters are shut off or undergoing maintenance.



In [None]:
# Exploratory selections on Plant 1's DC_POWER. Only the value of the last
# expression is rendered by the notebook; the first three are evaluated and discarded.
df_plant1.loc[:, ["DC_POWER"]]
df_plant1.loc[df_plant1["DC_POWER"] > 0]
df_plant1.loc[df_plant1["DC_POWER"] == 0].describe()
df_plant1["DC_POWER"].describe()


In [None]:
# Summary statistics of Plant 2's AC_POWER.
df_plant2["AC_POWER"].describe()

In [None]:
# Summary of Plant 1 rows whose DC_POWER is negative.
df_plant1.loc[df_plant1["DC_POWER"] < 0].describe()

In [None]:
# Summary of Plant 2 rows whose DC_POWER is negative.
df_plant2.loc[df_plant2["DC_POWER"] < 0].describe()


In [None]:
# Summary of Plant 1 rows whose DC_POWER is strictly positive.
df_plant1.loc[df_plant1["DC_POWER"] > 0].describe()


In [None]:
# Distribution of Plant 1's strictly positive DC_POWER readings, with a box plot in the margin.
plant1_dc = df_plant1[df_plant1["DC_POWER"].gt(0)]
px.histogram(plant1_dc, x="DC_POWER", marginal="box")



In [None]:
# Distribution of Plant 2's strictly positive DC_POWER readings, with a box plot in the margin.
plant2_dc = df_plant2[df_plant2["DC_POWER"].gt(0)]
px.histogram(plant2_dc, x="DC_POWER", marginal="box")


In [None]:
# One histogram (with a marginal box plot) per entry of numerical_features, for Plant 1.
for feature in numerical_features:
  fig = px.histogram(df_plant1, x=feature, marginal="box")
  fig.show()

In [None]:
# One histogram (with a marginal box plot) per entry of numerical_features, for Plant 2.
for feature in numerical_features:
  fig = px.histogram(df_plant2, x=feature, marginal="box")
  fig.show()

In [None]:
# Weather columns to plot, one histogram (with a marginal box plot) each.
weather_feature_names = [
    'DATE_TIME',
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
]

for feature in weather_feature_names:
    fig = px.histogram(df_weather12, x=feature, marginal="box", height=300)
    fig.show()


# Bivariate analysis

**Features that have binary values**

* No features with binary values in solar dataset

**Features that have positive or negative correlation**

*   DAILY_YIELD increases linearly with time every day
*   AC_POWER and DC_POWER have a high degree of correlation
*  Power has a positive correlation with time until noon
* Power is also correlated to weather 


This checks the number of readings for each inverter in Plant 1.

In [None]:
# Non-null value count of every column per inverter (SOURCE_KEY) in Plant 1.
ds_plant1_grp_src = df_plant1.groupby("SOURCE_KEY")
ds_plant1_agg_inv = ds_plant1_grp_src.count().reset_index()
ds_plant1_agg_inv


This is to get the readings for all invertors throughout the day for all days.

In [None]:
# Mean of every numeric column per time of day, across all days and inverters of Plant 1.
ds_plant1_grp_time = df_plant1.groupby("TIME")
# numeric_only=True keeps the mean away from string/object columns such as
# SOURCE_KEY, which recent pandas versions refuse to average.
ds_plant1_agg_time = ds_plant1_grp_time.mean(numeric_only=True).reset_index()
ds_plant1_agg_time


Monthly mean for plant1

In [None]:
# Mean of every numeric column per calendar month (month name taken from DATE).
# numeric_only=True skips string/object columns that cannot be averaged.
ds_plant1_agg_dt = (
    df_plant1
    .groupby(df_plant1['DATE'].dt.month_name().rename('month'))
    .mean(numeric_only=True)
    .reset_index()
)
ds_plant1_agg_dt

Daily mean of power readings in plant 1

In [None]:
# Mean of every numeric column per day of Plant 1.
# numeric_only=True skips string/object columns that cannot be averaged.
ds_plant1_agg_dt_D = df_plant1.groupby('DATE').mean(numeric_only=True).reset_index()
ds_plant1_agg_dt_D


Looking at average power readings throughout the day, AC_POWER and DC_POWER are normally distributed. Daily yield is cumulative and increases linearly. Total yield is more spread.

In [None]:
# Pairwise scatter plots of the per-time-of-day means, coloured by DAILY_YIELD.
fig = px.scatter_matrix(
    ds_plant1_agg_time,
    color="DAILY_YIELD",
    title="plant 1 Time Data",
)
fig.update_layout(
    width=1600,
    height=1600,
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)
fig.show()


Below are the average monthly readings for May and June.

In [None]:
# Pairwise scatter plots of the per-month means of Plant 1.
fig = px.scatter_matrix(
    ds_plant1_agg_dt,
    title="plant 1 Month Data",
)
fig.update_layout(
    width=1200,
    height=1200,
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)
fig.show()

Concatenate the two plants dataframes and separate numerical from non-numerical features

In [None]:
# Stack the two plants' readings into one frame.
# ignore_index=True gives the result a unique 0..n-1 index; without it the row
# labels of Plant 2 repeat those of Plant 1.
df_plant12 = pd.concat([df_plant1, df_plant2], axis=0, ignore_index=True)
# pd.to_datetime returns a new Series, so assign it back for it to take effect.
df_plant12['DATE_TIME'] = pd.to_datetime(df_plant12['DATE_TIME'])
label = "DAILY_YIELD"

numerical_features = np.array(["DATE_TIME", "DC_POWER",
                               "AC_POWER", "DAILY_YIELD", "TOTAL_YIELD", "DATE", "TIME"])
# Every remaining column that is neither listed above nor the label.
non_numerical_features = np.array([column for column in df_plant12.columns
                                   if column not in numerical_features and
                                      column != label])


Below is a scatter matrix of the combined Plant 1 and Plant 2 readings for May and June.

In [None]:
# Scatter matrix over the columns of the combined frame, coloured by the label.
# The figure is sized at 200 px per entry of numerical_features.
side_px = len(numerical_features) * 200
fig = px.scatter_matrix(
    df_plant12,
    color=label,
    title="Combined Monthly Data",
)
fig.update_layout(
    width=side_px,
    height=side_px,
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)
fig.show()

Plant 1 seems to have an average count of 3000+
In Plant2 the counts are less for some Inverters. Either shutdown or no data 

In [None]:
# One bar per inverter (SOURCE_KEY) of Plant 1.
# NOTE(review): no y column is passed, so histfunc='sum' has nothing to sum;
# presumably the bars show row counts. Confirm against plotly's histfunc docs.
fig = px.histogram(df_plant1, x="SOURCE_KEY",  histfunc='sum', title ="Invertors(Plant1)")
fig.show()

Some of the inverter readings are missing for plant 2

In [None]:
# One bar per inverter (SOURCE_KEY) of Plant 2.
# NOTE(review): no y column is passed, so histfunc='sum' has nothing to sum;
# presumably the bars show row counts. Confirm against plotly's histfunc docs.
fig = px.histogram(df_plant2, x="SOURCE_KEY",  histfunc='sum', title ="Invertors(Plant2)")
fig.show()

# Additional insights

Invertor comparison between plant 1 and 2 (DC Yields and AC Yield per Invertor)
Invertors are performing suboptimally in plant 2

In [None]:

# Per-inverter DC_POWER distribution: Plant 1 in the top row, Plant 2 below.
# add_trace(..., row=, col=) is used because append_trace is deprecated in plotly.
fig = make_subplots(rows=2, cols=1)
fig.add_trace(
    go.Box(x=df_plant1['SOURCE_KEY'], y=df_plant1['DC_POWER'], name='Plant1 Invertor DC_POWER YIELD'),
    row=1, col=1,
)
fig.add_trace(
    go.Box(x=df_plant2['SOURCE_KEY'], y=df_plant2['DC_POWER'], name='Plant2 Invertor DC_POWER YIELD'),
    row=2, col=1,
)
fig.show()

In [None]:

# Per-inverter AC_POWER distribution: Plant 1 in the top row, Plant 2 below.
# add_trace(..., row=, col=) is used because append_trace is deprecated in plotly.
fig = make_subplots(rows=2, cols=1)
fig.add_trace(
    go.Box(x=df_plant1['SOURCE_KEY'], y=df_plant1['AC_POWER'], name='Plant1 Invertor AC_POWER YIELD'),
    row=1, col=1,
)
fig.add_trace(
    go.Box(x=df_plant2['SOURCE_KEY'], y=df_plant2['AC_POWER'], name='Plant2 Invertor AC_POWER YIELD'),
    row=2, col=1,
)
fig.show()

## Outliers - Plant 1 and 2 
Some AC, DC POWER are seen to be zero in the noon time zones
Could be inverter malfunctions or bad data?

In [None]:
# Power distribution per time of day in a 2x2 grid:
# rows are DC_POWER / AC_POWER, columns are Plant 1 / Plant 2.
# add_trace(..., row=, col=) is used because append_trace is deprecated in plotly.
fig = make_subplots(rows=2, cols=2)

panels = [
    (df_plant1, 'DC_POWER', 'Plant1 Daily DC_POWER YIELD', 1, 1),
    (df_plant2, 'DC_POWER', 'Plant2 Daily DC_POWER YIELD', 1, 2),
    (df_plant1, 'AC_POWER', 'Plant1 Daily AC_POWER YIELD', 2, 1),
    (df_plant2, 'AC_POWER', 'Plant2 Daily AC_POWER YIELD', 2, 2),
]
for plant_df, power_col, trace_name, grid_row, grid_col in panels:
    fig.add_trace(
        go.Box(x=plant_df['TIME'], y=plant_df[power_col], name=trace_name),
        row=grid_row, col=grid_col,
    )
fig.show()

In [None]:
# Time-of-day and calendar-date parts of Plant 1's DATE_TIME, printed for inspection.
timeinfoplant1 = df_plant1['DATE_TIME'].dt.time
dateinfoplant1 = df_plant1['DATE_TIME'].dt.date
print(timeinfoplant1)
print(dateinfoplant1)


Correlation of features in Plant 1. The correlations are mostly positive.

In [None]:
# Correlation heat map of Plant 1's numeric columns.
# Non-numeric columns (e.g. SOURCE_KEY) are dropped first, because DataFrame.corr
# raises on them in recent pandas versions.
px.imshow(df_plant1.select_dtypes(include="number").corr())

## Correlation - Power

In [None]:
# Pearson correlation of Plant 1's numeric columns within the combined frame.
# Rows are selected by the PLANT_ID value(s) present in df_plant1 instead of a
# hard-coded literal: the literal used here before (4136001) is the ID that the
# yield-comparison cell of this notebook assigns to Plant 2.
is_plant1 = df_plant12.PLANT_ID.isin(df_plant1.PLANT_ID.unique())
# Restrict to numeric columns; DataFrame.corr raises on strings in recent pandas.
df_plant12[is_plant1].select_dtypes(include="number").corr(method='pearson')


In [None]:
# Pearson correlation of Plant 2's numeric columns within the combined frame.
# Rows are selected by the PLANT_ID value(s) present in df_plant2 instead of a
# hard-coded literal: the literal used here before (4135001) is the ID that the
# yield-comparison cell of this notebook assigns to Plant 1.
is_plant2 = df_plant12.PLANT_ID.isin(df_plant2.PLANT_ID.unique())
# Restrict to numeric columns; DataFrame.corr raises on strings in recent pandas.
df_plant12[is_plant2].select_dtypes(include="number").corr(method='pearson')


In [None]:
# Correlation heat map of the numeric columns of the combined plant frame.
# Non-numeric columns are dropped first; DataFrame.corr raises on them in recent pandas.
corr = df_plant12.select_dtypes(include="number").corr()

trace = go.Heatmap(z=corr.values,
                  x=corr.index.values,
                  y=corr.columns.values)
data = [trace]
fig = go.Figure(
    data, layout_title_text="Combined Power correlation"
)
fig.show()

## Scatter Plots for Plant 1 and 2 combined

Scatter Plot for Plant 1 and 2 combined

In [None]:
# Format DATE_TIME as day-level strings on a plotting copy only. Overwriting the
# column inside df_plant12 would make this cell fail on a second run (.dt needs
# datetimes) and would hand strings instead of timestamps to every later cell.
plant12_plot = df_plant12.assign(
    DATE_TIME=df_plant12["DATE_TIME"].dt.strftime("%d-%m-%Y")
)
fig = px.scatter_matrix(plant12_plot, dimensions=numerical_features, color=label)

# 200 px per plotted dimension.
fig.update_layout(width=len(numerical_features) * 200,
                 height=len(numerical_features) * 200,
                 margin=dict(l=0, r=0, t=0, b=0))

fig.show()



## Comparison of Plant1 and Plant2 YIELD
* Daily yield comparison of Plant 1 and Plant 2.
* AC and DC outputs of Plant 1 and Plant 2.

**Plant 2 does not appear to perform as well as Plant 1.**

In [None]:
# Mean DC power, AC power and daily yield per plant and time of day.
# NOTE: result12 is created by the merge cell under "Merging Plant and Weather
# Data" further down; that cell must have been executed before this one.
power_cols = ['DC_POWER', 'AC_POWER', 'DAILY_YIELD']
result_plant_dt = (
    result12
    .groupby(['PLANT_ID', 'TIME'])[power_cols]
    .mean()
    .reset_index()
)
result_plant_dt

In [None]:
# NOTE: result12 is created by the merge cell under "Merging Plant and Weather
# Data" further down; that cell must have been executed before this one.

# Step 1: per plant, inverter and day, take the maximum of each column.
result_plant = result12.groupby(['PLANT_ID','SOURCE_KEY_x','DATE']).agg({'DC_POWER': 'max', 'AC_POWER': 'max', 'DAILY_YIELD': 'max', 'IRRADIATION': 'max','MODULE_TEMPERATURE':'max' }).reset_index()
# Step 2: per plant and day, sum the power/yield maxima over the inverters and
# keep the maximum irradiation and module temperature.
result_plant_Model1 = result_plant.groupby(['PLANT_ID','DATE']).agg({'DC_POWER': 'sum', 'AC_POWER': 'sum', 'DAILY_YIELD': 'sum', 'IRRADIATION': 'max','MODULE_TEMPERATURE':'max' }).reset_index()
df_p1ant1_dt = result_plant_Model1[result_plant_Model1['PLANT_ID']==4135001]
df_p1ant2_dt = result_plant_Model1[result_plant_Model1['PLANT_ID']==4136001]

# Maximum power and yield per plant and time of day.
result_plant_dt = result12.groupby(['PLANT_ID','TIME']).agg({'DC_POWER': 'max', 'AC_POWER': 'max', 'DAILY_YIELD': 'max', }).reset_index()

# One time-of-day frame per plant. The second assignment used to overwrite
# df_p1ant1_time, which left df_p1ant2_time undefined.
df_p1ant1_time = result_plant_dt[result_plant_dt['PLANT_ID']==4135001]
df_p1ant2_time = result_plant_dt[result_plant_dt['PLANT_ID']==4136001]

# Daily yield of both plants on shared axes.
# add_trace(..., row=, col=) is used because append_trace is deprecated in plotly.
fig = make_subplots(rows=1, cols=1)

fig.add_trace(go.Scatter(x=df_p1ant1_dt['DATE'],
                         y=df_p1ant1_dt['DAILY_YIELD'],
                         name='Plant1 Daily Yield'), row=1, col=1)
fig.add_trace(go.Scatter(x=df_p1ant2_dt['DATE'],
                         y=df_p1ant2_dt['DAILY_YIELD'],
                         name='Plant2 Daily Yield'), row=1, col=1)
fig.update_layout(height=400, width=400, title_text="Daily yeild Plant1 and Plant 2")
fig.show()


In [None]:
# Reload the weather sensor data of both plants so the following cells start
# from freshly parsed DATE_TIME values.
df_weather1 = pd.read_csv(filename3, parse_dates=['DATE_TIME'], infer_datetime_format=True)
df_weather2 = pd.read_csv(filename4, parse_dates=['DATE_TIME'], infer_datetime_format=True)
# ignore_index=True gives the stacked frame a unique 0..n-1 index; without it the
# row labels of the second file repeat those of the first.
df_weather12 = pd.concat([df_weather1, df_weather2], axis=0, ignore_index=True)
df_weather12.info()

In [None]:
# DATE_TIME is parsed by read_csv(parse_dates=...), so no separate
# pd.to_datetime call is needed before using the .dt accessor.
timeinfoweather12 = df_weather12['DATE_TIME'].dt.time
dateinfoweather12 = df_weather12['DATE_TIME'].dt.date


In [None]:
# Add the same DATE (datetime64 day) and TIME (datetime.time) helper columns
# that the plant frames carry; they are the join keys for the later merge.
df_weather12['DATE'] = pd.to_datetime(dateinfoweather12.values)
df_weather12['TIME'] = timeinfoweather12
# Show the three time columns side by side as a quick visual check.
df_weather12[['DATE_TIME', 'DATE', 'TIME']].head()

In [None]:

# Weather columns treated as numeric in the plots below.
numerical_features_weather = np.array([
    "DATE_TIME", "AMBIENT_TEMPERATURE", "MODULE_TEMPERATURE",
    "IRRADIATION", "DATE", "TIME",
])
# Every remaining column of the weather frame.
non_numerical_features_weather = np.array([
    column
    for column in df_weather12.columns
    if not (column in numerical_features_weather)
])
print(numerical_features_weather)
print(non_numerical_features_weather)

## Weather Data - scatter plots from sensors in plant1 and 2

Irradiation, module temperature and ambient temperature are each roughly normally distributed over the time of day. There is a positive correlation between irradiation and temperature.

In [None]:
df_weather12.info()
# DATE_TIME is not one of the plotted dimensions, so it is deliberately left as
# datetime64. Overwriting it with formatted strings here would make the cell
# fail on a second run and hand strings to every later cell.
# The list is named weather_dims so that it does not shadow the builtin `list`.
weather_dims = ["AMBIENT_TEMPERATURE","MODULE_TEMPERATURE", "IRRADIATION","DATE","TIME","SOURCE_KEY"]
fig = px.scatter_matrix(df_weather12, dimensions=weather_dims, color='PLANT_ID')

# 200 px per plotted dimension.
fig.update_layout(width=len(weather_dims) * 200,
                 height=len(weather_dims) * 200,
                 margin=dict(l=0, r=0, t=0, b=0))

fig.show()


In [None]:
df_weather12.info()

# Mean of every numeric weather column per time of day, across all days and both plants.
# numeric_only=True skips string/object columns that cannot be averaged.
ds_weather12_grp_time = (
    df_weather12
    .groupby("TIME")
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Mean irradiation against time of day.
fig = px.line(ds_weather12_grp_time, x="TIME", y="IRRADIATION", title='TIME-IRRADIATION')
fig.show()

In [None]:
# Mean module temperature against time of day.
fig = px.line(ds_weather12_grp_time, x="TIME", y="MODULE_TEMPERATURE", title='TIME-MODULE_TEMPERATURE')
fig.show()

In [None]:
# Correlation matrix of the numeric weather columns.
# Non-numeric columns are dropped first; DataFrame.corr raises on them in recent pandas.
df_weather12.select_dtypes(include="number").corr()

In [None]:
# Mean module temperature against mean ambient temperature (one point per time of day).
fig = px.line(ds_weather12_grp_time, x="AMBIENT_TEMPERATURE", y="MODULE_TEMPERATURE", title='AMBIENT-MODULE_TEMPERATURE')
fig.show()

## Merging Plant and Weather Data - Scatter Plot

In [None]:
# Left-join every plant reading to the weather row that shares its DATE, TIME and
# PLANT_ID, then write the merged frame to Merged.csv in the working directory.
result12 = df_plant12.merge(
    df_weather12,
    how="left",
    on=["DATE", "TIME", "PLANT_ID"],
)
result12.to_csv('Merged.csv')


In [None]:
result12.info()

# Columns of the merged frame to plot against each other.
# Named merged_dims so that it does not shadow the builtin `list`.
merged_dims = ['DATE_TIME_x', 'PLANT_ID', 'SOURCE_KEY_x', 'DC_POWER', 'AC_POWER', 'DAILY_YIELD',
               'TOTAL_YIELD', 'DATE', 'TIME', 'DATE_TIME_y', 'SOURCE_KEY_y', 'AMBIENT_TEMPERATURE',
               'MODULE_TEMPERATURE', 'IRRADIATION']
fig = px.scatter_matrix(result12, dimensions=merged_dims)

# 200 px per plotted dimension.
fig.update_layout(width=len(merged_dims) * 200,
                 height=len(merged_dims) * 200,
                 margin=dict(l=0, r=0, t=0, b=0))

fig.show()

## Correlation-Power and Weather(Heat Map)

In [None]:
# Correlation heat map of the numeric columns of the merged plant + weather frame.
# Non-numeric columns are dropped first; DataFrame.corr raises on them in recent pandas.
corr = result12.select_dtypes(include="number").corr()

trace = go.Heatmap(z=corr.values,
                  x=corr.index.values,
                  y=corr.columns.values)
data = [trace]
fig = go.Figure(
    data, layout_title_text="Combined Power and Weather correlation"
)
fig.show()

## DC to AC Power Conversion ratio

In [None]:
# AC power as a percentage of DC power.
# Rows where both powers are zero give 0/0 = NaN; those are set to 0.0.
# fillna returns a new Series, so it is chained into the assignment.
result12['DC_to_AC_Conversion'] = (
    result12['AC_POWER'] / result12['DC_POWER'] * 100
).fillna(0.0)
result12.info()

In [None]:
# DC-to-AC conversion over time, one figure per plant, coloured by inverter.
# Each plant's rows are matched on the PLANT_ID values found in that plant's own
# frame, so the "Plant1"/"Plant2" titles always describe the data they show.
# (The literals used here before were swapped relative to the yield-comparison cell.)
for plant_label, plant_df in (("Plant1", df_plant1), ("Plant2", df_plant2)):
    temp = result12[result12['PLANT_ID'].isin(plant_df['PLANT_ID'].unique())]
    fig = px.scatter(temp, x="DATE_TIME_x", y="DC_to_AC_Conversion",
                     title='DATE vs DC-AC conversion-' + plant_label,
                     color='SOURCE_KEY_x', hover_data=['PLANT_ID'])
    fig.show()



In [None]:
# Sum of every numeric column per day and per time of day.
# numeric_only=True keeps the sum away from string, datetime and time columns,
# which recent pandas versions either concatenate or reject.
result12_dt = result12.groupby('DATE').sum(numeric_only=True).reset_index()

result12_time = result12.groupby('TIME').sum(numeric_only=True).reset_index()


## Irradiation and Power - 

Even when irradiation is high, there are some readings with zero AC and DC power generation, which indicates a problem.

In [None]:
# Power against irradiation, coloured by plant.
# Named irradiation_dims so that it does not shadow the builtin `list`.
irradiation_dims = ['DC_POWER', 'AC_POWER', 'IRRADIATION']
fig = px.scatter_matrix(result12, dimensions=irradiation_dims, color='PLANT_ID')

# 200 px per plotted dimension.
fig.update_layout(width=len(irradiation_dims) * 200,
                 height=len(irradiation_dims) * 200,
                 margin=dict(l=0, r=0, t=0, b=0))

fig.show()

## Module Temperature and Power

In [None]:
# Power against ambient and module temperature, coloured by plant.
# Named temperature_dims so that it does not shadow the builtin `list`.
temperature_dims = ['DC_POWER', 'AC_POWER', 'AMBIENT_TEMPERATURE',
                    'MODULE_TEMPERATURE']
fig = px.scatter_matrix(result12, dimensions=temperature_dims, color='PLANT_ID')

# 200 px per plotted dimension.
fig.update_layout(width=len(temperature_dims) * 200,
                 height=len(temperature_dims) * 200,
                 margin=dict(l=0, r=0, t=0, b=0))

fig.show()

# Feature Engineering

**Given the above analysis we will need to do the following feature engineering:**

* Encoding: categorical data such as "Source key" and "Plant ID" encoded as numerical data.
* Decompose: Date_Time, ambient and module temperature.
* Scale: all our features in both datasets.
* Transform: existing features that we would like to extract new features from, e.g. daily yield.


# DAU: Data Acquisition and Understanding

**Additional datasets that would be useful**

In order to improve the accuracy of our future QuAM, it would be useful to have access to more detailed weather data for the location of the plants. Unfortunately, the location of the plants is not given by the providers of the data, but if we had access to it we could augment our data with [this dataset](https://www.kaggle.com/hiteshsoneji/historical-weather-data-for-indian-cities), which provides historical weather data for 8 Indian cities. If we could determine the location of the plants and find similar weather data for that location, overlapping with the period in which our readings were taken, this would improve the accuracy of our QuAM.


---



**Reconsidering our problem definition in light of the data analysis**



*   Problem definition:
*   Outcome:
* Action:
* Judgement:
* Context: 
* Ethical concerns:


