In [None]:
import os

def scale_input_data(scale_factor):
  file_bases = ['./input/Plant_1_Generation_Data', './input/Plant_1_Weather_Sensor_Data',
                './input/Plant_2_Generation_Data', './input/Plant_2_Weather_Sensor_Data']
  for file_base in file_bases:
    import pandas as pd
    import shutil
    if scale_factor == 1.0:
      shutil.copyfile(file_base + '.csv', file_base + '.scaled.csv')
      continue
    df_to_scale = pd.read_csv(file_base + '.csv')
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      while len(df_to_scale) < new_num_rows:
        df_to_scale = pd.concat([df_to_scale, df_to_scale[:min(new_num_rows - len(df_to_scale), len(df_to_scale))]])
    df_to_scale.to_csv(file_base + '.scaled.csv', index=False)

if 'INPUT_SCALE_FACTOR' in os.environ:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
exec(os.environ['IREWR_IMPORTS'])
# FIRST-AUTHOR: remove plotting
# import seaborn as sns
# import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# FIRST-AUTHOR: remove path printing
# import os
# for dirname, _, filenames in os.walk('./input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
mydateparser = lambda x: pd.datetime.strptime(x, "%d-%m-%Y %H:%M")


### Let's concentrate on plant 1 for the time being

In [3]:
gen_data = pd.read_csv("./input/Plant_1_Generation_Data.scaled.csv",index_col = "DATE_TIME",parse_dates = ["DATE_TIME"] , date_parser = mydateparser)

  mydateparser = lambda x: pd.datetime.strptime(x, "%d-%m-%Y %H:%M")


In [4]:
gen_data.head()

Unnamed: 0_level_0,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-05-15,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6259559.0
2020-05-15,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.0,6183645.0
2020-05-15,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.0,6987759.0
2020-05-15,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.0,7602960.0
2020-05-15,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.0,7158964.0


In [5]:
gen_data["Date"] = pd.to_datetime(gen_data.index.map(lambda x : x.date()))
gen_data["Time"] = gen_data.index.map(lambda x : x.time())

In [6]:
gen_data.loc[(gen_data["DC_POWER"] == 0) & (gen_data["AC_POWER"] != 0)]

Unnamed: 0_level_0,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,Date,Time
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In [7]:
gen_data.loc[(gen_data["DC_POWER"] != 0) & (gen_data["AC_POWER"] == 0)]

Unnamed: 0_level_0,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,Date,Time
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


### Exploration

Check for NaNs

In [8]:
[any(pd.isnull(gen_data[column])) for column in gen_data.columns]

[False, False, False, False, False, False, False, False]

In [9]:
numeric_columns = [column for column in gen_data.columns if gen_data[column].dtype in ['int64','float64']]
other_columns = [column for column in gen_data.columns if column not in numeric_columns]

In [10]:
[any(np.isnan(gen_data[column])) for column in numeric_columns]

[False, False, False, False, False]

In [11]:
gen_data["Date"].head()

DATE_TIME
2020-05-15   2020-05-15
2020-05-15   2020-05-15
2020-05-15   2020-05-15
2020-05-15   2020-05-15
2020-05-15   2020-05-15
Name: Date, dtype: datetime64[ns]

Excellent, no NaNs or empty entries

Plot power production as a function of time for a few inverters

In [12]:
gen_data.columns

Index(['PLANT_ID', 'SOURCE_KEY', 'DC_POWER', 'AC_POWER', 'DAILY_YIELD',
       'TOTAL_YIELD', 'Date', 'Time'],
      dtype='object')

In [13]:
len(gen_data["SOURCE_KEY"].unique())

22

In [14]:
inverters = gen_data["SOURCE_KEY"].unique()

Visualizing a time lapse of power production for all inverters on a given day (15/05/2020 in this case)

In [15]:
# FIRST-AUTHOR: remove plotting
# fig = plt.figure(figsize = (25,16))
for i,inverter in enumerate(inverters,1):
# FIRST-AUTHOR: remove plotting
#     plt.subplot(6,4,i)
#     plt.yscale("log")
#     gen_data.loc[(gen_data["Date"] == "2020-05-15") &  (gen_data["SOURCE_KEY"] == inverter),"DC_POWER"].plot(label = inverter + " DC")
#     gen_data.loc[(gen_data["Date"] == "2020-05-15") & (gen_data["SOURCE_KEY"] == inverter),"AC_POWER"].plot(label = inverter + " AC")
#     plt.legend()
    gen_data.loc[(gen_data["Date"] == "2020-05-15") &  (gen_data["SOURCE_KEY"] == inverter),"DC_POWER"]
    gen_data.loc[(gen_data["Date"] == "2020-05-15") & (gen_data["SOURCE_KEY"] == inverter),"AC_POWER"]

* DC and AC seem to follow each other i.e., inverter seems to produce DC and convert it to AC (sanity check)
* Inverters run from roughly 6AM to roughly 6PM

In [16]:
gen_data.groupby("SOURCE_KEY").count()

Unnamed: 0_level_0,PLANT_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,Date,Time
SOURCE_KEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1BY6WEcLGh8j5v7,3154,3154,3154,3154,3154,3154,3154
1IF53ai7Xc0U56Y,3119,3119,3119,3119,3119,3119,3119
3PZuoBAID5Wc2HD,3118,3118,3118,3118,3118,3118,3118
7JYdWkrLSPkdwr4,3133,3133,3133,3133,3133,3133,3133
McdE0feGgRqW7Ca,3124,3124,3124,3124,3124,3124,3124
VHMLBKoKgIrUVDU,3133,3133,3133,3133,3133,3133,3133
WRmjgnKYAwPKWDb,3118,3118,3118,3118,3118,3118,3118
YxYtjZvoooNbGkE,3104,3104,3104,3104,3104,3104,3104
ZnxXDlPa8U1GXgE,3130,3130,3130,3130,3130,3130,3130
ZoEaEvLYb1n2sOq,3123,3123,3123,3123,3123,3123,3123


In [17]:
34 * 24 * 4 #Number of data points required

3264

So not all plants have data at all points in time within a data taking period. I'm going to assume the plant is compact enough that the inverters are quite close to each other and get the same amount of solar irradiation at any point in time (the distribution plots roughly attest to this). So an imputing strategy will be to use the average for that day and that hour to fill a missing entry. This means the total power produced in a given time interval is given by the average produced multiplied by the total number of inverters

#### Day totals over the entire period of 34 days for each inverter

In [18]:
gen_data["Date"].unique()

array(['2020-05-15T00:00:00.000000000', '2020-05-16T00:00:00.000000000',
       '2020-05-17T00:00:00.000000000', '2020-05-18T00:00:00.000000000',
       '2020-05-19T00:00:00.000000000', '2020-05-20T00:00:00.000000000',
       '2020-05-21T00:00:00.000000000', '2020-05-22T00:00:00.000000000',
       '2020-05-23T00:00:00.000000000', '2020-05-24T00:00:00.000000000',
       '2020-05-25T00:00:00.000000000', '2020-05-26T00:00:00.000000000',
       '2020-05-27T00:00:00.000000000', '2020-05-28T00:00:00.000000000',
       '2020-05-29T00:00:00.000000000', '2020-05-30T00:00:00.000000000',
       '2020-05-31T00:00:00.000000000', '2020-06-01T00:00:00.000000000',
       '2020-06-02T00:00:00.000000000', '2020-06-03T00:00:00.000000000',
       '2020-06-04T00:00:00.000000000', '2020-06-05T00:00:00.000000000',
       '2020-06-06T00:00:00.000000000', '2020-06-07T00:00:00.000000000',
       '2020-06-08T00:00:00.000000000', '2020-06-09T00:00:00.000000000',
       '2020-06-10T00:00:00.000000000', '2020-06-11

In [19]:
gen_data.groupby("SOURCE_KEY").sum()["DC_POWER"]

  gen_data.groupby("SOURCE_KEY").sum()["DC_POWER"]


SOURCE_KEY
1BY6WEcLGh8j5v7    9.063716e+06
1IF53ai7Xc0U56Y    1.006294e+07
3PZuoBAID5Wc2HD    1.004313e+07
7JYdWkrLSPkdwr4    9.815350e+06
McdE0feGgRqW7Ca    1.003212e+07
VHMLBKoKgIrUVDU    1.005725e+07
WRmjgnKYAwPKWDb    9.838489e+06
YxYtjZvoooNbGkE    9.869145e+06
ZnxXDlPa8U1GXgE    9.997405e+06
ZoEaEvLYb1n2sOq    9.803128e+06
adLQvlD726eNBSB    1.019778e+07
bvBOhCH3iADSZry    8.925490e+06
iCRJl6heRkivqQ3    9.987733e+06
ih0vzX44oOqAx2f    9.695845e+06
pkci93gMrogZuBj    9.895623e+06
rGa61gmuvPhdLxV    9.881965e+06
sjndEbLyjtCKgGv    9.835197e+06
uHbuxQJl8lW7ozc    9.985363e+06
wCURE6d3bPkepu2    9.911622e+06
z9Y9gH1T5YWrNuG    9.797002e+06
zBIq5rxdHJRwDNY    9.820452e+06
zVJPv84UY57bAof    9.956927e+06
Name: DC_POWER, dtype: float64

In [20]:
#split the dataframes by inverter IDs first
split_by_inverters = {}
for inverter in inverters:
    split_by_inverters[inverter] = gen_data.loc[gen_data["SOURCE_KEY"] == inverter]

In [21]:
unique_dates = gen_data.index.map(lambda x : x.date()).unique()

In [22]:
temp = split_by_inverters['1BY6WEcLGh8j5v7']

In [23]:
# FIRST-AUTHOR: remove plotting
# fig = plt.figure(figsize = (30,25))
inverter_daily_power = {}
for i,(inverter,data) in enumerate(split_by_inverters.items(),1):
# FIRST-AUTHOR: remove plotting
#     plt.subplot(6,4,i)
    inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
# FIRST-AUTHOR: remove plotting
#     inverter_daily_power[inverter].plot(label = inverter)
#     plt.legend()
    inverter_daily_power[inverter]

  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = data.groupby("Date").sum()["DC_POWER"]
  inverter_daily_power[inverter] = dat

We see a very rough pattern in the daily power production. Let us try to correlate this with the weather data we have

In [24]:
weather_data = pd.read_csv("./input/Plant_1_Weather_Sensor_Data.scaled.csv",index_col = "DATE_TIME",parse_dates = True)

In [25]:
weather_data["Date"] = pd.to_datetime(weather_data.index.map(lambda x : x.date()))
weather_data["Time"] = weather_data.index.map(lambda x : x.time())

In [26]:
weather_data.head()

Unnamed: 0_level_0,PLANT_ID,SOURCE_KEY,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,Date,Time
DATE_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-15 00:00:00,4135001,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0,2020-05-15,00:00:00
2020-05-15 00:15:00,4135001,HmiyD2TTLFNqkNe,25.084589,22.761668,0.0,2020-05-15,00:15:00
2020-05-15 00:30:00,4135001,HmiyD2TTLFNqkNe,24.935753,22.592306,0.0,2020-05-15,00:30:00
2020-05-15 00:45:00,4135001,HmiyD2TTLFNqkNe,24.84613,22.360852,0.0,2020-05-15,00:45:00
2020-05-15 01:00:00,4135001,HmiyD2TTLFNqkNe,24.621525,22.165423,0.0,2020-05-15,01:00:00


Temperature and irradiation profile in a given day (15-05-2020)

In [27]:
# FIRST-AUTHOR: remove plotting
# fig = plt.figure(figsize = (18,4))
# plt.subplot(131)
# weather_data.loc[(weather_data["Date"] == "2020-05-15"), "IRRADIATION"].plot(legend = True)
# #plt.legend()
# plt.subplot(132)
# weather_data.loc[(weather_data["Date"] == "2020-05-15"),"AMBIENT_TEMPERATURE"].plot(legend = True)
# plt.subplot(133)
# weather_data.loc[(weather_data["Date"] == "2020-05-15 00:00:00"),"MODULE_TEMPERATURE"].plot(legend = True)
weather_data.loc[(weather_data["Date"] == "2020-05-15"), "IRRADIATION"]
weather_data.loc[(weather_data["Date"] == "2020-05-15"),"AMBIENT_TEMPERATURE"]
weather_data.loc[(weather_data["Date"] == "2020-05-15 00:00:00"),"MODULE_TEMPERATURE"]

DATE_TIME
2020-05-15 00:00:00    22.857507
2020-05-15 00:15:00    22.761668
2020-05-15 00:30:00    22.592306
2020-05-15 00:45:00    22.360852
2020-05-15 01:00:00    22.165423
                         ...    
2020-05-15 22:00:00    20.895704
2020-05-15 22:15:00    20.732186
2020-05-15 22:30:00    20.829852
2020-05-15 22:45:00    21.072276
2020-05-15 23:00:00    21.241317
Name: MODULE_TEMPERATURE, Length: 93, dtype: float64

In [28]:
# FIRST-AUTHOR: remove plotting
# sns.scatterplot(x = weather_data.loc[(weather_data["Date"] == "2020-05-15"),"AMBIENT_TEMPERATURE"], y =weather_data.loc[(weather_data["Date"] == "2020-05-15"),"MODULE_TEMPERATURE"])
_ = weather_data.loc[(weather_data["Date"] == "2020-05-15"),"AMBIENT_TEMPERATURE"]
_ = weather_data.loc[(weather_data["Date"] == "2020-05-15"),"MODULE_TEMPERATURE"]

The thing with temperatures is that they're not instantaneous (i.e., a change in ambient temperature or irradiation will not immediately change the module temperature). The ambient temperature for example is sort of a delayed response to solar irradiation (because ground takes some time to heat up, and a lot of time to cool down). The module temperature gets complicated because it's influenced by the sun during the day and the cooling ground at night

In [29]:
#Timestamp of maximum irradiation on the 15th of May
weather_data.loc[(weather_data["Date"] == "2020-05-15"),"IRRADIATION"].idxmax()

Timestamp('2020-05-15 14:15:00')

In [30]:
#Timestamp of maximum ambient temperature on the 15th of May
weather_data.loc[(weather_data["Date"] == "2020-05-15"),"AMBIENT_TEMPERATURE"].idxmax()

Timestamp('2020-05-15 15:45:00')

In [31]:
weather_data.loc[(weather_data["Date"] == "2020-05-15"),"MODULE_TEMPERATURE"].idxmax()

Timestamp('2020-05-15 14:15:00')

Checking the above assertion that the ambient temperature peaks a while after maximum irradiation/module temperature for all days

In [32]:
inverter_daily_power.keys()

dict_keys(['1BY6WEcLGh8j5v7', '1IF53ai7Xc0U56Y', '3PZuoBAID5Wc2HD', '7JYdWkrLSPkdwr4', 'McdE0feGgRqW7Ca', 'VHMLBKoKgIrUVDU', 'WRmjgnKYAwPKWDb', 'ZnxXDlPa8U1GXgE', 'ZoEaEvLYb1n2sOq', 'adLQvlD726eNBSB', 'bvBOhCH3iADSZry', 'iCRJl6heRkivqQ3', 'ih0vzX44oOqAx2f', 'pkci93gMrogZuBj', 'rGa61gmuvPhdLxV', 'sjndEbLyjtCKgGv', 'uHbuxQJl8lW7ozc', 'wCURE6d3bPkepu2', 'z9Y9gH1T5YWrNuG', 'zBIq5rxdHJRwDNY', 'zVJPv84UY57bAof', 'YxYtjZvoooNbGkE'])

Rough plot of net solar irradiation vs Power produced in a given inverter over the data taking period

In [33]:
weather_data["date"] = weather_data.index.map(lambda x : x.date())
daily_irradiation = weather_data.groupby("date").sum()["IRRADIATION"]
# FIRST-AUTHOR: remove plotting
# sns.scatterplot(x = daily_irradiation, y = inverter_daily_power["1BY6WEcLGh8j5v7"])
_ = inverter_daily_power["1BY6WEcLGh8j5v7"]

  daily_irradiation = weather_data.groupby("date").sum()["IRRADIATION"]


The above plot is only an estimate because we have issues with data taking over the course of days (i.e., some intervals don't have data). We need to take the intersection of timestamps of the inverter output data and the irradiation data to get a better estimate

 #### Irradiation vs Different temperature metrics 

#### Max and Min temperature vs data taking period

In [34]:
max_temps = weather_data.groupby("date").max()["AMBIENT_TEMPERATURE"]
min_temps = weather_data.groupby("date").min()["AMBIENT_TEMPERATURE"]

In [35]:
# FIRST-AUTHOR: remove plotting
# plt.figure(figsize = (12,6))
# max_temps.plot(label = "Maximum Temperature")
# min_temps.plot(label = "Minimum Temperature")
# plt.legend()

In [36]:
max_temps = weather_data.groupby("date").max()["AMBIENT_TEMPERATURE"]
min_temps = weather_data.groupby("date").min()["AMBIENT_TEMPERATURE"]
diff_temps = max_temps - min_temps
daily_irradiation = weather_data.groupby("date").sum()["IRRADIATION"]

  daily_irradiation = weather_data.groupby("date").sum()["IRRADIATION"]


Irradiation vs (Maximum - Minimum) temperature

In [37]:
# FIRST-AUTHOR: remove plotting
# sns.scatterplot(daily_irradiation,diff_temps)

The first hint of nonzero radiation is when the sun appears in Line of Sight of the solar panels. The temperature at this point is our "baseline" temperature before any solar irradiation, and the maximum temperature is, well, the maximum. The difference between these two temperatures should tell us a measure of irradiation

In [38]:
temp_before_sunrise = weather_data.loc[(weather_data["Time"] < pd.to_datetime("07:00").time()) & (weather_data["IRRADIATION"] > 0)].groupby("date")["AMBIENT_TEMPERATURE"].min()

In [39]:
diff_temps = max_temps - temp_before_sunrise

In [40]:
# FIRST-AUTHOR: remove plotting
# sns.scatterplot(daily_irradiation,diff_temps)

The above temperature metric doesn't seem to be a very good indicator of irradiation. Temperature and irradiation seem to have a complicated relationship that could not be modelled linearly

Since not all inverters have readings for all intervals of time, we "impute" entries for the inverters (and subsequently calculate the total power produced in any instance of time) by using the average DC/AC power produced in that time period as the power produced by an inverter with a missing reading. This implies the total power produced in an interval of time is just the average for that time interval multiplied by the total number of inverters. 

Calculate the average DC and AC power produced for a given timestamp

In [41]:
average_power = gen_data.reset_index().groupby("DATE_TIME").mean()[["DC_POWER","AC_POWER"]]

  average_power = gen_data.reset_index().groupby("DATE_TIME").mean()[["DC_POWER","AC_POWER"]]


In [42]:
total_power = average_power * gen_data["PLANT_ID"].nunique()

In [43]:
total_power["Date"] = total_power.index.map(lambda x : x.date())
# FIRST-AUTHOR: remove plotting
# fig = plt.figure()
# total_power.groupby("Date").sum().plot()
# plt.yscale("log")
total_power.groupby("Date").sum()

Unnamed: 0_level_0,DC_POWER,AC_POWER
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-05-15,255798.931278,25039.251981
2020-05-16,263915.341722,25834.537338
2020-05-17,307254.297619,30044.335877
2020-05-18,218752.206168,21407.714042
2020-05-19,249834.902688,24453.503003
2020-05-20,244344.807736,23867.480336
2020-05-21,308260.988439,30127.611073
2020-05-22,291849.29551,28537.732181
2020-05-23,346828.053734,33903.831272
2020-05-24,325470.631493,31809.900487


In [44]:
# FIRST-AUTHOR: remove plotting
# sns.regplot(x = weather_data.groupby("Date")["IRRADIATION"].sum(), y = total_power.groupby("Date")["DC_POWER"].sum())
_ = weather_data.groupby("Date")["IRRADIATION"].sum()
_ = total_power.groupby("Date")["DC_POWER"].sum()

In [45]:
# FIRST-AUTHOR: remove ML code
# from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# model.fit(weather_data.groupby("Date")["IRRADIATION"].sum().values.reshape(-1,1),total_power.groupby("Date")["DC_POWER"].sum())
# model.intercept_,model.coef_
_ = weather_data.groupby("Date")["IRRADIATION"].sum().values.reshape(-1,1)
_ = total_power.groupby("Date")["DC_POWER"].sum()

In [46]:
# FIRST-AUTHOR: remove plotting
# sns.regplot(x = total_power.groupby("Date")["DC_POWER"].sum(),y = total_power.groupby("Date")["AC_POWER"].sum())
_ = total_power.groupby("Date")["DC_POWER"].sum()
_ = total_power.groupby("Date")["AC_POWER"].sum()

In [47]:
# FIRST-AUTHOR: remove ML code
# from sklearn.linear_model import LinearRegression

In [48]:
# FIRST-AUTHOR: remove ML code
# a = LinearRegression()
# a.fit(total_power.groupby("Date")["DC_POWER"].sum().values.reshape(-1,1),total_power.groupby("Date")["AC_POWER"].sum())
# a.intercept_,a.coef_
_ = total_power.groupby("Date")["DC_POWER"].sum().values.reshape(-1,1)
_ = total_power.groupby("Date")["AC_POWER"].sum()

The first plant has an efficiency of 9.74%

### Repeating these with plant 2 and cross-checking the plants

In [49]:
gen_data_2 = pd.read_csv("./input/Plant_2_Generation_Data.scaled.csv",index_col = "DATE_TIME",parse_dates = ["DATE_TIME"])

In [50]:
gen_data_2["Date"] = gen_data_2.index.map(lambda x : x.date())
gen_data_2["Time"] = gen_data_2.index.map(lambda x : x.time())

In [51]:
gen_data_2.groupby("SOURCE_KEY").count()

Unnamed: 0_level_0,PLANT_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,Date,Time
SOURCE_KEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4UPUqMRk7TRMgml,3195,3195,3195,3195,3195,3195,3195
81aHJ1q11NBPMrL,3259,3259,3259,3259,3259,3259,3259
9kRcWv60rDACzjR,3259,3259,3259,3259,3259,3259,3259
Et9kgGMDl729KT4,3195,3195,3195,3195,3195,3195,3195
IQ2d7wF4YD8zU1Q,2355,2355,2355,2355,2355,2355,2355
LYwnQax7tkwH5Cb,3259,3259,3259,3259,3259,3259,3259
LlT2YUhhzqhg5Sw,3259,3259,3259,3259,3259,3259,3259
Mx2yZCDsyf6DPfv,3195,3195,3195,3195,3195,3195,3195
NgDl19wMapZy17u,2355,2355,2355,2355,2355,2355,2355
PeE6FRyGXUgsRhN,3259,3259,3259,3259,3259,3259,3259


In [52]:
[any(pd.isnull(gen_data_2[column])) for column in gen_data_2.columns]

[False, False, False, False, False, False, False, False]

In [53]:
inverters = gen_data_2["SOURCE_KEY"].unique()
# FIRST-AUTHOR: remove plotting
# fig = plt.figure(figsize = (25,16))
for i,inverter in enumerate(inverters,1):
# FIRST-AUTHOR: remove plotting
#     plt.subplot(6,4,i)
#     plt.yscale("log")
#     gen_data_2.loc[(gen_data_2["Date"] == pd.to_datetime("2020-05-15")) &  (gen_data_2["SOURCE_KEY"] == inverter),"DC_POWER"].plot(label = inverter + " DC")
#     gen_data_2.loc[(gen_data_2["Date"] == pd.to_datetime("2020-05-15")) & (gen_data_2["SOURCE_KEY"] == inverter),"AC_POWER"].plot(label = inverter + " AC")
#     plt.legend()
    gen_data_2.loc[(gen_data_2["Date"] == pd.to_datetime("2020-05-15")) &  (gen_data_2["SOURCE_KEY"] == inverter),"DC_POWER"]
    gen_data_2.loc[(gen_data_2["Date"] == pd.to_datetime("2020-05-15")) & (gen_data_2["SOURCE_KEY"] == inverter),"AC_POWER"]

  gen_data_2.loc[(gen_data_2["Date"] == pd.to_datetime("2020-05-15")) &  (gen_data_2["SOURCE_KEY"] == inverter),"DC_POWER"]
  gen_data_2.loc[(gen_data_2["Date"] == pd.to_datetime("2020-05-15")) & (gen_data_2["SOURCE_KEY"] == inverter),"AC_POWER"]


In [54]:
inverters

array(['4UPUqMRk7TRMgml', '81aHJ1q11NBPMrL', '9kRcWv60rDACzjR',
       'Et9kgGMDl729KT4', 'IQ2d7wF4YD8zU1Q', 'LYwnQax7tkwH5Cb',
       'LlT2YUhhzqhg5Sw', 'Mx2yZCDsyf6DPfv', 'NgDl19wMapZy17u',
       'PeE6FRyGXUgsRhN', 'Qf4GUc1pJu5T6c6', 'Quc1TzYxW2pYoWX',
       'V94E5Ben1TlhnDV', 'WcxssY2VbP4hApt', 'mqwcsP2rE7J0TFp',
       'oZ35aAeoifZaQzV', 'oZZkBaNadn6DNKz', 'q49J1IKaHRwDQnt',
       'rrq4fwE8jgrTyWY', 'vOuJvMaM2sgwLmb', 'xMbIugepa2P7lBB',
       'xoJJ8DcxJEcupym'], dtype=object)

In [55]:
average_power_2 = gen_data_2.reset_index().groupby("DATE_TIME").mean()[["DC_POWER","AC_POWER"]]
total_power_2 = average_power_2 * gen_data_2["SOURCE_KEY"].nunique()
total_power_2["Date"] = total_power_2.index.map(lambda x : x.date())

  average_power_2 = gen_data_2.reset_index().groupby("DATE_TIME").mean()[["DC_POWER","AC_POWER"]]


In [56]:
weather_data_2 = pd.read_csv("./input/Plant_2_Weather_Sensor_Data.scaled.csv",index_col = "DATE_TIME",parse_dates = True)
weather_data_2["Date"] = weather_data_2.index.map(lambda x : x.date())
weather_data_2["Time"] = weather_data_2.index.map(lambda x : x.time())

In [57]:
# FIRST-AUTHOR: remove plotting
# sns.regplot(x = weather_data_2.groupby("Date")["IRRADIATION"].sum(),y = total_power_2.groupby("Date").sum()["DC_POWER"])
_ = weather_data_2.groupby("Date")["IRRADIATION"].sum()
_ = total_power_2.groupby("Date").sum()["DC_POWER"]

The fit is not as good as the first power plant. Maybe the nonlinearities play a bigger role here, but hey, we have 100% efficiency!

In [58]:
# FIRST-AUTHOR: remove plotting
# sns.regplot(x = total_power_2.groupby("Date").sum()["DC_POWER"], y = total_power_2.groupby("Date").sum()["AC_POWER"])
_ = total_power_2.groupby("Date").sum()["DC_POWER"]
_ = total_power_2.groupby("Date").sum()["AC_POWER"]

In [59]:
# FIRST-AUTHOR: remove ML code
# model_2 = LinearRegression()
# model_2.fit(total_power_2.groupby("Date").sum()["DC_POWER"].values.reshape(-1,1), y = total_power_2.groupby("Date").sum()["AC_POWER"])
# model_2.coef_,model_2.intercept_
_ = total_power_2.groupby("Date").sum()["DC_POWER"].values.reshape(-1,1)
_ = total_power_2.groupby("Date").sum()["AC_POWER"]

97% efficiency! (too good to be true!), compared to 10% in case of the other plant