In [8]:
import joblib
import itertools as it

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.style.use('seaborn-dark-palette')

font = {'family': 'normal',
        'weight': 'normal',
        'size'  : 16}
plt.tight_layout()

mpl.rc('font', **font)

sns.set_style("whitegrid")

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

pd.options.display.float_format = '{:,.4f}'.format

## Load wind power data

In [9]:
df_wind_power = pd.read_csv('../data/wind/wind_power_train.csv',
                            header=0,
                            index_col=0,
                            parse_dates=True)

In [10]:
df_wind_meteo = pd.read_csv('../data/wind/wind_meteo_train.csv',
                            header=[0, 1],
                            skiprows=[2],
                            index_col=0,
                            parse_dates=True)

In [11]:
df_wind_power.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2012-01-01 01:00:00,0.0,0.5963,0.4255,0.3782,0.2737,0.2681,0.0,0.0,0.0,0.5944
2012-01-01 02:00:00,0.0549,0.4112,0.3639,0.063,0.0868,0.0345,0.0147,0.015,0.0,0.5697
2012-01-01 03:00:00,0.1102,0.1672,0.2971,0.0367,0.0068,0.0206,0.0351,0.0718,0.0,0.3305
2012-01-01 04:00:00,0.1651,0.0373,0.2354,0.0343,0.0186,0.0019,0.0451,0.0668,0.0,0.2113
2012-01-01 05:00:00,0.1569,0.0512,0.1209,0.0336,0.0348,0.0152,0.0502,0.0348,0.0,0.1721


In [12]:
df_wind_meteo.head()

Unnamed: 0_level_0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10
Unnamed: 0_level_1,U10,V10,U100,V100,U10,V10,U100,V100,U10,V10,U100,V100,U10,V10,U100,V100,U10,V10,U100,V100,U10,V10,U100,V100,U10,V10,U100,V100,U10,V10,U100,V100,U10,V10,U100,V100,U10,V10,U100,V100
2012-01-01 01:00:00,2.1246,-2.682,2.8643,-3.6661,-0.1716,-5.466,-0.1126,-7.1013,2.1932,-4.6492,3.0284,-6.2216,0.5349,-3.6602,0.7654,-4.4867,0.5349,-3.6602,0.7654,-4.4867,1.2056,-2.4179,1.4678,-2.9561,-0.3075,0.3713,-0.6254,0.2985,-0.3075,0.3713,-0.6254,0.2985,1.9033,-1.495,2.5167,-2.0781,1.6558,-4.6496,2.0243,-5.9922
2012-01-01 02:00:00,2.5217,-1.797,3.3449,-2.4648,-0.0889,-4.6432,-0.0141,-5.8968,2.9701,-3.9443,3.9811,-5.2541,0.3308,-2.6764,0.4699,-3.2138,0.3308,-2.6764,0.4699,-3.2138,0.9811,-1.5776,1.1491,-1.9371,0.14,0.7516,0.025,0.7975,0.14,0.7516,0.025,0.7975,2.1733,-0.6436,2.8371,-1.0069,2.1786,-4.1849,2.6231,-5.3181
2012-01-01 03:00:00,2.6722,-0.8225,3.5084,-1.2141,-0.246,-3.6134,-0.2252,-4.4894,3.2389,-3.1445,4.3558,-4.1634,-0.0658,-2.0291,-0.0209,-2.4181,-0.0658,-2.0291,-0.0209,-2.4181,0.6133,-1.0112,0.6974,-1.2607,0.6761,1.4722,0.6748,1.6268,0.6761,1.4722,0.6748,1.6268,2.3127,0.1366,3.0341,-0.0155,2.2286,-3.2424,2.6452,-4.0522
2012-01-01 04:00:00,2.4575,-0.1436,3.2152,-0.3555,-0.6807,-2.9196,-0.772,-3.5988,3.261,-2.2288,4.3139,-2.9155,-0.4195,-1.7991,-0.4449,-2.1497,-0.4195,-1.7991,-0.4449,-2.1497,0.2427,-0.7467,0.2621,-0.9565,0.9072,2.2588,0.9369,2.6258,0.9072,2.2588,0.9369,2.6258,2.2176,0.7972,2.9418,0.8235,2.0772,-2.3451,2.4329,-2.9138
2012-01-01 05:00:00,2.2459,0.3896,2.9577,0.3327,-1.2611,-2.6198,-1.4875,-3.2447,2.94,-2.0401,3.9336,-2.6898,-0.7542,-1.6615,-0.8383,-1.9918,-0.7542,-1.6615,-0.8383,-1.9918,-0.1384,-0.5014,-0.1675,-0.6918,0.7394,2.9079,0.7206,3.5017,0.7394,2.9079,0.7206,3.5017,1.9006,1.2857,2.5592,1.4711,1.8147,-1.5776,2.0978,-1.9761


## Load solar power data

In [13]:
df_solar_power = pd.read_csv('../data/solar/solar_power_train.csv',
                             header=0,
                             index_col=0,
                             parse_dates=True)

df_solar_meteo = pd.read_csv('../data/solar/solar_meteo_train.csv',
                             header=[0, 1],
                             skiprows=[2],
                             index_col=0,
                             parse_dates=True)

In [14]:
df_solar_power.head()

Unnamed: 0_level_0,1,2,3
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-04-01 01:00:00,0.7541,0.6473,0.799
2012-04-01 02:00:00,0.555,0.6218,0.8175
2012-04-01 03:00:00,0.4384,0.4727,0.5715
2012-04-01 04:00:00,0.1454,0.0989,0.0638
2012-04-01 05:00:00,0.112,0.1044,0.1037


In [15]:
df_solar_meteo.head()

ZONEID,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
Unnamed: 0_level_1,VAR134,VAR134,VAR134,VAR157,VAR157,VAR157,VAR164,VAR164,VAR164,VAR165,VAR165,VAR165,VAR166,VAR166,VAR166,VAR167,VAR167,VAR167,VAR169,VAR169,VAR169,VAR175,VAR175,VAR175,VAR178,VAR178,VAR178,VAR228,VAR228,VAR228,VAR78,VAR78,VAR78,VAR79,VAR79,VAR79
2012-04-01 01:00:00,94843.625,94290.625,92667.625,60.2219,57.3743,57.0032,0.2446,0.191,0.2381,1.0393,0.8579,0.9862,-2.503,-2.8016,-3.1334,294.4485,294.3933,293.9036,2577830.0,2595302.0,2606438.0,1202532.0,1192092.0,1158284.0,2861797.0,2884677.0,2901861.0,0.0,0.0,0.0,0.002,0.0015,0.0035,0.0036,0.0054,0.0247
2012-04-01 02:00:00,94757.9375,94217.6875,92622.6875,54.6786,57.1298,64.9608,0.4571,0.5171,0.5616,2.4829,2.5941,2.6884,-2.9933,-2.6532,-2.5749,295.6514,294.9624,293.1965,5356093.0,5374973.0,5352637.0,2446757.0,2420485.0,2341093.0,5949378.0,5984578.0,5980930.0,0.0,0.0,0.0008,0.0055,0.0183,0.0275,0.0336,0.1016,0.1405
2012-04-01 03:00:00,94732.8125,94201.0625,92598.8125,61.2949,63.0761,66.7959,0.7714,0.7052,0.6616,3.3399,2.8814,2.8845,-1.9825,-1.6478,-1.8937,294.4546,293.8767,292.5925,7921788.0,7858492.0,7714300.0,3681336.0,3643656.0,3531304.0,8939176.0,8929064.0,8822632.0,0.0013,0.0013,0.0017,0.0301,0.0353,0.0324,0.132,0.1129,0.1366
2012-04-01 04:00:00,94704.0625,94156.0625,92542.0625,67.7753,62.9755,65.1776,0.9659,0.9934,0.9416,3.1061,3.0248,3.1757,-1.4461,-1.4968,-1.6399,293.2615,293.0713,292.1748,9860520.0,9719720.0,9619368.0,4921504.0,4885120.0,4740096.0,11331679.0,11258463.0,11178847.0,0.0025,0.0017,0.0021,0.0572,0.0645,0.032,0.1106,0.1078,0.0974
2012-04-01 05:00:00,94675.0,94124.25,92508.0,70.173,65.0538,65.7443,0.9447,0.9376,0.9447,2.6011,2.5216,2.6973,-1.9045,-1.8366,-1.8801,292.7329,292.5444,291.5845,11143097.0,11054009.0,11054009.0,6254380.0,6211372.0,6014412.0,13105558.0,13079318.0,13079958.0,0.0033,0.0021,0.0025,0.051,0.0595,0.0485,0.1896,0.1576,0.1638


In [17]:
df_solar_power.describe()

Unnamed: 0,1,2,3
count,15360.0,15360.0,15360.0
mean,0.1735,0.1923,0.1992
std,0.2628,0.2795,0.288
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0027,0.0024,0.003
75%,0.2848,0.3575,0.3707
max,0.9162,0.9778,1.0035


In [16]:
df_solar_meteo.describe()

ZONEID,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
Unnamed: 0_level_1,VAR134,VAR134,VAR134,VAR157,VAR157,VAR157,VAR164,VAR164,VAR164,VAR165,VAR165,VAR165,VAR166,VAR166,VAR166,VAR167,VAR167,VAR167,VAR169,VAR169,VAR169,VAR175,VAR175,VAR175,VAR178,VAR178,VAR178,VAR228,VAR228,VAR228,VAR78,VAR78,VAR78,VAR79,VAR79,VAR79
count,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0,15359.0
mean,94718.3937,94161.1335,92512.0336,68.2732,66.988,67.6927,0.4205,0.4243,0.4416,1.0103,0.8743,0.8528,-0.3474,-0.2465,-0.2285,285.0612,284.7675,283.983,12398208.8161,12375548.7599,12336392.7126,13715585.1388,13655192.0478,13378715.9572,14371885.9191,14390984.8888,14381453.0647,0.0009,0.0009,0.001,0.0358,0.0371,0.0384,0.0154,0.0155,0.0159
std,602.1085,602.4702,593.2916,19.735,19.8961,20.3269,0.398,0.3957,0.3976,2.8104,2.4464,2.1638,1.9077,1.9664,2.2252,7.0096,6.9243,6.7524,6563589.3376,6510942.5721,6464259.5407,7653794.9908,7643903.161,7541980.3139,6983524.3402,6975865.8447,6962384.7479,0.0034,0.0033,0.0034,0.1063,0.106,0.1079,0.0496,0.0487,0.0481
min,92587.8125,92047.375,90359.0,10.1523,10.1776,8.5254,0.0,0.0,0.0,-8.9086,-8.8352,-8.2788,-7.6361,-7.5002,-8.4042,270.2664,269.4363,269.5964,187820.375,194134.0,149878.0,865972.8125,838841.5625,821617.5625,657726.0625,709579.0,646523.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,94310.375,93751.7188,92105.4062,53.5192,51.7607,52.0255,0.008,0.0097,0.0153,-0.9151,-0.6259,-0.4029,-1.5657,-1.6192,-1.7715,279.871,279.5812,278.8286,7477069.5,7520687.5,7578839.5,7267721.0,7218057.0,7019813.0,9321300.0,9391159.0,9445564.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,94701.875,94149.625,92513.4375,71.0432,69.7245,70.3611,0.3154,0.3342,0.3693,0.8628,0.4722,0.435,-0.5147,-0.4115,-0.3975,284.1389,283.9165,283.075,11285847.0,11225345.0,11226525.0,13613786.0,13532295.0,13232039.0,13247678.0,13289360.0,13230410.0,0.0,0.0,0.0,0.0021,0.0025,0.003,0.0001,0.0001,0.0002
75%,95144.125,94587.8438,92930.6562,85.2612,84.013,84.9329,0.8592,0.8581,0.883,2.6371,2.2466,2.0376,0.81,0.9877,1.1027,289.5986,289.2996,288.4045,17357920.0,17292656.0,17088208.0,19817088.0,19762840.0,19424888.0,19746144.0,19837912.0,19761216.0,0.0002,0.0003,0.0003,0.0289,0.0311,0.0322,0.0066,0.007,0.0076
max,96317.125,95743.375,94034.625,101.7909,101.7601,102.8384,1.0,1.0,1.0,13.0567,12.1859,11.5511,8.9799,9.5909,10.711,310.0623,309.3418,308.5383,34185728.0,34257408.0,34426112.0,34175376.0,34015120.0,33640528.0,37201360.0,37450896.0,37722000.0,0.0473,0.0503,0.056,2.069,1.8669,1.8503,0.6681,0.7031,0.7346


In [27]:
from IPython.core.display import display

In [30]:
df_solar_meteo[col].value_counts().sort_values(ascending=False).iloc[:3]

0.0000    6108
0.0000      53
0.0000      46
Name: (3, VAR79), dtype: int64

In [34]:
for df in [df_solar_meteo, df_solar_power]:
    for col in df:
        tmpdf = pd.DataFrame(df_solar_meteo[col].value_counts().sort_values(ascending=False).iloc[:3])
        if tmpdf.iloc[0, 0] > 100:
            display(tmpdf)

Unnamed: 0_level_0,1
Unnamed: 0_level_1,VAR164
0.0,3094
1.0,569
0.0017,90


Unnamed: 0_level_0,2
Unnamed: 0_level_1,VAR164
0.0,2976
1.0,585
0.0021,102


Unnamed: 0_level_0,3
Unnamed: 0_level_1,VAR164
0.0,2722
1.0,616
0.0025,95


Unnamed: 0_level_0,1
Unnamed: 0_level_1,VAR228
0.0,10154
0.0001,22
0.0001,21


Unnamed: 0_level_0,2
Unnamed: 0_level_1,VAR228
0.0,9989
0.0005,20
0.0002,20


Unnamed: 0_level_0,3
Unnamed: 0_level_1,VAR228
0.0,9935
0.0032,18
0.0013,18


Unnamed: 0_level_0,1
Unnamed: 0_level_1,VAR78
0.0,4303
0.0001,99
0.0002,81


Unnamed: 0_level_0,2
Unnamed: 0_level_1,VAR78
0.0,4157
0.0001,71
0.0002,63


Unnamed: 0_level_0,3
Unnamed: 0_level_1,VAR78
0.0,4008
0.0001,107
0.0002,65


Unnamed: 0_level_0,1
Unnamed: 0_level_1,VAR79
0.0,6531
0.0,42
0.0,38


Unnamed: 0_level_0,2
Unnamed: 0_level_1,VAR79
0.0,6344
0.0,44
0.0,40


Unnamed: 0_level_0,3
Unnamed: 0_level_1,VAR79
0.0,6108
0.0,53
0.0,46


AttributeError: 'DataFrame' object has no attribute 'value_counts'

In [20]:
df_solar_power.value_counts()

AttributeError: 'DataFrame' object has no attribute 'value_counts'