In [1]:
import helper

import difflib 
import random

import seaborn as sns
import numpy as np
import pandas as pd
import collections
import matplotlib.pyplot as plt

from scipy.stats.stats import pearsonr  
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
def create_dataset(data):
    """Split every row of ``data`` into its leading values and its last value.

    ``data`` is indexed positionally (``data[0]`` .. ``data[len(data) - 1]``).

    Returns
    -------
    tuple of numpy.ndarray
        First array: each row without its final element.
        Second array: each row's final element wrapped in a one-item list,
        so non-empty input yields a trailing axis of length 1.
    """
    rows = [data[idx] for idx in range(len(data))]
    features = [row[:-1] for row in rows]
    targets = [[row[-1]] for row in rows]
    return np.array(features), np.array(targets)

def generate_mape_plot(dataset, save=False,
                       title_name="Power Prediction Error - East Roof Dataset"):
    """Draw one box per correlation class from the model's prediction results.

    For each of the classes 'strong', 'moderate' and 'low' (in that order)
    the rows of ``dataset`` whose 'correlation' value equals the class are
    selected, every column whose name does not start with 'panel' is dropped,
    and the remaining frame is passed to ``model.model_predict``. The first
    element of each returned pair becomes one box of the plot.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a 'correlation' column plus the 'panel*' columns.
    save : bool, optional
        When truthy, the figure is also written to ``title_name + '.jpg'``
        in the current working directory.
    title_name : str, optional
        Figure title; it is also the stem of the exported file name, so pass
        a different value to avoid overwriting an earlier export.

    Notes
    -----
    Relies on the notebook-level ``model`` object being defined before the
    call. The y axis is labelled 'MAPE(%)' with fixed ticks from 0 to 40;
    presumably ``model_predict`` returns MAPE values -- TODO confirm in helper.
    """
    results = []
    for corr in ('strong', 'moderate', 'low'):
        subset = dataset[dataset['correlation'] == corr]
        non_panel_cols = [col for col in subset.columns if col[:5] != 'panel']
        res, _ = model.model_predict(subset.drop(non_panel_cols, axis=1))
        results.append(res)

    # Draw on the explicit axes rather than through the pyplot state machine.
    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(1, 1, 1)
    ax.boxplot(results, patch_artist=False, showmeans=True)

    ax.set_title(title_name, fontsize=20)
    ax.xaxis.set_ticklabels(["Strong", "Moderate", "Low"], fontsize=15)
    ax.set_xlabel('Correlation', fontsize=20)

    # Fixed 0-40 grid in steps of 5 for the error axis.
    power_y = [0, 5, 10, 15, 20, 25, 30, 35, 40]
    ax.set_yticks(power_y)
    ax.yaxis.set_ticklabels(power_y, fontsize=15)
    ax.set_ylabel('MAPE(%)', fontsize=20)

    if save:
        fig.savefig(title_name + '.jpg', format='jpg', dpi=500, bbox_inches='tight')
        
def make_plot(results, save=False, labels=None, xlabel='Number of panels',
              title_name="Power Prediction Error - East Roof Dataset"):
    """Draw a box plot of precomputed results, one box per entry.

    Parameters
    ----------
    results : sequence
        One collection of values per box, passed straight to ``boxplot``.
    save : bool, optional
        When truthy, the figure is also written to ``title_name + '.jpg'``
        in the current working directory.
    labels : sequence of str, optional
        Tick label for each box. Defaults to the five panel-count labels
        '2 panels' .. '10 panels'; supply your own when ``results`` holds a
        different number of boxes.
    xlabel : str, optional
        X-axis label. The previous hard-coded 'Correlation' did not match the
        panel-count tick labels, so the default now names what the ticks show.
    title_name : str, optional
        Figure title; it is also the stem of the exported file name, so pass
        a different value to avoid overwriting an earlier export.
    """
    if labels is None:
        labels = ["2 panels", "4 panels", "6 panels", '8 panels', '10 panels']

    # Draw on the explicit axes rather than through the pyplot state machine.
    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(1, 1, 1)
    ax.boxplot(results, patch_artist=False, showmeans=True)

    ax.set_title(title_name, fontsize=20)
    ax.xaxis.set_ticklabels(labels, fontsize=15)
    ax.set_xlabel(xlabel, fontsize=20)

    # Y ticks are left to matplotlib's automatic choice here.
    ax.set_ylabel('MAPE(%)', fontsize=20)

    if save:
        fig.savefig(title_name + '.jpg', format='jpg', dpi=500, bbox_inches='tight')
    
        
def cond_filter(dataset, target_condition):
    """Collect the panel readings of days whose dominant weather is wanted.

    A day is kept when it has more than 5 rows and the most frequent value of
    its 'weather_icon' column is contained in ``target_condition``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs the columns 'date', 'weather_icon' and 'correlation', plus the
        reading columns whose names start with 'panel'.
    target_condition : container
        Accepted 'weather_icon' values (anything supporting ``in``).

    Returns
    -------
    (pandas.DataFrame, dict)
        * The 'panel*' values of all kept days stacked row-wise, with
          positional integer column labels. Days appear in the order in which
          they first occur in ``dataset``. If no day qualifies, the frame has
          zero rows and one column per panel column.
        * ``{'strong': n, 'moderate': m}``: number of kept days whose first
          row has correlation 'strong', and number of all other kept days
          (any non-'strong' value is counted as 'moderate').

    Notes
    -----
    Earlier versions seeded the stack with a row of nine zeros that was never
    removed, so the result started with a spurious all-zero row and only
    worked for exactly nine panel columns. That placeholder is gone.
    """
    panel_cols = [col for col in dataset.columns if col[:5] == 'panel']
    corr_dict = {'strong': 0, 'moderate': 0}
    kept_blocks = []

    # sort=False keeps the days in first-appearance order, which makes the
    # row order of the result reproducible between runs.
    for _, day_rows in dataset.groupby('date', sort=False):
        if len(day_rows) <= 5:
            continue

        # Ties are resolved in favour of the value seen first that day.
        dominant_weather = collections.Counter(day_rows['weather_icon']).most_common(1)[0][0]
        if dominant_weather not in target_condition:
            continue

        # The day's class is read from its first row only.
        if day_rows['correlation'].values[0] == 'strong':
            corr_dict['strong'] += 1
        else:
            corr_dict['moderate'] += 1

        kept_blocks.append(day_rows[panel_cols].values)

    # Stack once at the end instead of growing an array inside the loop.
    if kept_blocks:
        stacked = np.vstack(kept_blocks)
    else:
        stacked = np.empty((0, len(panel_cols)))

    return pd.DataFrame(stacked), corr_dict

In [3]:
# Load 'lower_data_clean.csv', discard incomplete rows, then discard the rows
# whose correlation is 'low'. ('east_data_clean.csv' and 'west_data_clean.csv'
# were previously loaded with the same two steps.)
dataset = pd.read_csv('lower_data_clean.csv').dropna()
dataset = dataset.loc[dataset['correlation'] != 'low']

# Keep only the columns whose name starts with 'panel'; this also removes
# 'correlation' and any other non-panel column from `dataset`.
col_drop = [name for name in dataset.columns if name[:5] != 'panel']
dataset = dataset.drop(columns=col_drop)

# Helper objects from the project-local `helper` module, used by later cells.
visual = helper.visualizer()
model = helper.modeling()
process = helper.data_processing()

In [4]:
# Accumulator for the `res` values produced by model runs; a later cell
# appends to it.
all_results = []

In [5]:
# NOTE(review): LinearRegression is imported here but not referenced anywhere
# in this cell -- confirm whether it is still needed.
from sklearn.linear_model import LinearRegression

# Run the helper's prediction in "Naive" mode on the panel-only dataset and
# keep the first element of the returned pair.
# NOTE(review): the recorded output of this cell is
# "NameError: name 'lr_model' is not defined"; `lr_model` does not appear in
# this notebook, so the error presumably originates inside `helper` -- confirm.
res,_ = model.model_predict(dataset,"Naive")
res

NameError: name 'lr_model' is not defined

In [6]:
# NOTE(review): `res` is assigned by the previous cell, whose recorded run
# ended in a NameError; on a fresh kernel this append would therefore fail,
# or it silently appends a value left over from an earlier run.
all_results.append(res)

In [27]:
# NOTE(review): `x_test` is not defined in any cell visible here, and this
# cell's execution count (27) is out of sequence with the cells above --
# presumably it relies on kernel state from cells that are no longer shown.
x_test

[1.9096,
 2.5032,
 2.7521499999999994,
 2.9423,
 3.1421,
 3.4539,
 3.8357666666999997,
 4.0394333333,
 4.9393,
 5.512400000000001,
 6.7905,
 7.531499999999999,
 8.498899999999999,
 9.4495,
 9.809000000000001,
 10.8417,
 12.2647,
 14.285100000000003,
 16.2748,
 16.8303,
 18.1478,
 23.4697,
 30.604400000000005,
 43.8159,
 45.604299999999995,
 44.7993,
 40.4687,
 44.5069,
 47.02419999999999,
 49.4929,
 77.40119999999999,
 68.8683,
 69.68120000000002,
 75.3058,
 90.9511,
 166.12410000000003,
 170.4538,
 174.2151,
 177.4137,
 180.5559,
 183.2246,
 185.8991,
 194.0475,
 204.2434,
 205.0491,
 205.47089999999997,
 208.1035,
 210.9478,
 212.8163,
 214.8917,
 216.8822,
 215.0226,
 219.38209999999998,
 226.34679999999997,
 226.78339999999997,
 228.2949,
 231.4299,
 234.8558,
 219.2625,
 243.0799,
 247.05519999999996,
 232.48719999999997,
 231.28579999999997,
 167.1083,
 168.75310000000002,
 257.5811,
 150.1651,
 137.11219999999997,
 164.79489999999998,
 176.4899,
 150.23200000000003,
 157.5677999