## Explorative Datenanalyse und Wahrscheinlichkeitsrechnen

#### Import Packages

In [1]:
import config as cfg
import Import_Data_API as ImpData

import pandas as pd
from influxdb import DataFrameClient, InfluxDBClient
import pytz
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
import numpy as np
from fhnw_ds_hs2019_weatherstation_api import data_import as weather

c:\users\simon\appdata\local\programs\python\python37-32\lib\site-packages\numpy\.libs\libopenblas.2V74HQ3MKNZHDCKJELIPPY7V6QMK3UOZ.gfortran-win32.dll
c:\users\simon\appdata\local\programs\python\python37-32\lib\site-packages\numpy\.libs\libopenblas.FN5FF57TWHUYLRG54LA6B33EZPHYZZL4.gfortran-win32.dll
  stacklevel=1)


localhost:8086


In [2]:
client = cfg.client

#### Helper Functions

In [3]:
def rowindex_as_col(df, name_col="Time"):
    """Return a copy of ``df`` with its row index moved into a regular column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index (the timestamps in this notebook) should become a column.
    name_col : str, optional
        Name of the column that receives the former index values.
        Defaults to ``"Time"``.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh integer index and the former index inserted as
        first column. The frame passed in is left untouched.
    """
    # rename_axis operates on a copy, so the caller's index keeps its name;
    # assigning to ``df.index.name`` would rename the caller's index in place.
    return df.rename_axis(name_col).reset_index()

def appropriate_dtypes(df, column_name, result_dtype):
    """Return a copy of ``df`` in which ``column_name`` is cast to ``result_dtype``.

    All other columns keep their dtype; the frame passed in is not modified.
    """
    dtype_by_column = {column_name: result_dtype}
    return df.astype(dtype_by_column)
    

#### Sind Daten von Mythenquai und Tiefenbrunnen unterschiedlich?

In [None]:
df1, df2 = ImpData.select_timedelta(0,7)


In [None]:
df1.equals(df2)

In [None]:
df1.corrwith(df2)

Alle Daten ausser den Winddaten sind sich ziemlich ähnlich. Es wäre somit sinnvoll, die Winddaten für beide Stationen grafisch für die Segler darzustellen. Alle anderen Daten mit einer Korrelation von über 90% können von einer Station übernommen werden.

#### Query Data from InfluxDB

In [None]:
mythenquai_, tiefenbrunnen_ = ImpData.select_timedelta(0, 100)

#### Transform Data

In [None]:
# Cast the Tiefenbrunnen columns to compact integer dtypes. The Mythenquai
# frame gets the same two casts in the following cell; this cell previously
# cast Mythenquai humidity a second time and left Tiefenbrunnen humidity as is.
# NOTE(review): int8 assumes humidity is a percentage (0-100) and int16 assumes
# wind direction is in degrees - confirm against the data.
tiefenbrunnen_ = appropriate_dtypes(tiefenbrunnen_, "humidity", np.int8)
tiefenbrunnen_ = appropriate_dtypes(tiefenbrunnen_, "wind_direction", np.int16)

In [None]:
mythenquai_ = appropriate_dtypes(mythenquai_, "humidity", np.int8)
mythenquai_ = appropriate_dtypes(mythenquai_,"wind_direction", np.int16 )

### Exploratory Data Analysis

Aesthetische Einstellungen:

In [None]:
sns.set_style("darkgrid")
sns.set_palette("Paired")

In [None]:
round(tiefenbrunnen_.describe(),1)

In [None]:
round(mythenquai_.describe(),1)

Korrelationen Bereich Temperatur und Feuchte:

In [None]:
# Keep only the temperature/humidity related measurements by dropping the wind
# columns, the wind chill and the barometric pressure.
columns_without_temp_relevance = [
    "wind_direction",
    "wind_force_avg_10min",
    "wind_gust_max_10min",
    "wind_speed_avg_10min",
    "windchill",
    "barometric_pressure_qfe",
]
tiefenbrunnen_adapt_temp = rowindex_as_col(
    tiefenbrunnen_.drop(columns=columns_without_temp_relevance)
)
tiefenbrunnen_adapt_temp.head()
# Pairwise scatter plots with regression lines; KDE curves on the diagonal.
sns.pairplot(tiefenbrunnen_adapt_temp, kind="reg", diag_kind="kde")

#### Auffälligkeiten:
- Starke Negative Korrelation zwischen Luftfeuchte und Temperatur
- Bei 100% relativer Luftfeuchte sollte die Lufttemperatur eigentlich dem Taupunkt entsprechen. Erklärung?
- Es bestehen positive Korrelationen zwischen Lufttemperatur, Wassertemperatur und Taupunkt.
- Negative Korrelationen zwischen Lufttemperatur und Luftfeuchtigkeit.

#### Bemerkungen
- Barometrischer Druck entfernt, da schwache Korrelationen

Korrelationen Bereich Wind:

In [None]:
# Keep only the wind related measurements by dropping the temperature,
# humidity and pressure columns.
columns_without_wind_relevance = [
    "air_temperature",
    "barometric_pressure_qfe",
    "dew_point",
    "humidity",
    "water_temperature",
    "windchill",
]
tiefenbrunnen_adapt_wind = tiefenbrunnen_.drop(columns=columns_without_wind_relevance)
# Pairwise scatter plots with regression lines; KDE curves on the diagonal.
sns.pairplot(tiefenbrunnen_adapt_wind, kind="reg", diag_kind="kde")

#### Auffälligkeiten:
- Positive Korrelation zwischen Windrichtung, Windgeschwindigkeiten und Windstärke
- Starke positive Korrelation zwischen Windgeschwindigkeit und Windstärke

### Prognose

Als Referenz werden die Temperaturdaten aus dem Jahr 2018 genommen

In [None]:
# Historical measurements used as reference for the prediction.
# The first CSV column is used as index and parsed as timestamps while loading,
# so date-based slicing of df_prediction does not depend on a later cell
# converting the index.
path_2_file = "../influxdb-1.7.8-1/data/messwerte_mythenquai_2007-2018.csv"
df_prediction = pd.read_csv(path_2_file, index_col=0, parse_dates=True)

In [None]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
df_pred = df_prediction
# Because of the aliasing above, converting the index here also converts the
# index of df_prediction, which later cells slice by date.
df_pred.index = pd.to_datetime(df_pred.index)

# Restrict the reference data to the calendar year 2018 (label slice on the index).
df_pred = df_pred.loc["2018-01-01":"2018-12-31"]

# Number of rows in the 2018 reference data.
len(df_pred)

#### Verfahren mit Kolmogorov-Smirnov-Test:

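Die Referenzdaten werden in Gruppen von Tagen zusammengefasst (im Code unten: das Jahr 2018, tageweise gruppiert). Die Verteilung der Messwerte
des aktuellen, vollständigen Tages wird mit jeder dieser Gruppen verglichen; die am besten passende Gruppe liefert die Maximaltemperatur für den nachfolgenden Tag.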

In [None]:
def get_values_in_grouped_days(df, column, group_string, group_int):
    """Build the reference table used to predict the next day's maximum.

    The selected column is split into consecutive groups of days. Each group's
    measurements are stored as a list and paired with the daily maximum of the
    first day *after* that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements indexed by timestamps (or by values ``pd.to_datetime``
        can parse). The frame itself is not modified.
    column : int
        Positional index of the column to use.
    group_string : str
        Resampling rule for the groups, e.g. ``"D"`` or ``"3D"``.
    group_int : int
        Number of days per group; must describe the same span as
        ``group_string`` (``"3D"`` -> ``3``).

    Returns
    -------
    pandas.DataFrame
        Indexed by the start of each group, with the columns
        ``"grouped_values"`` (list of measurements in the group) and
        ``"Temp_next_day"`` (maximum of the day following the group).
        Groups without a following day, or whose following day has no
        maximum, are dropped.
    """
    # Work on a copy so the caller's index is not overwritten.
    values = df.iloc[:, column].copy()
    values.index = pd.to_datetime(values.index)

    # One list of raw measurements per group of days.
    grouped_values = values.resample(group_string).aggregate(lambda chunk: chunk.tolist())

    # Daily maxima, thinned to the first day of every group. Shifting ONLY this
    # series by -1 puts the maximum of the day after a group onto that group's
    # row. (Shifting both columns together would pair every value list with the
    # maximum of its own first day instead of the next day.)
    daily_max = values.resample("D").max()
    next_day_max = daily_max.iloc[::group_int].shift(-1)

    new_df = pd.concat([grouped_values, next_day_max], axis=1).dropna()
    new_df.columns = ["grouped_values", "Temp_next_day"]

    return new_df
     

In [None]:
sample_df = get_values_in_grouped_days(df_pred, 0, "1D", 1)
sample_df.head()

In [None]:
# Evaluation data: first column of the 2010-2017 measurements, kept as a
# one-column DataFrame.
df_test = df_prediction.loc["2010-01-01":"2017-12-31"].iloc[:, 0].to_frame()

# Two-day excerpt (11-12 January 2018) of all columns.
df_test2 = df_prediction.loc["2018-01-11":"2018-01-12"]

### Vorhersage mit KS-Test

In [None]:
from scipy import stats as spstats

def prediction_ks_test(df_for_test, sample_df):
    """Pick the next-day temperature of the reference row that best matches the given data.

    The first column of ``df_for_test`` is compared with every value list in the
    first column of ``sample_df`` using SciPy's two-sample Kolmogorov-Smirnov
    test. The row with the highest p-value wins; on ties the earliest row is kept.

    Input: df_for_test: dataframe with the measurements to compare /
    sample_df: reference dataframe with the value lists in column 0 and the
    associated temperature in column 1
    Returns: column-1 value of the best matching row, or 0 if no row yields a
    p-value above 0 (for example when ``sample_df`` has no rows)
    """
    observed_values = df_for_test.iloc[:, 0]
    reference_groups = sample_df.iloc[:, 0]
    candidate_temps = sample_df.iloc[:, 1].values

    best_p_value = 0
    best_temp = 0

    for reference_values, candidate_temp in zip(reference_groups, candidate_temps):
        # ks_2samp returns (statistic, p-value); only the p-value is used.
        p_value = spstats.ks_2samp(observed_values, reference_values)[1]

        if p_value > best_p_value:
            best_p_value = p_value
            best_temp = candidate_temp

    return best_temp
     

#### Performance Kolmogorov-Smirnov:
Die durchschnittliche absolute Abweichung zwischen vorhergesagter und gemessener Maximaltemperatur über ein Testjahr.


In [None]:
def prediction_test_KS(df_test, sample_df, days_used_for_prediction, start_date_str="2015-01-01"):
    """Back-test the KS-based prediction over one simulated test year.

    For every day offset from ``start_date_str`` the measurements of the
    preceding ``days_used_for_prediction`` days are passed to
    ``prediction_ks_test``. The predicted value is compared with the maximum
    of the first column of ``df_test`` on the target day.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Measurements with a ``DatetimeIndex``; only the first column is evaluated.
    sample_df : pandas.DataFrame
        Reference table handed to ``prediction_ks_test``.
    days_used_for_prediction : int
        Number of days before the target day that form the prediction window.
    start_date_str : str, optional
        First day of the test year as ``YYYY-MM-DD``. Defaults to ``"2015-01-01"``.

    Returns
    -------
    numpy.ndarray
        Absolute deviation between measured and predicted maximum for every
        evaluated day. Days for which either the target day or the prediction
        window contains no data are skipped. The mean deviation is printed.
    """
    start_date = datetime.datetime.strptime(start_date_str, "%Y-%m-%d")
    # Number of distinct day-of-year values present in the data (at most 366).
    days_in_df_test = len(df_test.index.dayofyear.unique())

    deviations = []  # collected in a list; np.append in a loop copies the array every time

    for day in range(days_used_for_prediction, days_in_df_test):
        target_start = start_date + datetime.timedelta(days=day)
        target_end = start_date + datetime.timedelta(days=day + 1)
        window_start = start_date + datetime.timedelta(days=day - days_used_for_prediction)

        # NOTE: label slices on a DatetimeIndex include both end points.
        df_ref = df_test.loc[target_start:target_end]
        KS_df_test = df_test.loc[window_start:target_start]

        # Without data for the target day or the prediction window there is
        # nothing to compare. Skip the day instead of reusing the previous
        # day's values (or failing on the very first iteration).
        if df_ref.empty or KS_df_test.empty:
            continue

        # Scalar maximum of the evaluated column; np.max on a whole DataFrame
        # returns a Series or a scalar depending on the pandas version.
        actual_max = df_ref.iloc[:, 0].max()
        predicted_max = prediction_ks_test(KS_df_test, sample_df)

        deviations.append(abs(actual_max - predicted_max))  # absolute difference to real value

    test_results = np.array(deviations, dtype=float)
    test_result = np.mean(test_results) if test_results.size else float("nan")

    print("The average deviation of a year: {} K".format(test_result))

    return test_results


In [None]:
test_results = prediction_test_KS(df_test, sample_df, 1)

In [None]:
print(len(test_results))
sns.distplot(test_results, kde = False)
plt.show()