In [None]:
%load_ext autoreload
import os.path
import sys
# `display`/`HTML` live in the public `IPython.display` module; importing `display`
# from `IPython.core.display` is deprecated since IPython 7.14 and fails on later releases.
from IPython.display import display, HTML
# Widen the notebook container to 90% of the browser window.
HTML("<style>.container { width:90% !important; }</style>")

In [None]:
def get_project_dir(project):
    """Return the directory of ``project`` and make it importable.

    The absolute path of the current working directory is searched for the
    deepest directory whose name equals ``project``; the path up to and
    including that directory is the project directory.

    Parameters
    ----------
    project : str
        Name of the project directory (e.g. ``'project_2'``). Only whole path
        components match, so ``'project_2'`` does not match ``'project_22'``.

    Returns
    -------
    str
        Absolute project directory, always ending with a path separator so
        that ``project_dir + 'data/file.csv'`` is a valid path. If ``project``
        is not part of the current path a ``UserWarning`` is emitted and the
        current working directory is returned instead.

    Side effects
    ------------
    Appends the returned directory to ``sys.path`` unless it is already there,
    so re-running the cell does not pile up duplicate entries.
    """
    import warnings  # local import keeps this cell self-contained

    # A trailing separator on both haystack and needle restricts matches to
    # complete path components and covers the case cwd == project directory.
    current_dir = os.path.join(os.path.abspath('./'), '')
    needle = os.sep + project.strip(os.sep) + os.sep
    position = current_dir.rfind(needle)
    if position == -1:
        warnings.warn(
            "'{}' is not part of the current path '{}'; "
            "falling back to the current directory".format(project, current_dir)
        )
        project_dir = current_dir
    else:
        project_dir = current_dir[:position + len(needle)]
    if project_dir not in sys.path:
        sys.path.append(project_dir)
    return project_dir

# Put both directories on sys.path; only the 'project_2' path is kept, since it is
# used below to build the data file paths.
get_project_dir('ml_library')
project_dir = get_project_dir('project_2')

In [None]:
import random
import numpy as np
import scipy as sp
import pandas as pd
from sklearn import metrics, preprocessing, linear_model

import matplotlib.pyplot as plt
import seaborn as sns

import numba as nb
# from itertools import combinations
# Show up to 40 columns when a DataFrame is rendered.
pd.options.display.max_columns = 40

In [None]:
%autoreload
from src import ML

In [None]:
# import
# os.path.join keeps both paths valid whether or not project_dir ends with a separator.
# The first CSV column becomes the index of each frame.
data = pd.read_csv(os.path.join(project_dir, 'data', 'TrainingSet.csv'), index_col=0)
submissions = pd.read_csv(os.path.join(project_dir, 'out', 'SubmissionRows.csv'), index_col=0)

In [None]:
# Flag submission rows: 1 where the row's index label also appears in the
# index of `submissions`, 0 everywhere else.
data['is_output'] = data.index.isin(submissions.index).astype('int64')

In [None]:
def get_years(start=1972, stop=2007):
    """Build the data-frame column labels for a set of years.

    Each label has the form ``"<year> [YR<year>]"``, e.g. ``"1972 [YR1972]"``.

    Parameters
    ----------
    start : int or list/tuple/range of int
        First year of the range, or an explicit collection of years (in which
        case ``stop`` is ignored).
    stop : int or None
        Last year of the range, inclusive. A falsy value (``None``, ``0``)
        selects the single year ``start``.

    Returns
    -------
    list of str
        One column label per year, in order.

    Notes
    -----
    Deliberately not JIT-compiled: string formatting and the ``isinstance``
    dispatch cannot be compiled in Numba's nopython mode, and the function is
    far too cheap to benefit from compilation anyway.
    """
    if isinstance(start, (list, tuple, range)):
        years = start
    elif stop:
        years = range(start, stop + 1)
    else:
        years = [start]
    return ["{0} [YR{0}]".format(year) for year in years]

In [None]:
# Keep only the rows whose 'Country Name' equals the selected country.
country = 'Germany'
country_data = data.loc[data['Country Name'].eq(country)]

In [None]:
# country_data[get_years()].T.plot()
# plt.show()

In [None]:
# show example graph: row 94948 on the x axis against row 94946 on the y axis,
# one point per year column.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.pointplot(x=country_data.loc[94948, get_years()],
              y=country_data.loc[94946, get_years()])
plt.show()

- A linear regression between the two variables shows how well one variable predicts the other.
- Here r_squared ≈ 0, so a correlation is unlikely.
- However, we first have to remove the trend; otherwise spurious relations appear.


In [None]:
# Linear regression of row 94946 (y) on row 94948 (x) over the year columns;
# both rows are cast to float arrays first.
sp.stats.linregress(
    *(np.array(country_data.loc[row_id, get_years()].astype(float))
      for row_id in (94948, 94946))
)

In [None]:
class Wave:
    """Thin wrapper around a sequence of samples, exposed as ``ys``."""

    def __init__(self, y):
        # Stored as given: no copy and no type conversion.
        self.ys = y
    
    def __len__(self):
        """Return the number of samples, i.e. ``len(self.ys)``."""
        return len(self.ys)

def serial_corr(wave, lag=1):
    """Correlate a signal with a copy of itself shifted by ``lag`` samples.

    Parameters
    ----------
    wave : object with ``ys`` and ``__len__``
        Signal container; ``wave.ys`` must support slicing.
    lag : int
        Shift in samples. ``lag=0`` correlates the signal with itself.

    Returns
    -------
    float
        Pearson correlation coefficient between ``ys[lag:]`` and
        ``ys[:len(wave) - lag]``.
    """
    n = len(wave)
    shifted = wave.ys[lag:]
    reference = wave.ys[:n - lag]
    # `ddof` is not passed: for np.corrcoef it is deprecated, has no effect on
    # the result and only triggers a DeprecationWarning.
    return np.corrcoef(shifted, reference)[0, 1]

def autocorr(wave):
    """Serial correlation of ``wave`` for every lag below half its length.

    Returns a tuple ``(lags, corrs)``: ``lags`` is ``range(len(wave.ys) // 2)``
    and ``corrs`` is the list of ``serial_corr(wave, lag)`` values, one per lag.
    """
    half_length = len(wave.ys) // 2
    lags = range(half_length)
    corrs = []
    for lag in lags:
        corrs.append(serial_corr(wave, lag))
    return lags, corrs

In [None]:
# Wrap row 94947 (year columns only, cast to float) in a Wave.
# NOTE(review): the next cell rebinds `signal` to a synthetic sine wave, so under
# Run All this value is replaced before it is used.
signal = Wave(np.array(country_data.loc[94947,get_years()].astype(float)))

In [None]:
# Synthetic test signal: sine evaluated at 100 evenly spaced points on [0, 13].
signal = Wave(np.sin(np.linspace(0,13,100)))

In [None]:
# Plot the serial correlation of `signal` (y) against the lag (x).
# Relies on the Wave-based autocorr(wave), which returns (lags, corrs).
fig, ax = plt.subplots()
x, y = autocorr(signal)
ax.plot(x,y)
plt.show()

In [None]:
# Inspect the correlation values (`y`) returned by the preceding autocorr(signal) call.
y

In [None]:
fig, ax = plt.subplots()
y = np.correlate(signal.ys, signal.ys, mode='same')
# ax.plot(y[len(y)//2:] / range(len(y)//2+1, 1, -1) / y[len(y)//2])
# In 'same' mode the zero-lag term sits at index len(y)//2 and sums len(y) products;
# every further lag sums one product fewer, so lag k is divided by len(y) - k.
ax.plot(y[len(y)//2:] / range(len(y), len(y)//2, -1))
plt.show()

In [None]:
def autocorr(x):
    """Normalized autocorrelation of a 1-D sequence for non-negative lags.

    ``x`` is converted to a float array and correlated with itself
    (``np.correlate`` in ``'same'`` mode). Only the half starting at lag 0 is
    kept. Each lag is divided by the number of products in its sum, because
    np.correlate returns plain sums and larger lags have fewer terms, and the
    result is scaled so that lag 0 equals 1.

    Returns a float array with ``len(x) - len(x) // 2`` entries.
    """
    samples = np.array(x).astype(float)
    correlation = np.correlate(samples, samples, mode='same')
    count = len(correlation)
    # lag 0 sits in the middle of the 'same' output
    non_negative_lags = correlation[count // 2:]
    zero_lag = non_negative_lags[0]
    # number of overlapping samples per lag: count, count - 1, ...
    overlap = np.array(range(count, count // 2, -1))
    return non_negative_lags / (overlap * zero_lag / count)

def plot(x, *args, **kwargs):
    """Draw ``x`` on a fresh figure and show it.

    All extra positional and keyword arguments are forwarded to ``Axes.plot``.
    """
    axes = plt.subplots()[1]
    axes.plot(x, *args, **kwargs)
    plt.show()

In [None]:
# signal = range(0,100,2)
# autocorr(a)
# At this point `autocorr` is the array-based definition: it expects the raw
# samples (not the Wave wrapper) and returns a single array of normalized
# correlations, so there is nothing to unpack.
ac = autocorr(signal.ys)
plot(ac, '.')