In [3]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import *
import sklearn.linear_model
import sklearn.preprocessing
import os
import statistics
from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.ion()

def pearson(X,Y):
    try:
        mX = sum(X)/len(X) # mean of X and Y lists
        mY = sum(Y)/len(Y)
    
        cov = sum((a - mX) * (b - mY) for (a,b) in zip(X,Y)) / len(X) # covariance between X and Y
    
        stdevX = (sum((a - mX)**2 for a in X)/len(X))**0.5 # standard deviation of X and Y
        stdevY = (sum((a - mY)**2 for a in Y)/len(Y))**0.5
    
        return round(cov/(stdevX*stdevY),3) # pearson correlation calculation
    except:
        return 0

def calculate_days(date1,date2):
    return pd.Timedelta((date2 - date1)).days

def find_bestPearson(x,y,days_x,days_y,cp,nb_days,max_cp): # param : x list and y list to find best Pearson correlation, days which corresponds to x and y lists, and cp the initial pearson found. nb_days is the INOUT parameter, which is initialized at 0 in first call, and stands for the gap of days between chinese and selected EU markets
    if len(x)==0 or len(y)==0:
        return nb_days
    else:
        if days_x[0] == days_y[0]:
            y = y[1:] # tail of EU prices
            new_cp = abs(pearson(x,y))
            head, *tail = days_y
            if new_cp > cp:
                head2, *tail2 = tail
                return find_bestPearson(x,y,days_x,tail,new_cp,nb_days+calculate_days(head[0],head2[0]),new_cp)
            else:
                return find_bestPearson(x,y,days_x,tail,new_cp,nb_days,cp)
        else:
            while days_x[0] < days_y[0]: # cn data always begins before or at the same time as EU data
                head, *tail = days_x
                x = x[1:]
                days_x = tail
            new_cp = abs(pearson(x,y))
            if new_cp > cp:
                return find_bestPearson(x,y,days_x,days_y,new_cp,nb_days,new_cp)
            else:
                return find_bestPearson(x,y,days_x,days_y,new_cp,nb_days,cp)

def apprendre_skin(filename):
    print(filename + " processing...")
    dataset = pd.read_csv(filename)
    print(dataset)
    try:

        dataset["prix"] = dataset["prix"].astype(float)
        dataset["date"] = pd.to_datetime(dataset["date"])

        # analyses sommaires
        print(dataset.shape)

        # on consulte les types des données:
        print(dataset.dtypes)

        # aperçu des stats de chaque colonne
        pd.set_option('precision', 3)
        print(dataset.describe())
    
        # histograms
        dataset.hist(bins=10,figsize=(15,10),grid=False)
        plt.show()

        # Setting X and Y axis
        x = dataset['date'].values
        x = x.reshape(-1,1)
        y = dataset['prix'].values
    
        plt.title("Variation des prix de l'item en fonction des dates")
        plt.xlabel("Date")
        plt.ylabel("Prix")
        plt.plot(x,y)
        plt.show()
    
        # Train / Validation / Test split (60%/20%/20%)
        X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=7)
        X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.25, random_state=8) # 0.25 * 0.8 = 0.2

        alpha_values = np.linspace(dataset['prix'].min(), dataset['prix'].max()+1, len(dataset.index)) # tous les datasets font max 180 lignes (= 6 mois de données) donc on peut se permettre une telle simplification
    
    
        # input standardization
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(X_train)

        X_train = scaler.transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)

        scores = []
        for alpha in alpha_values:
            # alpha est fixé (comme mu=mu0)
            monModele = sklearn.linear_model.Ridge(alpha=alpha)
    
            # je converge theta vers theta*
            monModele.fit(X_train, Y_train)
    
            score = monModele.score(X_val, Y_val)
            scores.append(score)

        alpha_etoile = np.argmax(np.array(scores))

        # Fitting with fixed hyper-parameter
        monModele = sklearn.linear_model.Ridge(alpha=alpha_etoile)
        monModele.fit(X_train, Y_train)
        print("Comparaison des scores entre training, validation et test : ", monModele.score(X_train, Y_train), monModele.score(X_val, Y_val), monModele.score(X_test, Y_test))
    
        # Predictions
        Y_train_pred = monModele.predict(X_train)
        print("train error", sklearn.metrics.mean_squared_error(Y_train_pred, Y_train))

        Y_test_pred = monModele.predict(X_test)
        mse = sklearn.metrics.mean_squared_error(Y_test_pred, Y_test)
        print("test error", mse)
    
        # R^2 calculation
        print("R^2 (best possible value is 1.0) :", sklearn.metrics.r2_score(Y_test,Y_test_pred))
    
        # MAE calculation
        mae = sklearn.metrics.mean_absolute_error(Y_test,Y_test_pred)
        print("The mean absolute error is: {:.2f}".format(mae))
    
        # ----- Cross-validation -----
        # kfold_validation=KFold(5)

        # results=cross_val_score(monModele,x,y,cv=kfold_validation)
        # print(results)
        # print(np.mean(results))

    
        plt.scatter(x=range(0,Y_test.size), y=Y_test, marker='x', label='Actual')
        plt.scatter(x=range(0,Y_test_pred.size) , y=Y_test_pred, marker='+', label='Predicted')
        plt.xlabel('Price index in dataset')
        plt.ylabel('Price to guess/guessed')
        plt.legend()
        plt.title("Actual and predicted values")
        plt.show()
    
        # Visualization with MAE on test values
        plt.scatter(X_train, Y_train, marker='x', label='train')
        plt.scatter(X_test, Y_test, marker='+', label='test')
    
        plt.legend()
        plt.title("Linear regression with MAE visualization on test values")
        plt.xlabel('Date')
        plt.ylabel('Prix')
        plt.errorbar(X_test,Y_test_pred,mae)
        plt.plot(X_test, Y_test_pred, color = "black")
        plt.show()

    
        # Visualization with MSE on test values
        plt.scatter(X_train, Y_train, marker='x', label='train')
        plt.scatter(X_test, Y_test, marker='+', label='test')
    
        plt.legend()
        plt.title("Linear regression with MSE visualization on test values")
        plt.xlabel('Date')
        plt.ylabel('Prix')
        plt.errorbar(X_test,Y_test_pred,mse)
        plt.plot(X_test, Y_test_pred, color = "black")
        plt.show()
    
        return x,y
    except:
        print("Il n'y a aucun prix pour", filename)
        return np.empty(0),np.empty(0,dtype='datetime64[ns]')

os.rmdir('prices/.ipynb_checkpoints')
dirct_cn = os.listdir('prices')
dirct_cn.sort()

os.rmdir('merged_prices/.ipynb_checkpoints')
dirct_eu = os.listdir('merged_prices')
dirct_eu.sort()

for csv_cn in dirct_cn:
    x_cn,y_cn = apprendre_skin("prices/"+csv_cn)
    x_eu,y_eu = apprendre_skin("merged_prices/"+dirct_eu[0])
    dirct_eu = dirct_eu[1:]
    
    corr = pearson(y_cn,y_eu) # pearson correlation
    
    print("Corrélation de Pearson entre le marché chinois et le marché EU pour", csv_cn, ":", corr)
#    nb_days = find_bestPearson(y_cn.tolist(),y_eu.tolist(),x_cn,x_eu,result_eu,0,0)
    
#    print("Chinese prices are in advance of", nb_days, "EU prices.")

OSError: [Errno 39] Directory not empty: 'prices/.ipynb_checkpoints'