In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import *
import sklearn.linear_model
import sklearn.preprocessing
import os
import statistics
from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.ion()

def remove_nonsense_data(array, dataset, threshold_pct=20.0, column="prix"):
    """Drop the rows holding an outlier maximum price from ``dataset``.

    "Nonsense" data here means an item listed at a price far above every
    other one. The highest price is treated as such when it exceeds the
    second highest distinct price by more than ``threshold_pct`` percent of
    itself.

    Args:
        array: 1-D array-like of the prices found in ``dataset[column]``.
        dataset: DataFrame to filter; it is not modified in place.
        threshold_pct: Relative gap, in percent of the maximum, above which
            the maximum is considered an outlier.
        column: Name of the price column in ``dataset``.

    Returns:
        A DataFrame without the rows whose price equals the maximum when that
        maximum is an outlier; otherwise ``dataset`` itself, unchanged.
    """
    prices = np.asarray(array, dtype=float)
    # NaN would poison max() and every comparison below, so ignore them.
    prices = prices[~np.isnan(prices)]
    if prices.size == 0:
        return dataset

    maximum = prices.max()
    lower_prices = prices[prices < maximum]
    # With a single distinct price there is no second maximum to compare to;
    # a non-positive maximum would make the relative gap meaningless.
    if lower_prices.size == 0 or maximum <= 0:
        return dataset

    second_max = lower_prices.max()
    percentage_diff = (maximum - second_max) / maximum * 100
    if percentage_diff > threshold_pct:
        dataset = dataset.drop(dataset.index[dataset[column] == maximum])

    return dataset

def apprendre_skin(filename):
    """Fit and evaluate a Ridge regression of price against date for one CSV.

    The CSV must provide a ``date`` column (parsed with ``pd.to_datetime``)
    and a ``prix`` column (cast to float). The function prints the dataset
    shape, dtypes, summary statistics, scores and mean squared errors, and
    draws three figures (column histograms, validation score per alpha, final
    fit). It returns nothing.

    Args:
        filename: Path of the CSV file to process.
    """
    # Imported locally so the function does not rely on ``sklearn.metrics``
    # having been loaded as a side effect of another sklearn import.
    from sklearn.metrics import mean_squared_error

    print(filename + " processing...")
    dataset = pd.read_csv(filename)

    dataset["prix"] = dataset["prix"].astype(float)
    dataset["date"] = pd.to_datetime(dataset["date"])

    # Quick overview: shape, then column types.
    print(dataset.shape)
    print(dataset.dtypes)

    # Removing non-sense data
    dataset = remove_nonsense_data(dataset['prix'].values, dataset)

    # Summary statistics of each column. The fully qualified option name is
    # required: the bare 'precision' pattern is ambiguous on recent pandas.
    pd.set_option('display.precision', 3)
    print(dataset.describe())

    # histograms
    dataset.hist(bins=10, figsize=(15, 10), grid=False)
    plt.show()

    # Feature: seconds elapsed since the first observation. Converting the
    # timestamps to plain floats avoids feeding datetime64 values to the
    # scaler; the standardization below removes the choice of origin and unit.
    elapsed = (dataset['date'] - dataset['date'].min()).dt.total_seconds()
    x = elapsed.values.reshape(-1, 1)
    y = dataset['prix'].values

    # Train / Validation / Test split (60%/20%/20%)
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=0.2, random_state=7)
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train, Y_train, test_size=0.25, random_state=8)  # 0.25 x 0.8 = 0.2

    # Candidate regularization strengths: one per row, spread between the
    # lowest and highest price. Datasets are small, so the grid stays cheap.
    alpha_values = np.linspace(
        dataset['prix'].min(), dataset['prix'].max(), len(dataset))

    # Input standardization, fitted on the training set only.
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Score every candidate alpha on the validation set.
    scores = []
    for alpha in alpha_values:
        candidate = sklearn.linear_model.Ridge(alpha=alpha)
        candidate.fit(X_train, Y_train)
        scores.append(candidate.score(X_val, Y_val))

    plt.plot(alpha_values, scores)
    plt.show()

    # Keep the alpha VALUE with the best validation score (argmax alone only
    # yields its position in the grid).
    alpha_etoile = alpha_values[np.argmax(scores)]

    # Fitting with fixed hyper-parameter
    monModele = sklearn.linear_model.Ridge(alpha=alpha_etoile)
    monModele.fit(X_train, Y_train)
    print("Comparaison des scores entre training, validation et test : ", monModele.score(X_train, Y_train), monModele.score(X_val, Y_val), monModele.score(X_test, Y_test))

    Y_train_pred = monModele.predict(X_train)
    print("train error", mean_squared_error(Y_train, Y_train_pred))

    Y_test_pred = monModele.predict(X_test)
    print("test error", mean_squared_error(Y_test, Y_test_pred))

    # NOTE: k-fold cross-validation was sketched here but is not enabled.

    # Final figure: observed prices against the standardized date, with the
    # fitted line drawn over the test points.
    plt.scatter(X_train, Y_train, marker='x', label='train')
    plt.scatter(X_test, Y_test, marker='+', label='test')

    plt.legend()
    plt.xlabel('date (standardisee)')
    plt.ylabel('prix')
    # Sort by x so the prediction is drawn as one clean left-to-right line.
    order = np.argsort(X_test[:, 0])
    plt.plot(X_test[order], Y_test_pred[order], color="black")
    plt.show()
    
# Process every CSV price history found in the 'prices' directory, in the
# order returned by os.listdir.
csv_names = [entry for entry in os.listdir('prices') if entry.endswith(".csv")]
for csv_file in csv_names:
    apprendre_skin(f"prices/{csv_file}")