In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn import metrics
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
# Apply seaborn's default theme to every matplotlib figure drawn afterwards.
sns.set()
import warnings
# Suppress all warnings for the rest of the session. This also hides warnings
# raised by the numerical libraries, so re-enable them when debugging a fit.
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.cross_decomposition import CCA
from sklearn.cross_decomposition import PLSCanonical
from sklearn.cross_decomposition import PLSSVD
def softmax(x):
    """Return the softmax of ``x`` taken over all of its elements.

    The maximum of ``x`` is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (which would turn the ratio into ``nan``) for large inputs.

    Parameters
    ----------
    x : array-like of real numbers

    Returns
    -------
    numpy.ndarray
        Non-negative values with the same shape as ``x`` that sum to 1.
        An empty input yields an empty array.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x
    # exp is evaluated once and reused for numerator and denominator.
    exps = np.exp(x - np.max(x))
    return exps / exps.sum()

def my_custom_loss_func_exp(y_true, y_pred):
    """Spearman rank correlation between ``exp(y_true)`` and ``exp(y_pred)``.

    Despite the ``loss`` in the name, a larger value means better agreement.
    """
    true_exp = np.exp(y_true)
    pred_exp = np.exp(y_pred)
    rank_corr = spearmanr(true_exp, pred_exp)
    return rank_corr.correlation

def my_custom_loss_func(y_true, y_pred):
    """Spearman rank correlation between the targets and the predictions.

    Used as a scorer (higher is better) even though the name says ``loss``.
    """
    rank_corr = spearmanr(y_true, y_pred)
    return rank_corr.correlation

def evaluate_model_cca(X, y, model, model_cca, test_size, loss=my_custom_loss_func, cv=5, state=0):
    """Score ``model`` three ways and return the CCA-corrected hold-out score.

    1. Cross-validated ``loss`` over ``cv`` folds (printed).
    2. In-sample ``loss`` after fitting on all rows (printed).
    3. One shuffled train/test split: the test predictions are passed through
       ``model_cca.predict`` as a single row before being scored (printed and
       returned).

    ``model`` is left fitted on the training part of the split.
    """
    features = np.array(X)
    targets = np.array(y)

    # 1. cross-validation
    cv_scores = cross_val_score(model, features, targets, cv=cv,
                                scoring=make_scorer(loss), n_jobs=-1)
    print('scores = ', cv_scores)
    print(cv_scores.mean())

    # 2. fit on everything and score in-sample
    model.fit(features, targets)
    full_fit_score = loss(targets, model.predict(features))
    print("Full train_score = ", full_fit_score)

    # 3. single hold-out split with the CCA post-processing step
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, targets, test_size=test_size, shuffle=True, random_state=state
    )
    model.fit(X_tr, y_tr)
    split_train_score = loss(y_tr, model.predict(X_tr))
    raw_test_pred = model.predict(X_te)
    # The whole prediction vector is one sample for the CCA model.
    corrected = model_cca.predict(raw_test_pred.reshape(1, -1))
    test_score = loss(y_te, corrected.reshape(-1))
    print("train_score = ", split_train_score)
    print("test_score = ", test_score)
    return test_score


def evaluate_model_mean(X, y, model, test_size=0.33, loss = my_custom_loss_func, state = 100):
    """Average hold-out score of ``model`` over repeated random splits.

    For every seed in ``range(state)`` the data is split with
    ``train_test_split(..., random_state=seed)``, ``model`` is refitted on the
    training part and ``loss`` is evaluated on the held-out part.

    Parameters
    ----------
    X, y : features and targets accepted by ``train_test_split``.
    model : estimator with ``fit`` / ``predict``; left fitted on the last split.
    test_size : forwarded to ``train_test_split``.
    loss : callable ``loss(y_true, y_pred)`` returning a number.
    state : int
        Number of splits to average over (seeds ``0 .. state-1``).

    Returns
    -------
    float
        Mean of the per-split test scores (``nan`` when ``state`` is 0).
    """
    test_scores = []
    for seed in range(state):
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=test_size, random_state=seed)
        model.fit(np.array(X_tr), y_tr)
        # Only the held-out score is needed; no in-sample score is computed.
        held_out_pred = model.predict(np.array(X_te))
        test_scores.append(loss(y_te, held_out_pred.reshape(-1)))
    return np.mean(np.array(test_scores))



def generate_new_data(X,y,model,test_size, loss = my_custom_loss_func, state=0):
    """Fit ``model`` on one random split and return its held-out predictions.

    Parameters
    ----------
    X, y : features and targets accepted by ``train_test_split``.
    model : estimator with ``fit`` / ``predict``; refitted on the training part.
    test_size : forwarded to ``train_test_split``.
    loss : unused; kept so existing calls that pass it keep working.
    state : random seed of the split.

    Returns
    -------
    tuple
        ``(y_predict, y_test)``: predictions for the held-out rows and the
        matching true targets.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=test_size, random_state=state)
    model.fit(np.array(X_tr), y_tr)
    y_predict = model.predict(np.array(X_te))
    return (y_predict, y_te)

def get_cca_model(X_train,y_train,n,m):
    """Fit a CCA model that maps Lasso hold-out predictions to true targets.

    ``n`` random splits are drawn (seeds ``0 .. n-1``). ``m`` is used both as
    the ``test_size`` of every split and as the number of CCA components, so
    each split contributes one row of ``m`` predictions and one row of ``m``
    true values.
    """
    base_model = Lasso(0.0165)
    splits = [
        generate_new_data(X_train, y_train, base_model, m, state=seed)
        for seed in range(n)
    ]
    predicted_rows = np.array([predicted for predicted, _ in splits])
    actual_rows = np.array([actual for _, actual in splits])

    cca = CCA(n_components=m)
    cca.fit(predicted_rows, actual_rows)
    return cca


def fine_tune(y_sub,X_train, y_train, n_splits=654, test_size=654):
    """Post-process a prediction vector with a CCA model fitted on hold-out splits.

    A CCA model is built by ``get_cca_model`` from ``n_splits`` random splits
    of ``(X_train, y_train)``, each holding out ``test_size`` rows. ``y_sub``
    is then passed through that model as a single sample.

    Parameters
    ----------
    y_sub : array-like
        Prediction vector to correct. Its length must equal ``test_size``,
        because that is the number of features the CCA model is fitted on.
    X_train, y_train : training features and targets.
    n_splits : int, default 654
        Number of random splits used to fit the CCA model.
    test_size : int, default 654
        Rows held out per split and number of CCA components.
        # presumably 654 is the number of rows to predict; confirm

    Returns
    -------
    numpy.ndarray
        The corrected predictions, flattened to one dimension.
    """
    model_cca = get_cca_model(X_train, y_train, n_splits, test_size)
    # One row of `test_size` features, matching the layout used for fitting.
    y_row = np.asarray(y_sub).reshape(1, -1)
    return model_cca.predict(y_row).reshape(-1)

In [3]:
# Load the three CSV files: training features, training targets and the
# features of the rows to predict.
path = 'data/'
X_train, Y_train, X_test = (
    pd.read_csv(path + file_name)
    for file_name in ('X_train.csv', 'Y_train.csv', 'X_test.csv')
)

- ID: Unique row identifier, associated with a day (DAY_ID) and a country (COUNTRY),
- DAY_ID: Day identifier - dates have been anonymized, but all data corresponding to a specific day is consistent,
- COUNTRY: Country identifier - DE = Germany, FR = France, 

and then contains daily commodity price variations:

- GAS_RET: European gas,
- COAL_RET: European coal,
- CARBON_RET: Carbon emissions futures,

weather measures (daily, in the country x):

- x_TEMP: Temperature,
- x_RAIN: Rainfall,
- x_WIND: Wind,

energy production measures (daily, in the country x),

- x_GAS: Natural gas,
- x_COAL: Hard coal,
- x_HYDRO: Hydro reservoir,
- x_NUCLEAR: Daily nuclear production,
- x_SOLAR: Photovoltaic,
- x_WINDPOW: Wind power,
- x_LIGNITE: Lignite,

and electricity use metrics (daily, in the country x),

- x_CONSUMPTION: Total electricity consumption,
- x_RESIDUAL_LOAD: Electricity consumption after using all renewable energies,
- x_NET_IMPORT: Imported electricity from Europe,
- x_NET_EXPORT: Exported electricity to Europe,
- DE_FR_EXCHANGE: Total daily electricity exchange between Germany and France,
- FR_DE_EXCHANGE: Total daily electricity exchange between France and Germany.

Output data sets are composed of two columns:

- ID: Unique row identifier - corresponding to the input identifiers,
- TARGET: Daily price variation for futures of 24H electricity baseload.

In [4]:
# Attach the targets to the training features, then stack the test rows
# underneath so both sets go through the same cleaning steps.
data = pd.merge(X_train, Y_train, on=['ID'])
all_data = pd.concat([data.assign(train=1), X_test])
# Test rows have neither TARGET nor a train flag after the concat: use 0 for both.
all_data = all_data.assign(
    TARGET=all_data['TARGET'].fillna(0),
    train=all_data['train'].fillna(0),
)

In [5]:
from sklearn.impute import KNNImputer

def regress_var(df, x_columns, y_column, model, out = True):
    """Fill the missing values of ``y_column`` with predictions from ``model``.

    ``model`` is fitted on the rows of ``df`` that have no missing value in
    any column, with ``x_columns`` as features and ``y_column`` as target.
    Rows where ``y_column`` is NaN receive the model's prediction; every other
    row keeps its original value. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
    x_columns : list of str
        Predictor columns, read for the rows being imputed.
    y_column : str
        Column to impute.
    model : estimator exposing ``fit(X, y)``, ``score(X, y)`` and ``predict(X)``.
    out : bool
        When true, print the model's score on the rows it was fitted on.

    Returns
    -------
    pandas.Series
        The imputed column, carrying the same index as ``df``.
    """
    complete_rows = df.dropna()
    X = np.array(complete_rows[x_columns])
    y = np.array(complete_rows[y_column])
    model.fit(X, y)
    if out:
        print("Model score", model.score(X, y))

    filled = df[y_column].copy()
    filled.name = None
    missing = filled.isna().to_numpy()
    if missing.any():
        # One batched predict call for all missing rows instead of one per row.
        # Positional (iloc) assignment keeps this safe with duplicate index labels.
        filled.iloc[missing] = model.predict(df[x_columns].to_numpy()[missing])
    return filled


def clean_knn(all_data_clean):
    """Impute missing feature values with a 2-nearest-neighbour imputer.

    Every column except ``ID``, ``DAY_ID``, ``TARGET`` and ``train`` is imputed.
    The frame is modified in place and also returned.
    """
    excluded = ['ID', 'DAY_ID', 'TARGET', 'train']
    feature_cols = all_data_clean.columns.difference(excluded)
    imputed = KNNImputer(n_neighbors=2).fit_transform(all_data_clean[feature_cols])
    all_data_clean[feature_cols] = imputed
    return all_data_clean
    
def clean_regression(all_data_clean):
    """Impute missing values with column means and two linear regressions.

    Steps, in order:
      1. ``DE_FR_EXCHANGE`` and ``FR_DE_EXCHANGE`` are mean-filled.
      2. ``DE_NET_EXPORT`` is regressed on ``DE_FR_EXCHANGE`` and
         ``DE_NET_IMPORT`` is set to its negative; then the same is done for
         ``FR_NET_EXPORT`` / ``FR_DE_EXCHANGE`` / ``FR_NET_IMPORT``.
      3. Every remaining NaN is replaced by its column mean.

    Steps 1 and 2 write into the frame that was passed in; step 3 returns a
    new frame. Each regression prints its score.
    """
    # 1. mean-fill the two exchange columns
    for exchange_col in ('DE_FR_EXCHANGE', 'FR_DE_EXCHANGE'):
        exchange_mean = all_data_clean[exchange_col].mean(numeric_only=True)
        all_data_clean[exchange_col] = all_data_clean[exchange_col].fillna(exchange_mean)

    # 2. regress each country's net export on its exchange column
    for country, predictor in (('DE', 'DE_FR_EXCHANGE'), ('FR', 'FR_DE_EXCHANGE')):
        export_col = country + '_NET_EXPORT'
        all_data_clean[export_col] = regress_var(
            all_data_clean, [predictor], export_col, LinearRegression(), out=True
        )
        all_data_clean[country + '_NET_IMPORT'] = -all_data_clean[export_col]

    # 3. whatever is still missing gets the column mean
    return all_data_clean.fillna(all_data_clean.mean(numeric_only=True))

def remove_features(all_data_clean):
    """Drop redundant columns and re-express a few others as differences.

    * ``DE_FR_EXCHANGE``, ``FR_NET_IMPORT`` and ``DE_NET_IMPORT`` are removed.
    * ``FR_DE_EXCHANGE`` is subtracted from ``FR_NET_EXPORT`` and added to
      ``DE_NET_EXPORT``.
    * Each country's ``*_RESIDUAL_LOAD`` is subtracted from its
      ``*_CONSUMPTION`` (the column keeps the ``*_CONSUMPTION`` name).

    A new frame is returned; the argument is left untouched.
    """
    redundant = ['DE_FR_EXCHANGE', 'FR_NET_IMPORT', 'DE_NET_IMPORT']
    reduced = all_data_clean.drop(columns=redundant)

    exchange = reduced['FR_DE_EXCHANGE']
    reduced['FR_NET_EXPORT'] = reduced['FR_NET_EXPORT'] - exchange
    reduced['DE_NET_EXPORT'] = reduced['DE_NET_EXPORT'] + exchange

    for country in ('DE', 'FR'):
        consumption = country + '_CONSUMPTION'
        reduced[consumption] = reduced[consumption] - reduced[country + '_RESIDUAL_LOAD']

    return reduced

def add_clusters(k, all_data_clean,cols,c):
    """Replace ``cols`` by one-hot indicators of a k-means cluster label.

    K-means with ``k`` clusters (``random_state=0``) is fitted on ``cols``;
    those columns are dropped, the cluster label is stored in column ``c``
    and ``c`` is then expanded into dummy columns. Returns a new frame.
    """
    from sklearn.cluster import KMeans

    cluster_input = np.array(all_data_clean[cols])
    remaining = all_data_clean.drop(columns=cols)

    clusterer = KMeans(n_clusters=k, random_state=0)
    clusterer.fit(cluster_input)
    remaining[c] = clusterer.predict(cluster_input)

    return pd.get_dummies(remaining, columns=[c])


def replace_outliers(all_data_clean, cols):
    """Clip each column in ``cols`` to mean +/- 2.75 standard deviations.

    Values above the upper limit are set to the upper limit and values below
    the lower limit to the lower limit. The frame is modified in place and
    also returned.
    """
    for col in cols:
        values = all_data_clean[col]
        center, spread = values.mean(), values.std()
        high = center + 2.75 * spread
        low = center - 2.75 * spread
        all_data_clean[col] = np.select(
            [values > high, values < low],
            [high, low],
            default=values,
        )
    return all_data_clean

def remove_outliers(all_data_clean, cols):
    """Drop training rows that hold an outlier in any of ``cols``.

    A value is an outlier when it lies more than 2.75 standard deviations
    from its column mean (mean and std are taken over all rows of the frame).
    Only rows with ``train == 1`` are ever removed; other rows are kept even
    when they contain extreme values.

    The input frame is not modified, and rows are removed only because of
    outliers, never because they contain missing values.

    Parameters
    ----------
    all_data_clean : pandas.DataFrame
        Must contain a ``train`` column and every column named in ``cols``.
    cols : iterable of str
        Columns to screen.

    Returns
    -------
    pandas.DataFrame
        A new frame without the flagged training rows.
    """
    is_train = (all_data_clean['train'] == 1).to_numpy()
    flagged = np.zeros(len(all_data_clean), dtype=bool)
    for c in cols:
        values = all_data_clean[c]
        center, spread = values.mean(), values.std()
        outside = (values > center + 2.75 * spread) | (values < center - 2.75 * spread)
        flagged |= outside.to_numpy() & is_train
    # Positional boolean mask, so duplicate index labels are handled correctly.
    return all_data_clean[~flagged]

In [6]:
# Per-country column groups: production by source, consumption/load, net export,
# the three commodity returns and TARGET.
# NOTE(review): '*_CONSUMPTION_RENEWABLE' columns are only created by the rename that is
# currently disabled inside remove_features, and neither list is referenced in the cells
# visible here. Confirm before relying on them.
energy_type_columns_FR = ['FR_GAS', 'FR_COAL', 'FR_HYDRO', 'FR_NUCLEAR', 'FR_SOLAR', 'FR_WINDPOW', 'FR_CONSUMPTION_RENEWABLE', 'FR_RESIDUAL_LOAD','FR_NET_EXPORT', 'GAS_RET', 'COAL_RET', 'CARBON_RET','TARGET']
energy_type_columns_DE = ['DE_GAS', 'DE_COAL', 'DE_HYDRO', 'DE_NUCLEAR', 'DE_SOLAR', 'DE_WINDPOW', 'DE_LIGNITE', 'DE_CONSUMPTION_RENEWABLE', 'DE_RESIDUAL_LOAD','DE_NET_EXPORT', 'GAS_RET', 'COAL_RET', 'CARBON_RET','TARGET']   

In [7]:
# Work on a copy so the stacked raw frame (all_data) stays available.
all_data_clean = all_data.copy()
# Encode the country numerically: 'FR' -> 0, anything else -> 1.
all_data_clean['COUNTRY'] = all_data_clean['COUNTRY'].apply(lambda x: 0 if x =='FR' else 1)

#clean
# Imputes missing values; prints one "Model score" line per regression.
all_data_clean = clean_regression(all_data_clean)

#remove features
all_data_clean = remove_features(all_data_clean)


# Columns intended for the outlier helpers; only used by the disabled calls below.
columns = ['FR_DE_EXCHANGE', 'DE_NET_EXPORT',
       'FR_NET_EXPORT', 'DE_GAS', 'FR_GAS', 'DE_COAL', 'FR_COAL', 'DE_HYDRO',
       'FR_HYDRO', 'DE_NUCLEAR', 'FR_NUCLEAR', 'DE_SOLAR', 'FR_SOLAR',
       'DE_WINDPOW', 'FR_WINDPOW', 'DE_LIGNITE', 'DE_RESIDUAL_LOAD',
       'FR_RESIDUAL_LOAD', 'GAS_RET', 'COAL_RET', 'CARBON_RET']

#all_data_clean = remove_outliers(all_data_clean, columns)

#all_data_clean = replace_outliers(all_data_clean, columns)
# The string literal below is disabled clustering code. Because it is the last
# expression of the cell, it is echoed verbatim as the cell output.
'''
cols = ['DE_RAIN','FR_RAIN','DE_WIND','FR_WIND','DE_TEMP','FR_TEMP']
all_data_clean = add_clusters(4,all_data_clean,cols,'season')




cols = ['FR_GAS', 'FR_COAL', 'FR_HYDRO', 'FR_NUCLEAR', 'FR_SOLAR', 'FR_WINDPOW']
all_data_clean = add_clusters(4,all_data_clean,cols,'E_FR')


cols = ['DE_GAS', 'DE_COAL', 'DE_HYDRO', 'DE_NUCLEAR', 'DE_SOLAR', 'DE_WINDPOW', 'DE_LIGNITE']
all_data_clean = add_clusters(4,all_data_clean,cols,'E_DE')


cols = ['FR_GAS', 'FR_COAL','FR_RESIDUAL_LOAD']
all_data_clean = add_clusters(4,all_data_clean,cols,'N_FR')

cols = ['DE_GAS', 'DE_COAL','DE_RESIDUAL_LOAD']
all_data_clean = add_clusters(4,all_data_clean,cols,'N_DE')
'''

Model score 0.49328444244850245
Model score 0.4424460230408356


"\ncols = ['DE_RAIN','FR_RAIN','DE_WIND','FR_WIND','DE_TEMP','FR_TEMP']\nall_data_clean = add_clusters(4,all_data_clean,cols,'season')\n\n\n\n\ncols = ['FR_GAS', 'FR_COAL', 'FR_HYDRO', 'FR_NUCLEAR', 'FR_SOLAR', 'FR_WINDPOW']\nall_data_clean = add_clusters(4,all_data_clean,cols,'E_FR')\n\n\ncols = ['DE_GAS', 'DE_COAL', 'DE_HYDRO', 'DE_NUCLEAR', 'DE_SOLAR', 'DE_WINDPOW', 'DE_LIGNITE']\nall_data_clean = add_clusters(4,all_data_clean,cols,'E_DE')\n\n\ncols = ['FR_GAS', 'FR_COAL','FR_RESIDUAL_LOAD']\nall_data_clean = add_clusters(4,all_data_clean,cols,'N_FR')\n\ncols = ['DE_GAS', 'DE_COAL','DE_RESIDUAL_LOAD']\nall_data_clean = add_clusters(4,all_data_clean,cols,'N_DE')\n"

In [8]:
# List the columns that remain after cleaning and feature removal.
all_data_clean.columns

Index(['ID', 'DAY_ID', 'COUNTRY', 'DE_CONSUMPTION', 'FR_CONSUMPTION',
       'FR_DE_EXCHANGE', 'DE_NET_EXPORT', 'FR_NET_EXPORT', 'DE_GAS', 'FR_GAS',
       'DE_COAL', 'FR_COAL', 'DE_HYDRO', 'FR_HYDRO', 'DE_NUCLEAR',
       'FR_NUCLEAR', 'DE_SOLAR', 'FR_SOLAR', 'DE_WINDPOW', 'FR_WINDPOW',
       'DE_LIGNITE', 'DE_RESIDUAL_LOAD', 'FR_RESIDUAL_LOAD', 'DE_RAIN',
       'FR_RAIN', 'DE_WIND', 'FR_WIND', 'DE_TEMP', 'FR_TEMP', 'GAS_RET',
       'COAL_RET', 'CARBON_RET', 'TARGET', 'train'],
      dtype='object')

In [9]:
# Feature matrices for both countries together (COUNTRY stays in as a feature).
# Rows are told apart by the 'train' flag: 1 = training row, 0 = row to predict.
X_train = all_data_clean[all_data_clean['train'] == 1].drop(['train','TARGET','ID','DAY_ID'],axis=1)
X_test = all_data_clean[all_data_clean['train'] == 0].drop(['train','TARGET','DAY_ID'],axis=1)
# Keep the test IDs aside before removing the column from the features.
id_test = X_test['ID']
X_test = X_test.drop('ID',axis=1)
y_train = all_data_clean[all_data_clean['train'] == 1]['TARGET']


# Per-country training subsets (COUNTRY == 0 is FR, 1 is DE); COUNTRY is dropped here.
X_train_FR = all_data_clean[(all_data_clean['train'] == 1) & (all_data_clean['COUNTRY'] == 0)].drop(['train','TARGET','ID','DAY_ID','COUNTRY'],axis=1)
X_train_DE = all_data_clean[(all_data_clean['train'] == 1) & (all_data_clean['COUNTRY'] == 1)].drop(['train','TARGET','ID','DAY_ID','COUNTRY'],axis=1)
y_train_FR = all_data_clean[(all_data_clean['train'] == 1) & (all_data_clean['COUNTRY'] == 0)]['TARGET']
y_train_DE = all_data_clean[(all_data_clean['train'] == 1) & (all_data_clean['COUNTRY'] == 1)]['TARGET']

# Per-country test subsets, again keeping the IDs aside.
X_test_FR = all_data_clean[(all_data_clean['train'] == 0) & (all_data_clean['COUNTRY'] == 0)].drop(['train','TARGET','DAY_ID','COUNTRY'],axis=1)
id_test_FR = X_test_FR['ID']
X_test_FR = X_test_FR.drop('ID',axis=1)

X_test_DE = all_data_clean[(all_data_clean['train'] == 0) & (all_data_clean['COUNTRY'] == 1)].drop(['train','TARGET','DAY_ID','COUNTRY'],axis=1)
id_test_DE = X_test_DE['ID']
X_test_DE = X_test_DE.drop('ID',axis=1)

In [10]:
# Display the combined training feature matrix.
X_train

Unnamed: 0,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_GAS,FR_GAS,DE_COAL,FR_COAL,...,FR_RESIDUAL_LOAD,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET
0,0,-0.416568,0.017203,0.606523,-0.007214,0.086336,0.441238,-0.213766,0.740627,0.288782,...,-0.444661,-0.172680,-0.556356,-0.790823,-0.283160,-1.069070,-0.063404,0.339041,0.124552,-0.002445
1,0,0.373070,0.179741,0.022063,-0.551456,-1.152901,0.174773,0.426940,-0.170392,-0.762153,...,-1.183194,-1.240300,-0.770457,1.522331,0.828412,0.437419,1.831241,-0.659091,0.047114,-0.490365
2,0,0.058410,0.031393,-1.021305,-1.643326,-0.661281,2.351913,2.122241,1.572267,0.777053,...,1.947273,-0.480700,-0.313338,0.431134,0.487608,0.684884,0.114836,0.535974,0.743338,0.204952
3,1,0.208565,0.127776,0.839586,0.568716,-0.276356,0.487818,0.194659,-1.473817,-0.786025,...,-0.976974,-1.114838,-0.507570,-0.499409,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948
4,0,-0.427806,-0.090771,0.924990,0.087168,0.065334,0.238693,-0.240862,1.003734,-0.274975,...,-0.526267,-0.541465,-0.424550,-1.088158,-1.011560,0.614338,0.729495,0.245109,1.526606,2.614378
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1489,1,-0.018578,0.597169,1.855327,1.636668,-0.404901,1.810665,1.388269,0.359723,-0.294001,...,0.509514,-0.000009,0.008455,0.103704,0.109674,-0.005285,-0.009413,0.876984,0.819520,1.320373
1490,0,0.259655,0.086588,-0.611392,-0.162238,0.459246,1.972779,1.558300,0.561356,0.230746,...,1.666252,-0.000009,0.008455,0.103704,0.109674,-0.005285,-0.009413,0.932633,-0.085690,0.356356
1491,1,-0.637471,0.131079,0.255778,-1.275766,-1.085346,2.108764,1.866399,1.072553,-0.180117,...,0.358120,0.207905,0.404763,-0.594595,0.894011,0.256338,0.402316,-1.112899,-0.237835,0.067152
1492,0,0.698356,-0.158915,0.830239,0.525383,0.379991,-0.003973,0.869742,-0.436935,-0.772801,...,-0.184862,-0.682815,-0.390304,-0.972088,-1.501930,1.215528,1.338708,0.962812,-5.392852,-0.843812


In [11]:
# Disabled experiment kept as a string literal: grid search over the Lasso alpha
# (0 to 1 in steps of 0.05) followed by a repeated-split evaluation. The cell only
# echoes the string; nothing inside it runs.
'''
lasso_parameters = {
'alpha': np.arange(0.00, 1.0, 0.05)
}

model = Lasso()
#evaluate_model(X_train, y_train, model)
clf = GridSearchCV(model, # model
    param_grid = lasso_parameters, # hyperparameters
    scoring= make_scorer(my_custom_loss_func), # metric for scoring
    cv=5,
    n_jobs=-1, error_score='raise', verbose=3)
clf.fit(X_train, y_train)
print(clf.best_params_)
best_p = clf.best_params_
print(best_p['alpha'])
model = Lasso(best_p['alpha'])

evaluate_model_mean(X_train,y_train,model,state = 200)
'''

"\nlasso_parameters = {\n'alpha': np.arange(0.00, 1.0, 0.05)\n}\n\nmodel = Lasso()\n#evaluate_model(X_train, y_train, model)\nclf = GridSearchCV(model, # model\n    param_grid = lasso_parameters, # hyperparameters\n    scoring= make_scorer(my_custom_loss_func), # metric for scoring\n    cv=5,\n    n_jobs=-1, error_score='raise', verbose=3)\nclf.fit(X_train, y_train)\nprint(clf.best_params_)\nbest_p = clf.best_params_\nprint(best_p['alpha'])\nmodel = Lasso(best_p['alpha'])\n\nevaluate_model_mean(X_train,y_train,model,state = 200)\n"

In [12]:
# Accumulators for the feature-selection experiment in the next cell
# (one score and one boolean support mask per feature count tried).
best_score = []
best_selection = [ ]

In [13]:
# Disabled experiment kept as a string literal: sequential feature selection with
# Lasso for each feature count, an alpha grid search on the selected features, and
# a submission frame built from the best selection. Nothing inside the string runs.
# NOTE(review): the loop reads best_p before assigning it, so if re-enabled it
# depends on the (also disabled) grid-search cell above having run first.
'''
for i in range(10,X_train.shape[1]):
    
    lasso_parameters = {
    'alpha': np.arange(0.00, 1.0, 0.005)
    }

   
    sfs = SequentialFeatureSelector(Lasso(best_p['alpha']), n_features_to_select=i,cv=5)
    sfs.fit(X_train, y_train)
    
    model = Lasso()
    #evaluate_model(X_train, y_train, model)
    clf = GridSearchCV(model, # model
        param_grid = lasso_parameters, # hyperparameters
        scoring= make_scorer(my_custom_loss_func), # metric for scoring
        cv=10,
        n_jobs=-1, error_score='raise', verbose=3)
    clf.fit(np.array(X_train)[:,sfs.get_support()], y_train)
    print(clf.best_params_)
    best_p = clf.best_params_
    print(best_p['alpha'])
    model = Lasso(best_p['alpha'])
    best_selection.append(sfs.get_support())
    best_score.append(evaluate_model_mean(np.array(X_train)[:,sfs.get_support()],y_train,model,state = 100))
    print(i,best_score[-1])
    
np.array(X_train.columns)[best_selection[np.argmax(best_score)]]


lasso_parameters = {
'alpha': np.arange(0.00, 1.0, 0.00005)
}

model = Lasso()
#evaluate_model(X_train, y_train, model)
clf = GridSearchCV(model, # model
    param_grid = lasso_parameters, # hyperparameters
    scoring= make_scorer(my_custom_loss_func), # metric for scoring
    cv=5,
    n_jobs=-1, error_score='raise', verbose=3)
clf.fit(np.array(X_train)[:,best_selection[np.argmax(best_score)]], y_train)
print(clf.best_params_)
best_p = clf.best_params_
print(best_p['alpha'])
model = Lasso(best_p['alpha'])

evaluate_model_mean(np.array(X_train)[:,best_selection[np.argmax(best_score)]],y_train,model,state = 1000)

print(best_p['alpha'])
model = Lasso(best_p['alpha'])
evaluate_model_mean(np.array(X_train)[:,best_selection[np.argmax(best_score)]],y_train,model,state = 5000)


y_sub = model.predict(np.array(X_test)[:,best_selection[np.argmax(best_score)]])
sub = pd.DataFrame()
sub['ID'] = id_test
sub['TARGET'] = y_sub
#sub.to_csv("f_COUNTRY,FR_DE_EXCHANGE,DE_NET_EXPORT,DE_HYDRO,FR_NUCLEAR.csv",index=False)
'''

'\nfor i in range(10,X_train.shape[1]):\n    \n    lasso_parameters = {\n    \'alpha\': np.arange(0.00, 1.0, 0.005)\n    }\n\n   \n    sfs = SequentialFeatureSelector(Lasso(best_p[\'alpha\']), n_features_to_select=i,cv=5)\n    sfs.fit(X_train, y_train)\n    \n    model = Lasso()\n    #evaluate_model(X_train, y_train, model)\n    clf = GridSearchCV(model, # model\n        param_grid = lasso_parameters, # hyperparameters\n        scoring= make_scorer(my_custom_loss_func), # metric for scoring\n        cv=10,\n        n_jobs=-1, error_score=\'raise\', verbose=3)\n    clf.fit(np.array(X_train)[:,sfs.get_support()], y_train)\n    print(clf.best_params_)\n    best_p = clf.best_params_\n    print(best_p[\'alpha\'])\n    model = Lasso(best_p[\'alpha\'])\n    best_selection.append(sfs.get_support())\n    best_score.append(evaluate_model_mean(np.array(X_train)[:,sfs.get_support()],y_train,model,state = 100))\n    print(i,best_score[-1])\n    \nnp.array(X_train.columns)[best_selection[np.argmax(b

In [14]:
from scipy.optimize import minimize
import numpy as np

def mean_absolute_percentage_error(y_pred, y_true, sample_weights=None):
    """Mean absolute percentage error, optionally weighted, in percent.

    Entries where ``y_true`` is zero are removed (with a printed notice)
    from the targets, the predictions and the weights before the error is
    computed. Note the argument order: predictions first, targets second.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    assert len(actual) == len(predicted)

    if np.any(actual == 0):
        print("Found zeroes in y_true. MAPE undefined. Removing from set...")
        zero_positions = np.flatnonzero(actual == 0)
        actual = np.delete(actual, zero_positions)
        predicted = np.delete(predicted, zero_positions)
        if sample_weights is not None:
            sample_weights = np.delete(np.array(sample_weights), zero_positions)

    abs_pct_error = np.abs((actual - predicted) / actual)
    if sample_weights is None:
        return np.mean(abs_pct_error) * 100

    weights = np.array(sample_weights)
    assert len(weights) == len(actual)
    return 100/sum(weights)*np.dot(weights, abs_pct_error)
 
def mean_square_error(y_pred,y_true, sample_weights=None):
    """Return twice the mean absolute difference between ``y_pred`` and ``y_true``.

    Despite the name, no squaring takes place. ``sample_weights`` is accepted
    so the function fits the common loss signature, but it is ignored.
    """
    abs_residuals = np.abs(y_pred - y_true)
    return 2 * np.mean(abs_residuals)

def error_corr_2(y_pred):
    """Penalty on the rank correlation between the two prediction columns.

    The absolute Spearman correlation of columns 0 and 1 is multiplied by 0,
    so the term is currently switched off (it evaluates to 0 whenever the
    correlation is defined).
    """
    first_output = y_pred[:, 0]
    second_output = y_pred[:, 1]
    cross_corr = spearmanr(first_output, second_output).correlation
    return 0 * np.abs(cross_corr)

def error_corr(y_pred,y_true, sample_weights = None):
    """Spearman rank correlation between predictions and targets.

    ``sample_weights`` is accepted for signature compatibility and ignored.
    """
    rank_corr = spearmanr(y_true, y_pred)
    return rank_corr.correlation
    

class CustomLinearModel:
    """
    Two-output linear model ``Y = X @ B`` fitted by direct minimisation.

    ``B`` has shape ``(n_features, 2)``. ``fit`` minimises

        loss_function(X @ B, Y)
        + (1 - spearman(column 0)) + (1 - spearman(column 1))
        + sum(regularization * |B|)

    The penalty uses absolute values, so it is an L1-style penalty even
    though the method computing it is called ``l2_regularized_loss``.
    ``regularization`` broadcasts against the two columns of ``B``.
    """
    def __init__(self, loss_function=mean_square_error, 
                 X=None, Y=None, sample_weights=None, beta_init=None, 
                 regularization=np.array([0.01,0.001])):
        self.regularization = regularization
        self.beta = None
        self.loss_function = loss_function
        self.sample_weights = sample_weights
        self.beta_init = beta_init
        
        self.X = X
        self.Y = Y
            
    def clip_beta(self):
        """Set coefficients with magnitude below 1e-5 to exactly zero (in place)."""
        self.beta[np.abs(self.beta) < 1e-5] = 0
    
    def cons_f(self,beta):
        """Variance of ``X @ beta``; written for an optimiser constraint that ``fit`` does not use."""
        return np.var(self.X@beta)

    # cons_J / cons_H take no instance argument, so they are static methods.
    # They are not referenced by fit.
    @staticmethod
    def cons_J(x):
        return [[2*x[0], 1], [2*x[0], -1]]

    @staticmethod
    def cons_H(x, v):
        return v[0]*np.array([[2, 0], [0, 0]]) + v[1]*np.array([[2, 0], [0, 0]])
    
    def predict(self, X):
        """Return ``X @ beta``: one column per output."""
        prediction = X@self.beta
        return(prediction)

    def model_error(self):
        """Unpenalised objective on the stored training data."""
        y_predict = self.predict(self.X)
        error = self.loss_function(
            y_predict, self.Y, sample_weights=self.sample_weights
        ) +  error_corr_2(y_predict) + 2 - error_corr(y_predict[:,0],self.Y[:,0]) - error_corr(y_predict[:,1],self.Y[:,1])
        return(error)
    
    def l2_regularized_loss(self, beta):
        """Objective handed to the optimiser: model error plus the |beta| penalty.

        Stores ``beta`` (reshaped to two columns) on the instance as a side effect.
        """
        self.beta = beta.reshape(-1,2)
        
        return(self.model_error() + \
               np.sum(self.regularization*np.abs(np.array(self.beta)))) 
    
    def score(self,y_predict,y_true):
        """Per-output Spearman correlation as a length-2 array."""
        return np.array([error_corr(y_predict[:,0],y_true[:,0]),error_corr(y_predict[:,1],y_true[:,1])])
    
    def fit(self, X, Y, maxiter=10000):   
        """Fit the coefficients with Powell's method.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        Y : ndarray of shape (n_samples, 2)
        maxiter : int
            Iteration cap forwarded to ``scipy.optimize.minimize``.

        The optimiser starts from ``beta_init`` (all ones when none was
        given). After fitting, ``beta_init`` is set to the solution, so a
        further call to ``fit`` continues from where the last one stopped.
        """
        self.X = X
        self.Y = Y     
        # Initialize beta estimates (you may need to normalize
        # your data and choose smarter initialization values
        # depending on the shape of your loss function)
        if self.beta_init is None:
            # set beta_init = 1 for every feature
            self.beta_init = np.array([1]*2*self.X.shape[1])

        beta_start = np.asarray(self.beta_init).reshape(-1)

        # Compare with `is not None` / array_equal: using `!=` and `all()` on
        # arrays raises "truth value of an array is ambiguous" on a refit.
        if self.beta is not None and np.array_equal(beta_start, np.asarray(self.beta).reshape(-1)):
            print("Model already fit once; continuing fit with more itrations.")
        
        # A variance constraint built from cons_f was tried at some point but
        # is not passed to the optimiser.
        res = minimize(self.l2_regularized_loss, beta_start,
                       method='Powell', options={'maxiter': maxiter})
        self.beta = (res.x).reshape(-1,2)
        self.beta_init = self.beta
        

# Reshape the training data so that each day carries two targets (y_FR and y_DE).

In [15]:
# Per-country training frames that keep DAY_ID and carry the target under a
# country-specific name (y_FR / y_DE), ready to be joined day by day.
X_train_FR = (
    all_data_clean[(all_data_clean['COUNTRY'] == 0) & (all_data_clean['train'] == 1)]
    .rename(columns={'TARGET': 'y_FR'})
    .drop(['train', 'ID', 'COUNTRY'], axis=1)
)
X_train_DE = (
    all_data_clean[(all_data_clean['COUNTRY'] == 1) & (all_data_clean['train'] == 1)]
    .rename(columns={'TARGET': 'y_DE'})
    .drop(['train', 'ID', 'COUNTRY'], axis=1)
)

In [16]:
# Check the columns of the FR frame (these are the merge keys plus y_FR).
X_train_FR.columns

Index(['DAY_ID', 'DE_CONSUMPTION', 'FR_CONSUMPTION', 'FR_DE_EXCHANGE',
       'DE_NET_EXPORT', 'FR_NET_EXPORT', 'DE_GAS', 'FR_GAS', 'DE_COAL',
       'FR_COAL', 'DE_HYDRO', 'FR_HYDRO', 'DE_NUCLEAR', 'FR_NUCLEAR',
       'DE_SOLAR', 'FR_SOLAR', 'DE_WINDPOW', 'FR_WINDPOW', 'DE_LIGNITE',
       'DE_RESIDUAL_LOAD', 'FR_RESIDUAL_LOAD', 'DE_RAIN', 'FR_RAIN', 'DE_WIND',
       'FR_WIND', 'DE_TEMP', 'FR_TEMP', 'GAS_RET', 'COAL_RET', 'CARBON_RET',
       'y_FR'],
      dtype='object')

In [17]:
# Join the FR and DE rows of the same day into one row holding both targets.
# The join keys are DAY_ID plus every feature column, and pd.merge defaults to
# an inner join, so a day present for only one country is dropped here.
# NOTE(review): the features are floats compared for exact equality. Confirm
# that both country rows of a day always carry identical feature values.
X_train_reduced = pd.merge(X_train_FR,X_train_DE,on=['DAY_ID', 'DE_CONSUMPTION', 'FR_CONSUMPTION', 'FR_DE_EXCHANGE',
       'DE_NET_EXPORT', 'FR_NET_EXPORT', 'DE_GAS', 'FR_GAS', 'DE_COAL',
       'FR_COAL', 'DE_HYDRO', 'FR_HYDRO', 'DE_NUCLEAR', 'FR_NUCLEAR',
       'DE_SOLAR', 'FR_SOLAR', 'DE_WINDPOW', 'FR_WINDPOW', 'DE_LIGNITE',
       'DE_RESIDUAL_LOAD', 'FR_RESIDUAL_LOAD', 'DE_RAIN', 'FR_RAIN', 'DE_WIND',
       'FR_WIND', 'DE_TEMP', 'FR_TEMP', 'GAS_RET', 'COAL_RET', 'CARBON_RET'])

In [18]:
# Separate the two targets from the features of the day-level frame.
y_train_reduced = X_train_reduced[['y_FR','y_DE']]
X_train_reduced = X_train_reduced.drop(['y_FR','y_DE','DAY_ID'],axis=1)

# First fit of the two-output linear model on days that have both countries.
model = CustomLinearModel()
model.fit(np.array(X_train_reduced),np.array(y_train_reduced))

y_predict = model.predict(np.array(X_train_reduced))


# The next expressions are not the last statement of the cell, so their
# values are computed but not displayed.
model.score(y_predict,np.array(y_train_reduced))

# Spearman correlation with both outputs flattened into one vector.
error_corr(y_predict.reshape(-1),np.array(y_train_reduced).reshape(-1))

pred = pd.DataFrame()
pred['y_pred_FR'] = y_predict[:,0]
pred['y_pred_DE'] = y_predict[:,1]
pred.corr()


# Predict both outputs for every training row (COUNTRY is not a model input).
X_train_d = X_train.drop(['COUNTRY'],axis=1)
X_train_d

y_predict = model.predict(np.array(X_train_d))

In [19]:
# Attach the true target, both predicted outputs and the country to each row.
X_train_d['TARGET'] = np.array(y_train)
X_train_d['y_FR'] = np.array(y_predict[:,0]) 
X_train_d['y_DE'] = np.array(y_predict[:,1]) 
X_train_d['COUNTRY'] = X_train['COUNTRY']
test = X_train_d[['COUNTRY','y_FR','y_DE','TARGET']]
# Per row, keep the output that matches the row's country (0 -> y_FR, else y_DE).
y_pred = np.array(test.apply(lambda row: row['y_FR'] if row['COUNTRY']==0 else row['y_DE'],axis=1))
# Pearson correlation with the true target; not displayed because it is not
# the last expression of the cell.
np.corrcoef(y_pred,np.array(y_train))
# Test features without COUNTRY, matching the model's inputs.
X_test_d = X_test.drop(['COUNTRY'],axis=1)
X_test_d


Unnamed: 0,DE_CONSUMPTION,FR_CONSUMPTION,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_GAS,FR_GAS,DE_COAL,FR_COAL,DE_HYDRO,...,FR_RESIDUAL_LOAD,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET
0,-0.906186,-0.211080,0.423521,0.588854,0.095898,1.524963,0.423845,2.137016,0.478424,0.776246,...,-0.222525,-0.513180,-0.182048,-0.982546,-0.876632,0.880491,0.692242,0.569419,-0.029697,-0.929256
1,0.249995,-0.077328,-0.601610,-0.258809,1.156978,1.064102,1.807256,0.089534,-0.792111,1.549245,...,0.857739,-0.340595,-0.301094,-0.759816,-1.221443,-0.616617,-0.737496,0.251251,0.753646,0.664086
2,2.194214,0.273987,-1.179158,0.441770,1.846059,0.128004,-0.043300,-0.829546,-0.785153,0.419503,...,0.447967,0.796475,-0.367248,0.376055,-0.483363,0.865138,0.120079,-1.485642,-0.326450,-0.349747
3,-0.733309,-0.162132,0.044539,-0.173765,-0.249815,1.046069,-0.174150,0.888429,-0.283794,1.864139,...,-0.561295,-0.542606,-0.013291,-0.791119,-0.894309,0.239153,0.457457,-0.746863,2.262654,0.642069
4,-0.218271,-0.083331,-0.617391,-0.008830,0.376536,1.376753,1.413967,1.703635,-0.239676,0.815007,...,0.503567,-0.230291,-0.609203,-0.744986,-1.196282,0.176557,0.312557,-2.219626,-0.509272,-0.488341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,1.603768,0.155855,-0.749769,0.448796,-0.864690,-0.601501,0.012450,-0.641398,-0.761764,-0.353839,...,-1.057186,1.591562,-0.284628,2.272121,1.647899,-0.838634,-0.463684,-1.071829,0.720641,-1.842072
650,0.853161,0.411201,-0.661778,-0.050792,-1.272234,0.993839,1.962333,0.989344,-0.296166,-0.382996,...,1.055349,0.155454,-0.168436,1.290495,2.199105,-0.329098,-1.172309,-1.290355,-0.897009,-0.925825
651,-0.825479,-0.042807,-0.557006,-0.883704,-0.905669,3.215415,2.157089,1.733252,0.174214,1.678564,...,1.824478,-0.825759,-0.266881,-1.247165,-0.666308,-0.807636,-1.260777,0.931572,0.151169,0.474347
652,-0.254705,0.006782,-0.537247,-1.646668,-0.927039,2.534455,2.202872,1.310571,0.683835,-0.094857,...,1.964292,-0.145241,-0.358861,0.195383,-0.103433,1.352906,-0.485161,0.995363,-0.152287,2.067306


In [20]:
# Number of rows per DAY_ID, counted over all of all_data_clean (train and test rows).
d = all_data_clean['DAY_ID'].value_counts()
# Flag rows whose DAY_ID occurs exactly once. This adds a 'yes' column to
# all_data_clean, which later slices of that frame will also carry.
all_data_clean['yes'] = np.array(all_data_clean.apply(lambda row: 1 if d[row['DAY_ID']] == 1 else 0,axis=1))
# Training rows from those single-row days, reduced to the model's feature columns.
X_train_add = all_data_clean[((all_data_clean['yes'] == 1)& (all_data_clean['train'] == 1))]
X_train_add = X_train_add.drop(['ID','DAY_ID','yes','train','COUNTRY'],axis=1)
target = np.array(X_train_add['TARGET'])
X_train_add = X_train_add.drop(['TARGET'],axis=1)
# Use the model fitted earlier to fill in the DE output for these rows.
y_DE = model.predict(np.array(X_train_add))[:,1]
# NOTE(review): the true TARGET is stored as y_FR for every selected row, and
# COUNTRY was dropped above. Confirm that all single-row days are FR rows.
X_train_add['y_FR'] = target
X_train_add['y_DE'] = y_DE

In [21]:
# Put the two targets back next to the day-level features, then append the
# rows from single-row days (true y_FR, model-predicted y_DE).
X_train_reduced['y_FR'] = y_train_reduced['y_FR']
X_train_reduced['y_DE'] = y_train_reduced['y_DE']
# The concat keeps the original index labels of both frames.
X_train_reduced = pd.concat([X_train_reduced,X_train_add])

In [43]:
# Spearman correlation between the FR and DE targets.
# NOTE(review): this cell's execution count (43) is out of sequence, so the
# displayed value may reflect a different state of y_train_reduced.
spearmanr(np.array(y_train_reduced['y_FR']),  np.array(y_train_reduced['y_DE'])).correlation

0.08504357709004168

In [41]:
# Correlation matrix of the two targets (DataFrame.corr).
# NOTE(review): executed out of sequence (count 41), as with the cell above.
y_train_reduced.corr()

Unnamed: 0,y_FR,y_DE
y_FR,1.0,0.072787
y_DE,0.072787,1.0


In [22]:
# Split targets and features again, now including the appended rows.
y_train_reduced = X_train_reduced[['y_FR','y_DE']]
X_train_reduced = X_train_reduced.drop(['y_FR','y_DE'],axis=1)

# Second fit, from a fresh model, on the augmented training set.
model = CustomLinearModel()
model.fit(np.array(X_train_reduced),np.array(y_train_reduced))

y_predict = model.predict(np.array(X_train_reduced))

In [23]:
# In-sample Spearman correlation per output: [FR, DE].
model.score(y_predict,np.array(y_train_reduced))

array([0.2365552 , 0.59335094])

In [24]:
# In-sample Spearman correlation with both outputs flattened into one vector.
error_corr(y_predict.reshape(-1),np.array(y_train_reduced).reshape(-1))

0.44606040167494254

In [25]:
# Correlation between the two predicted outputs.
pred = pd.DataFrame({
    'y_pred_FR': y_predict[:, 0],
    'y_pred_DE': y_predict[:, 1],
})
pred.corr()

Unnamed: 0,y_pred_FR,y_pred_DE
y_pred_FR,1.0,0.221678
y_pred_DE,0.221678,1.0


In [26]:
# Predict both outputs for every training row; COUNTRY is not a model input.
X_train_d = X_train.drop(['COUNTRY'],axis=1)
# Not the last expression of the cell, so the frame is not displayed.
X_train_d

y_predict = model.predict(np.array(X_train_d))

In [27]:
# Attach the true target, both predicted outputs and the country to each row.
X_train_d = X_train_d.assign(
    TARGET=np.array(y_train),
    y_FR=np.array(y_predict[:, 0]),
    y_DE=np.array(y_predict[:, 1]),
    COUNTRY=X_train['COUNTRY'],
)
test = X_train_d[['COUNTRY', 'y_FR', 'y_DE', 'TARGET']]
# Per row, keep the output that matches the row's country (0 -> y_FR, else y_DE).
y_pred = np.where(test['COUNTRY'] == 0, test['y_FR'], test['y_DE'])
np.corrcoef(y_pred, np.array(y_train))


array([[1.        , 0.21858172],
       [0.21858172, 1.        ]])

In [28]:
# Same Pearson correlation as in the previous cell, recomputed.
np.corrcoef(y_pred,np.array(y_train))

array([[1.        , 0.21858172],
       [0.21858172, 1.        ]])

In [29]:
# Test features without COUNTRY, matching the columns the model was fitted on.
X_test_d = X_test.drop(['COUNTRY'],axis=1)
X_test_d

Unnamed: 0,DE_CONSUMPTION,FR_CONSUMPTION,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_GAS,FR_GAS,DE_COAL,FR_COAL,DE_HYDRO,...,FR_RESIDUAL_LOAD,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET
0,-0.906186,-0.211080,0.423521,0.588854,0.095898,1.524963,0.423845,2.137016,0.478424,0.776246,...,-0.222525,-0.513180,-0.182048,-0.982546,-0.876632,0.880491,0.692242,0.569419,-0.029697,-0.929256
1,0.249995,-0.077328,-0.601610,-0.258809,1.156978,1.064102,1.807256,0.089534,-0.792111,1.549245,...,0.857739,-0.340595,-0.301094,-0.759816,-1.221443,-0.616617,-0.737496,0.251251,0.753646,0.664086
2,2.194214,0.273987,-1.179158,0.441770,1.846059,0.128004,-0.043300,-0.829546,-0.785153,0.419503,...,0.447967,0.796475,-0.367248,0.376055,-0.483363,0.865138,0.120079,-1.485642,-0.326450,-0.349747
3,-0.733309,-0.162132,0.044539,-0.173765,-0.249815,1.046069,-0.174150,0.888429,-0.283794,1.864139,...,-0.561295,-0.542606,-0.013291,-0.791119,-0.894309,0.239153,0.457457,-0.746863,2.262654,0.642069
4,-0.218271,-0.083331,-0.617391,-0.008830,0.376536,1.376753,1.413967,1.703635,-0.239676,0.815007,...,0.503567,-0.230291,-0.609203,-0.744986,-1.196282,0.176557,0.312557,-2.219626,-0.509272,-0.488341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,1.603768,0.155855,-0.749769,0.448796,-0.864690,-0.601501,0.012450,-0.641398,-0.761764,-0.353839,...,-1.057186,1.591562,-0.284628,2.272121,1.647899,-0.838634,-0.463684,-1.071829,0.720641,-1.842072
650,0.853161,0.411201,-0.661778,-0.050792,-1.272234,0.993839,1.962333,0.989344,-0.296166,-0.382996,...,1.055349,0.155454,-0.168436,1.290495,2.199105,-0.329098,-1.172309,-1.290355,-0.897009,-0.925825
651,-0.825479,-0.042807,-0.557006,-0.883704,-0.905669,3.215415,2.157089,1.733252,0.174214,1.678564,...,1.824478,-0.825759,-0.266881,-1.247165,-0.666308,-0.807636,-1.260777,0.931572,0.151169,0.474347
652,-0.254705,0.006782,-0.537247,-1.646668,-0.927039,2.534455,2.202872,1.310571,0.683835,-0.094857,...,1.964292,-0.145241,-0.358861,0.195383,-0.103433,1.352906,-0.485161,0.995363,-0.152287,2.067306


In [30]:
# Re-display of X_test_d, the COUNTRY-free test feature frame built in the
# preceding cell.
X_test_d

Unnamed: 0,DE_CONSUMPTION,FR_CONSUMPTION,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_GAS,FR_GAS,DE_COAL,FR_COAL,DE_HYDRO,...,FR_RESIDUAL_LOAD,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET
0,-0.906186,-0.211080,0.423521,0.588854,0.095898,1.524963,0.423845,2.137016,0.478424,0.776246,...,-0.222525,-0.513180,-0.182048,-0.982546,-0.876632,0.880491,0.692242,0.569419,-0.029697,-0.929256
1,0.249995,-0.077328,-0.601610,-0.258809,1.156978,1.064102,1.807256,0.089534,-0.792111,1.549245,...,0.857739,-0.340595,-0.301094,-0.759816,-1.221443,-0.616617,-0.737496,0.251251,0.753646,0.664086
2,2.194214,0.273987,-1.179158,0.441770,1.846059,0.128004,-0.043300,-0.829546,-0.785153,0.419503,...,0.447967,0.796475,-0.367248,0.376055,-0.483363,0.865138,0.120079,-1.485642,-0.326450,-0.349747
3,-0.733309,-0.162132,0.044539,-0.173765,-0.249815,1.046069,-0.174150,0.888429,-0.283794,1.864139,...,-0.561295,-0.542606,-0.013291,-0.791119,-0.894309,0.239153,0.457457,-0.746863,2.262654,0.642069
4,-0.218271,-0.083331,-0.617391,-0.008830,0.376536,1.376753,1.413967,1.703635,-0.239676,0.815007,...,0.503567,-0.230291,-0.609203,-0.744986,-1.196282,0.176557,0.312557,-2.219626,-0.509272,-0.488341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,1.603768,0.155855,-0.749769,0.448796,-0.864690,-0.601501,0.012450,-0.641398,-0.761764,-0.353839,...,-1.057186,1.591562,-0.284628,2.272121,1.647899,-0.838634,-0.463684,-1.071829,0.720641,-1.842072
650,0.853161,0.411201,-0.661778,-0.050792,-1.272234,0.993839,1.962333,0.989344,-0.296166,-0.382996,...,1.055349,0.155454,-0.168436,1.290495,2.199105,-0.329098,-1.172309,-1.290355,-0.897009,-0.925825
651,-0.825479,-0.042807,-0.557006,-0.883704,-0.905669,3.215415,2.157089,1.733252,0.174214,1.678564,...,1.824478,-0.825759,-0.266881,-1.247165,-0.666308,-0.807636,-1.260777,0.931572,0.151169,0.474347
652,-0.254705,0.006782,-0.537247,-1.646668,-0.927039,2.534455,2.202872,1.310571,0.683835,-0.094857,...,1.964292,-0.145241,-0.358861,0.195383,-0.103433,1.352906,-0.485161,0.995363,-0.152287,2.067306


In [31]:
# Predict on the test features. The result is indexed as [:, 0] and [:, 1] in
# the next cell, i.e. one column per target (FR, DE).
# NOTE(review): this must run before the next cell, which adds COUNTRY / y_FR /
# y_DE columns to X_test_d and overwrites y_predict_test with a 1-D array.
y_predict_test = model.predict(np.array(X_test_d))

In [32]:
# Attach the COUNTRY flag and both per-country predictions to the test frame.
# NOTE: this cell is not idempotent - it rebinds y_predict_test from the 2-D
# model output to the 1-D selected prediction, so it cannot be run twice in a
# row without re-running the prediction cell first.
X_test_d['COUNTRY'] = X_test['COUNTRY']
X_test_d['y_FR'] = y_predict_test[:, 0]
X_test_d['y_DE'] = y_predict_test[:, 1]

# For each row take the FR prediction when COUNTRY == 0, otherwise the DE one.
# Vectorized with np.where instead of a row-wise DataFrame.apply: same values,
# no per-row Python call, and an empty frame yields an empty 1-D array rather
# than the empty DataFrame that apply(axis=1) would return.
y_predict_test = np.where(
    (X_test_d['COUNTRY'] == 0).to_numpy(),
    X_test_d['y_FR'].to_numpy(),
    X_test_d['y_DE'].to_numpy(),
)

In [33]:
# Assemble the submission (ID + selected prediction) and write it without the
# index column. `sub` is kept as a global because a later cell merges on it.
sub = pd.DataFrame().assign(ID=id_test, TARGET=y_predict_test)
sub.to_csv("submissions/2_linear_model.csv", index=False)

In [34]:
# Population variance (ddof=0) of the submitted predictions.
y_predict_test.var()

0.14122767263784647

In [35]:
# Compare this submission with the saved best one: inner-join on ID, then look
# at the Pearson correlation between TARGET_x (best_model.csv) and TARGET_y
# (this model). Note that this rebinds `test`, which earlier held a slice of
# the training frame.
y_best = pd.read_csv('submissions/best_model.csv')
test = y_best.merge(sub, on='ID', how='inner')
test.corr(method='pearson')

Unnamed: 0,ID,TARGET_x,TARGET_y
ID,1.0,0.063695,-0.146195
TARGET_x,0.063695,1.0,0.525778
TARGET_y,-0.146195,0.525778,1.0
