In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

from scripts.utils import save_fig, standard_scale


%load_ext autoreload
%autoreload 2

In [None]:
# Load the per-vineyard feature table. The CSV carries a two-row header, so the
# columns come back as a two-level MultiIndex; the first CSV column becomes the
# row index.
FEATURES_CSV = "data/features/features-METEO-FRANCE_per_vineyard.csv"
df = pd.read_csv(FEATURES_CSV, header=[0, 1], index_col=0)
display(df.head())

# Build X and y

In [None]:
# Distinct labels of the outer column level, in order of first appearance.
# NOTE(review): presumably one label per vineyard — confirm against the CSV header.
VINEYARDS = df.columns.get_level_values(0).unique()

In [None]:
# Split the table into two periods by index label. `.loc` label slices include
# both end points.
EARLY_PERIOD = slice(1960, 1993)
LATE_PERIOD = slice(1994, 2013)

df_early = df.loc[EARLY_PERIOD]
df_late = df.loc[LATE_PERIOD]

In [None]:
# Feature names are the inner column level; "0 - Price" is the target, not a
# feature. dict.fromkeys() de-duplicates while keeping the order of the CSV
# header. A plain set() iterates strings in a hash-randomised order, so the
# column order of X would change from one kernel restart to the next.
ALL_FEATURES = list(
    dict.fromkeys(
        feature
        for feature in df_late.columns.droplevel(0)
        if feature != "0 - Price"
    )
)


def _stack_scaled_features(period_df):
    """Build the design matrix for one period.

    For every entry of VINEYARDS the ALL_FEATURES columns are selected, a
    "Year" column is added from the row index, and the block is passed through
    `standard_scale` (project helper; presumably per-column standardisation —
    TODO confirm). The per-vineyard blocks are then stacked row-wise.

    Parameters
    ----------
    period_df : pandas.DataFrame
        A row slice of the full table, with (vineyard, feature) columns.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of the scaled blocks, in VINEYARDS order.
    """
    blocks = []
    for vineyard in VINEYARDS:
        # Select first, copy last: the "Year" assignment then lands on an
        # explicit, independent copy rather than on a derived selection.
        features_vineyard = period_df.loc[:, (vineyard,)][ALL_FEATURES].copy()
        features_vineyard["Year"] = features_vineyard.index
        blocks.append(standard_scale(features_vineyard))
    return pd.concat(blocks)


X_late = _stack_scaled_features(df_late)
X_early = _stack_scaled_features(df_early)

In [None]:
def _stack_scaled_prices(period_df):
    """Build the single-column target for one period.

    Takes the '0 - Price' column of every vineyard, subtracts each column's
    mean, passes each vineyard's column through `standard_scale`, renames it
    to 'price', and stacks the results row-wise in VINEYARDS order.
    """
    price_table = period_df.loc[:, (slice(None), '0 - Price')]
    # Keep only the outer level so that columns are addressed by vineyard.
    price_table.columns = price_table.columns.droplevel(1)
    centred = price_table - price_table.mean(axis=0)

    pieces = []
    for name in VINEYARDS:
        piece = standard_scale(centred[[name]])
        piece.columns = ['price']
        pieces.append(piece)
    return pd.concat(pieces, axis=0)


y_early = _stack_scaled_prices(df_early)
y_late = _stack_scaled_prices(df_late)

In [None]:
# `sklearn.feature_selection` does not export a `p_values` name; the p-values
# are the second return value of `f_regression`. `r_regression` is imported
# here as well because a later cell calls it.
from sklearn.feature_selection import f_regression, r_regression

# Univariate linear-regression test of every feature against the late-period
# price: f_regression returns one F-statistic and one p-value per column.
f_statistic, p_values = f_regression(X_late, y_late.iloc[:, 0])
coefs = pd.DataFrame(
    {"F-statistic": f_statistic, "p-value": p_values},
    index=X_late.columns,  # was `X.columns`, but no `X` is ever defined
)
# Show the ten features with the largest statistic.
display(coefs.loc[coefs["F-statistic"].abs().nlargest(10).index])

In [None]:
from scipy.stats import pearsonr

# Correlation (and its p-value) between two late-period feature columns.
first_feature = X_late["WD: flowering - harvest"]
second_feature = X_late["P: véraison - harvest"]
pearsonr(first_feature, second_feature)

In [None]:
# Imported in this cell so it does not depend on names left behind by earlier
# cells (`p_values` is not something sklearn exports).
from sklearn.feature_selection import f_regression, r_regression

target_late = y_late.iloc[:, 0]

# Pearson correlation of every feature with the late-period price ...
pearson_r = r_regression(X_late, target_late)
# ... and the matching p-values: f_regression derives its F-test from the same
# per-feature correlation, so its p-values belong to these coefficients.
_, p_values = f_regression(X_late, target_late)

coefs = pd.DataFrame(
    {"PearsonR": pearson_r, "p-value": p_values},
    index=X_late.columns,  # was `X.columns`, but no `X` is ever defined
)

# Keep the ten features with the strongest correlation, whatever its sign.
ordered_coefs = coefs.loc[coefs["PearsonR"].abs().nlargest(10).index]
# NOTE(review): writing legacy .xls requires the xlwt engine, which recent
# pandas releases no longer support — consider switching to an .xlsx path.
ordered_coefs.to_excel("a.xls")


---
# End of notebook
