In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from scipy import stats

from scripts.model_pool import test_model
from scripts.utils import score_models

# Default (width, height) applied to every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = (12, 4)
# Reload edited modules (e.g. the project's `scripts` package) automatically
# before each cell executes, so helper changes are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Weather data sources this notebook can be run against. The selected name is
# interpolated into the input and output file paths further down.
LIST = ['ERA5', 'METEO-FRANCE']

# Select the source by name rather than by position, so that reordering or
# extending LIST cannot silently switch the dataset being analysed.
WEATHER_SOURCE = 'METEO-FRANCE'
if WEATHER_SOURCE not in LIST:
    raise ValueError(
        f"Unknown weather source {WEATHER_SOURCE!r}; expected one of {LIST}"
    )

In [None]:
# Load the per-vineyard feature table for the selected weather source.
# The first two CSV rows form a two-level column header and the first CSV
# column is used as the row index.
df_test = pd.read_csv(
    f'data/features/features-{WEATHER_SOURCE}_per_vineyard.csv',
    header=[0, 1],
    index_col=0,
)

In [None]:
# Distinct labels of each column level of df_test: level 0 is taken as the
# vineyard list, level 1 as the available feature names.
VINEYARDS, features = (
    df_test.columns.get_level_values(level).unique() for level in (0, 1)
)

In [None]:
# Accumulator for the output of every model run, keyed by a short model label.
dict_results = dict()

# Vintage bounds handed to test_model for every run.
first_vintage, last_vintage = 1960, 2017

# Name of the target passed to test_model.
target_variable = '0 - Price'

# Default explanatory variables for the OLS, tree and SVR cells below
# (presumably level-1 column labels of df_test; verify against the feature file).
predictors = [
    'P: flowering',
    'DTR: véraison - harvest',
    'WD: flowering - harvest',
]

# OLS

In [None]:
# Ordinary least squares on the default predictors.
# The output of test_model is stored under the "OLS" key and concatenated with
# the other runs in the scoring cell at the end of the notebook.
model = LinearRegression()
ols_output = test_model(
    VINEYARDS, model, "OLS", df_test,
    target_variable, predictors,
    first_vintage, last_vintage,
)
dict_results["OLS"] = ols_output

# Tree-based models: Decision Tree, Random Forest, Gradient Boosting

In [None]:
# Fixed seed so the tree-based estimators give the same results on every
# Restart & Run All (they are stochastic when random_state is left unset).
RANDOM_STATE = 42

# result key -> (display name passed to test_model, estimator)
tree_models = {
    "DT": (
        "Decision Tree",
        DecisionTreeRegressor(max_depth=3, random_state=RANDOM_STATE),
    ),
    "RF": (
        "Random Forest",
        RandomForestRegressor(n_estimators=20, max_depth=3, random_state=RANDOM_STATE),
    ),
    "GB": (
        "Gradient Boosting",
        GradientBoostingRegressor(n_estimators=20, random_state=RANDOM_STATE),
    ),
}

# Run every tree model through the same call as the other cells, using the
# shared target_variable instead of a duplicated "0 - Price" literal.
for result_key, (model_name, model) in tree_models.items():
    dict_results[result_key] = test_model(
        VINEYARDS,
        model,
        model_name,
        df_test,
        target_variable,
        predictors,
        first_vintage,
        last_vintage,
    )

# SVR

In [None]:
# Support vector regression with two kernels, run in the same order as before.
# Each entry is (result key, display name passed to test_model, SVR kernel).
svr_runs = (
    ("SVR RBF", "SVR rbf", "rbf"),
    ("SVR sigmoid", "SVR sigmoid", "sigmoid"),
)

for result_key, model_name, kernel in svr_runs:
    model = SVR(kernel=kernel)
    dict_results[result_key] = test_model(
        VINEYARDS, model, model_name, df_test,
        target_variable, predictors,
        first_vintage, last_vintage,
    )

# Classical Ashenfelter model

Parameters used by Ashenfelter in the 2008 paper *Predicting the Quality and Prices of Bordeaux Wine*:
- Growing season temperature (April to September)
- Winter rain (October to March)
- August rain

In [None]:
# Ashenfelter's three explanatory variables. They get their own name so the
# shared `predictors` list used by the earlier model cells is not overwritten;
# re-running one of those cells after this one would otherwise silently switch
# its feature set.
ashenfelter_predictors = ["Winter Rain", "August Rain", "Growing Season Tm"]

model = LinearRegression()
dict_results["Ashenfelter OLS"] = test_model(
    VINEYARDS,
    model,
    "Ashenfelter OLS",
    df_test,
    target_variable,
    ashenfelter_predictors,
    first_vintage,
    last_vintage,
)

# Quadratic temperature model

In [None]:
# Linear and squared growing-season temperature terms. They get their own name
# so the shared `predictors` list used by the earlier model cells is not
# overwritten, which keeps those cells safe to re-run in any order.
quadratic_predictors = ["Growing Season Tm", "Sq Growing Season Tm"]

model = LinearRegression()
dict_results["Quadratic OLS"] = test_model(
    VINEYARDS,
    model,
    "Quadratic OLS",
    df_test,
    target_variable,
    quadratic_predictors,
    first_vintage,
    last_vintage,
)

In [None]:
# Stack the outputs of every model run collected in dict_results.
df_results = pd.concat(list(dict_results.values()))

# Target column of every vineyard, with the feature level dropped so that the
# remaining columns are labelled by vineyard only. Uses the shared
# target_variable rather than repeating the "0 - Price" literal.
df_prices = df_test.loc[:, (VINEYARDS, target_variable)].droplevel(1, axis=1)

# Bounds handed to score_models (presumably the first and last vintage of the
# scoring period; confirm against scripts.utils.score_models).
first_scored_vintage = 1994
last_scored_vintage = 2013
scores = score_models(df_results, df_prices, first_scored_vintage, last_scored_vintage)

# Median of each model's score, grouped by the "Test variable" label.
models = scores.columns
scores_median = scores.reset_index().groupby("Test variable")[models].median()
display(scores_median)

# NOTE(review): writing the legacy ".xls" format needs the xlwt engine, which
# recent pandas releases no longer ship; if this call fails with "No engine for
# filetype", switch the extension to ".xlsx" and update any downstream reader.
scores_median.reset_index().to_excel(
    f"model_outputs/scores/model_pool_predictions-{WEATHER_SOURCE}_per_vineyard.xls",
    index=False,
)

### Welch's t-test

In [None]:
# Welch's t-test (unequal variances) comparing the R2 scores of the OLS model
# with those of the Quadratic OLS model; the result is shown as the cell output.
stats.ttest_ind(
    scores.loc['R2', 'OLS'],
    scores.loc['R2', 'Quadratic OLS'],
    equal_var=False,
)

---

# End of notebook