In [None]:
import pandas as pd

# Load the prepared feature table and discard the index column that was
# written out by an earlier `to_csv` call.
params = pd.read_csv('final_features.csv')
params = params.drop(columns=['Unnamed: 0'])
# Each row is kept exactly once. Stacking copies of the frame would place
# identical rows on both sides of the shuffled train/test folds used in the
# cross-validation further down, leaking test targets into training and
# inflating the reported scores.

In [None]:
# Show every column name as a plain Python list.
params.columns.tolist()

In [None]:
# Hand-picked predictor columns.
features = ['price', 'availability_30', 'number_of_reviews', 'first_review']

# Every review-score column; all of them share the 'review_scores_' prefix.
targets = ['review_scores_' + aspect for aspect in (
    'rating',
    'value',
    'checkin',
    'accuracy',
    'location',
    'cleanliness',
    'communication',
)]

# The single score column used as the target in the cells that follow.
target = 'review_scores_value'

params

In [None]:
from matplotlib import pyplot as plt

# Box plot of the target column, used to eyeball outliers. With
# return_type='dict' pandas hands back the matplotlib artists of the plot.
outlier_plot = params.boxplot(
    column=target,
    return_type='dict',
)

In [None]:
def boxplot_fill(col, whis=1.5):
    """Blank out the values of ``col`` that fall outside its box-plot whiskers.

    The whiskers are placed ``whis`` interquartile ranges below the first
    quartile and above the third quartile. Values strictly outside that
    interval are replaced by NaN; values on or inside the whiskers, and
    values that are already NaN, are returned unchanged.

    Parameters
    ----------
    col : pandas.Series
        Numeric series to filter.
    whis : float, default 1.5
        Whisker length as a multiple of the interquartile range.

    Returns
    -------
    pandas.Series
        Series with the same index as ``col`` and outliers set to NaN.
    """
    q1 = col.quantile(0.25)
    q3 = col.quantile(0.75)
    iqr = q3 - q1
    l_th = q1 - whis * iqr
    u_th = q3 + whis * iqr

    # Vectorised filter: `between` is inclusive on both ends, so values
    # sitting exactly on a whisker are kept; everything else becomes NaN.
    return col.where(col.between(l_th, u_th))


# Preview the target's distribution once the outliers have been blanked out.
params[target].pipe(boxplot_fill).hist()

In [None]:
# Overwrite the target column with its outlier-filtered version.
params[target] = params[target].pipe(boxplot_fill)

In [None]:
# Remove, in place, every row that has a missing value in any column.
params.dropna(axis=0, how='any', inplace=True)
params

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.dummy import DummyRegressor
import numpy as np
from sklearn.model_selection import KFold

# Alternative feature set: x = params[features]
# NOTE(review): positional slice -- presumably the first 7 columns are the
# review-score columns and everything after them is a predictor; confirm
# against the actual column order.
x = params.iloc[:, 7:]
y = params.loc[:, target]

# Cross-validated metrics of the ridge model, one entry per value in c_range.
mean_error = []
std_error = []
r2_scores = []
# The same metrics for the DummyRegressor baseline.
base_mean_error = []
base_std_error = []
base_r2_scores = []

accuracy = []
c_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
# c_range = [50, 100, 1000, 10000]

# A single seeded splitter is shared by every C so that all candidates and
# the baseline are scored on identical folds and reruns give the same numbers.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

for c in c_range:
    # Ridge penalty expressed through C: alpha = 1 / (2C).
    model_c = Ridge(alpha=1 / (2 * c))
    baseline = DummyRegressor()
    temp = []
    temp_base = []
    temp_r2 = []
    temp_base_r2 = []
    for train, test in kf.split(x):
        x_train, y_train = x.iloc[train], y.iloc[train]
        x_test, y_test = x.iloc[test], y.iloc[test]

        model_c.fit(x_train, y_train)
        y_pred = model_c.predict(x_test)
        temp.append(mean_squared_error(y_test, y_pred))
        temp_r2.append(r2_score(y_test, y_pred))

        baseline.fit(x_train, y_train)
        y_base = baseline.predict(x_test)
        temp_base.append(mean_squared_error(y_test, y_base))
        temp_base_r2.append(r2_score(y_test, y_base))

    r2_scores.append(np.mean(temp_r2))
    mean_error.append(np.mean(temp))
    std_error.append(np.std(temp))

    base_r2_scores.append(np.mean(temp_base_r2))
    base_mean_error.append(np.mean(temp_base))
    base_std_error.append(np.std(temp_base))

# Best values over all C. They are derived from the collected results instead
# of being tracked from fixed starting values, which would silently cap the
# reported extremes at those starting values.
max_r2_score = max(r2_scores, default=-np.inf)
base_max_r2_score = max(base_r2_scores, default=-np.inf)
min_mean_error = min(mean_error, default=np.inf)
base_min_mean_error = min(base_mean_error, default=np.inf)
min_std_error = min(std_error, default=np.inf)
base_min_std_error = min(base_std_error, default=np.inf)

print('Max R2 score:', max_r2_score)
print('Max baseline R2 score:', base_max_r2_score)
print('Min mean error: ', min_mean_error)
print('Min baseline mean error: ', base_min_mean_error)
print('Min std error: ', min_std_error)
print('Min baseline std error: ', base_min_std_error)

# Mean R2 across folds as a function of C.
plt.figure()
plt.plot(c_range, r2_scores)
plt.xlabel('Ci')
plt.ylabel('R2 score')

# Mean MSE across folds as a function of C; error bars show the fold-to-fold
# standard deviation.
plt.figure()
plt.errorbar(c_range, mean_error, yerr=std_error)
plt.xlabel('Ci')
plt.ylabel('Mean square error')
plt.show()