In [1]:
import numpy  as np
import pandas as pd
from collections import defaultdict
from metrics import MSE
from tqdm import tqdm

In [2]:
X_train = pd.read_pickle("../Datasets/final/X_train.pkl").drop(columns=["gPlusUserId", "gPlusPlaceId"])
c = X_train.columns.tolist()
X_val = pd.read_pickle("../Datasets/final/X_val.pkl").drop(columns=["gPlusUserId", "gPlusPlaceId"])
X_test = pd.read_pickle("../Datasets/final/X_test.pkl").drop(columns=["gPlusUserId", "gPlusPlaceId"])
y_train = pd.read_pickle("../Datasets/final/y_train.pkl").to_numpy()
y_val = pd.read_pickle("../Datasets/final/y_val.pkl").to_numpy()
y_test = pd.read_pickle("../Datasets/final/y_test.pkl").to_numpy()

In [3]:
X_train.insert(X_train.shape[1], "rating", y_train.T[0])

In [4]:
month_columns = [i for i in c if "month" in i]
year_columns = [i for i in c if "year" in i]

In [5]:
time_columns = []
for i in year_columns:
    for j in month_columns:
        time_columns.append([j, i])

In [6]:
mean_per_month_dict = {}
for month in month_columns:
    month_df = X_train.groupby(month).agg({"rating": "mean"}).reset_index()
    mean_per_month_dict[month] = month_df[month_df[month] == 1]["rating"].item()

In [7]:
mean_per_year_dict = {}
for year in year_columns:
    year_df = X_train.groupby(year).agg({"rating": "mean"}).reset_index()
    mean_per_year_dict[year] = year_df[year_df[year] == 1]["rating"].item()

In [8]:
mean_per_time_dict = defaultdict(dict)
for time in time_columns:
    time_df = X_train.groupby(time).agg({"rating": "mean"}).reset_index()
    res = time_df[(time_df[time[0]] == 1) & (time_df[time[1]] == 1)]["rating"]
    if len(res) > 0:
        mean_per_time_dict[time[0]][time[1]] = res.item()

In [9]:
mean_per_time_dict

defaultdict(dict,
            {'month_12': {'year_1990': 4.212534059945504,
              'year_2003': 3.0,
              'year_2004': 4.0285714285714285,
              'year_2005': 3.588235294117647,
              'year_2006': 4.017241379310345,
              'year_2007': 3.84,
              'year_2008': 3.7169811320754715,
              'year_2009': 3.6231060606060606,
              'year_2010': 3.962557603686636,
              'year_2011': 3.7546524876566654,
              'year_2012': 3.9174698410672115,
              'year_2013': 3.6263898273849744},
             'month_04': {'year_2002': 4.0,
              'year_2003': 3.6,
              'year_2004': 3.75,
              'year_2005': 3.9109311740890687,
              'year_2006': 3.7641509433962264,
              'year_2007': 3.7511111111111113,
              'year_2008': 3.8005540166204987,
              'year_2009': 3.778745644599303,
              'year_2010': 3.781523096129838,
              'year_2011': 3.867855270057682,
   

In [10]:
y_pred_val = []
for i in tqdm(range(len(X_val))):
    res_month = None
    res_year = None
    
    for j in range(len(X_val.iloc[i][month_columns])):
        if X_val.iloc[i][month_columns][j] == 1:
            res_month = month_columns[j]
            break
            
    for k in range(len(X_val.iloc[i][year_columns])):
        if X_val.iloc[i][year_columns][k] == 1:
            res_year = year_columns[k]
            break
            
    if res_year not in mean_per_time_dict[res_month]:
        month_avg = mean_per_month_dict[res_month] 
        year_avg = mean_per_year_dict[res_year]
        y_pred_val.append(max(month_avg, year_avg))
    else:
        y_pred_val.append(mean_per_time_dict[res_month][res_year])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 161526/161526 [07:57<00:00, 338.15it/s]


In [11]:
MSE(predictions=y_pred_val, labels=y_val)[0]

1.6805897772228888

In [13]:
y_pred = []
for i in tqdm(range(len(X_test))):
    res_month = None
    res_year = None
    
    for j in range(len(X_test.iloc[i][month_columns])):
        if X_test.iloc[i][month_columns][j] == 1:
            res_month = month_columns[j]
            break
            
    for k in range(len(X_test.iloc[i][year_columns])):
        if X_test.iloc[i][year_columns][k] == 1:
            res_year = year_columns[k]
            break
            
    if res_year not in mean_per_time_dict[res_month]:
        month_avg = mean_per_month_dict[res_month] 
        year_avg = mean_per_year_dict[res_year]
        y_pred.append(max(month_avg, year_avg))
    else:
        y_pred.append(mean_per_time_dict[res_month][res_year])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 161526/161526 [08:01<00:00, 335.64it/s]


In [14]:
MSE(predictions=y_pred, labels=y_test)[0]

1.6616142544726031