In [1]:
import numpy  as np
import pandas as pd
from collections import defaultdict
from metrics import MAE
from tqdm import tqdm

In [2]:
X_train = pd.read_pickle("../../../Datasets/final/X_train.pkl")
c = X_train.columns.tolist()
X_val = pd.read_pickle("../../../Datasets/final/X_val.pkl")
X_test = pd.read_pickle("../../../Datasets/final/X_test.pkl")
y_train = pd.read_pickle("../../../Datasets/final/y_train.pkl").to_numpy()
y_val = pd.read_pickle("../../../Datasets/final/y_val.pkl").to_numpy()
y_test = pd.read_pickle("../../../Datasets/final/y_test.pkl").to_numpy()

In [3]:
X_train.insert(X_train.shape[1], "rating", y_train.T[0])

In [4]:
month_columns = [i for i in c if "month" in i]
year_columns = [i for i in c if "year" in i]

In [5]:
time_columns = []
for i in year_columns:
    for j in month_columns:
        time_columns.append([j, i])

In [6]:
mean_per_month_dict = {}
for month in month_columns:
    month_df = X_train.groupby([month, "gPlusPlaceId"]).agg({"gPlusUserId": "count"}).reset_index()
    popular_place = month_df[month_df[month] == 1].sort_values("gPlusUserId", ascending=False).iloc[0]["gPlusPlaceId"]
    mean_per_month_dict[month] = X_train[(X_train[month] == 1) & (X_train["gPlusPlaceId"] == popular_place)]["rating"].mean()

In [7]:
mean_per_year_dict = {}
for year in year_columns:
    year_df = X_train.groupby([year, "gPlusPlaceId"]).agg({"gPlusUserId": "count"}).reset_index()
    popular_place = year_df[year_df[year] == 1].sort_values("gPlusUserId", ascending=False).iloc[0]["gPlusPlaceId"]
    mean_per_year_dict[year] = X_train[(X_train[year] == 1) & (X_train["gPlusPlaceId"] == popular_place)]["rating"].mean()

In [8]:
mean_per_time_dict = defaultdict(dict)
for time in time_columns:
    time_df = X_train.groupby([*time, "gPlusPlaceId"]).agg({"gPlusUserId": "count"}).reset_index()
    popular_place = time_df[(time_df[time[0]] == 1) & (time_df[time[1]] == 1)].sort_values(
        "gPlusUserId", ascending=False)
    if len(popular_place) > 0:
        popular_place = popular_place.iloc[0]["gPlusPlaceId"]
        res = X_train[(X_train[time[0]] == 1) & (X_train[time[1]] == 1) & (X_train["gPlusPlaceId"] == popular_place)]["rating"].mean()
        mean_per_time_dict[time[0]][time[1]] = res
            

In [9]:
mean_per_time_dict

defaultdict(dict,
            {'month_12': {'year_1990': 4.2,
              'year_2003': 3.0,
              'year_2004': 3.5,
              'year_2005': 3.5,
              'year_2006': 4.5,
              'year_2007': 5.0,
              'year_2008': 4.333333333333333,
              'year_2009': 3.0,
              'year_2010': 4.666666666666667,
              'year_2011': 4.75,
              'year_2012': 4.857142857142857,
              'year_2013': 4.333333333333333},
             'month_04': {'year_2002': 4.0,
              'year_2003': 2.0,
              'year_2004': 5.0,
              'year_2005': 3.3333333333333335,
              'year_2006': 5.0,
              'year_2007': 3.5,
              'year_2008': 4.75,
              'year_2009': 4.5,
              'year_2010': 3.5,
              'year_2011': 5.0,
              'year_2012': 3.6666666666666665,
              'year_2013': 2.5},
             'month_06': {'year_2002': 5.0,
              'year_2004': 4.0,
              'year_2005

In [10]:
y_pred_val = []
for i in tqdm(range(len(X_val))):
    res_month = None
    res_year = None
    
    for j in range(len(X_val.iloc[i][month_columns])):
        if X_val.iloc[i][month_columns][j] == 1:
            res_month = month_columns[j]
            break
            
    for k in range(len(X_val.iloc[i][year_columns])):
        if X_val.iloc[i][year_columns][k] == 1:
            res_year = year_columns[k]
            break
            
    if res_year not in mean_per_time_dict[res_month]:
        month_avg = mean_per_month_dict[res_month] 
        year_avg = mean_per_year_dict[res_year]
        y_pred_val.append(max(month_avg, year_avg))
    else:
        y_pred_val.append(mean_per_time_dict[res_month][res_year])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 161526/161526 [08:20<00:00, 322.59it/s]


In [11]:
MAE(predictions=y_pred_val, labels=y_val)[0]

1.2039334621735212

In [12]:
y_pred = []
for i in tqdm(range(len(X_test))):
    res_month = None
    res_year = None
    
    for j in range(len(X_test.iloc[i][month_columns])):
        if X_test.iloc[i][month_columns][j] == 1:
            res_month = month_columns[j]
            break
            
    for k in range(len(X_test.iloc[i][year_columns])):
        if X_test.iloc[i][year_columns][k] == 1:
            res_year = year_columns[k]
            break
            
    if res_year not in mean_per_time_dict[res_month]:
        month_avg = mean_per_month_dict[res_month] 
        year_avg = mean_per_year_dict[res_year]
        y_pred.append(max(month_avg, year_avg))
    else:
        y_pred.append(mean_per_time_dict[res_month][res_year])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 161526/161526 [08:02<00:00, 334.91it/s]


In [13]:
MAE(predictions=y_pred, labels=y_test)[0]

1.198020007462217