# Approach 3: MAB - testing

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from tqdm import tqdm

In [3]:
# Fixate the global random seed
SEED = 0
np.random.seed(SEED)

## Training

###  Data Preprocessing

In [4]:
TEST_PATH = 'data/test.csv'

df = pd.read_csv(TEST_PATH)

In [5]:
print(f"> Size of train df: %d" % len(df["rating"]))
print("> Quick visual check of dataframe:") 
df.head()

> Size of train df: 9430
> Quick visual check of dataframe:


Unnamed: 0,user_id,item_id,rating,user_age,user_gender,user_occupation,user_zip,release_date,genre
0,1,20,4,24,M,technician,Arizona,1995.0,Drama
1,1,33,4,24,M,technician,Arizona,1995.0,Action
2,1,61,4,24,M,technician,Arizona,1994.0,Drama
3,1,117,3,24,M,technician,Arizona,1996.0,Action
4,1,155,2,24,M,technician,Arizona,1987.0,Musical


In [6]:
df['liked'] = df['rating'] >= 4 
df.drop(
    columns=['rating'], 
    inplace=True
    )

In [7]:
df_liked = df[df['liked']==True]
df_disliked = df[df['liked']==False]

In [8]:
like_counts = df_liked.value_counts(subset=['item_id']).to_frame().reset_index()
total_counts = df.value_counts(subset=['item_id']).to_frame().reset_index()
# dislike_counts = df_disliked.value_counts(subset=['item_id']).to_frame().reset_index()


In [9]:
merged_counts = pd.merge(like_counts, total_counts, how='inner', on="item_id", suffixes=["_like","_total"])
# merged_counts = pd.merge(like_counts,dislike_counts, how='outer', on="item_id")
merged_counts["ratio"] = merged_counts["count_like"] / merged_counts["count_total"]
merged_counts.sort_values("ratio",ascending=False)
merged_counts

Unnamed: 0,item_id,count_like,count_total,ratio
0,50,73,88,0.829545
1,258,68,97,0.701031
2,313,57,72,0.791667
3,127,53,61,0.868852
4,100,52,65,0.800000
...,...,...,...,...
874,1592,1,2,0.500000
875,1612,1,1,1.000000
876,1617,1,1,1.000000
877,1646,1,1,1.000000


### Question: How should we treat items in the test set, that only received likes? What is their like-ratio?

In [13]:
merged_counts["ratio"] = merged_counts['count_x'] / merged_counts['count_y']
merged_counts.sort_values(
    by="ratio",
    ascending=False,
    inplace=True
)
merged_counts

Unnamed: 0,item_id,count_x,count_y,ratio
113,124,21.0,1.0,21.0
449,479,13.0,1.0,13.0
74,83,12.0,1.0,12.0
179,192,12.0,1.0,12.0
165,178,11.0,1.0,11.0
...,...,...,...,...
1124,1646,1.0,,
1125,1653,1.0,,
1126,1656,,1.0,
1127,1662,,1.0,


In [20]:
print("Top Five item-ids")
merged_counts.reset_index()[0:5].item_id.to_list()

Top Five item-ids


[124, 479, 83, 192, 178]