In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
from tqdm import tqdm
# Seed Python's built-in `random` module so any use of it is repeatable.
# NOTE(review): only `random` is seeded here; numpy and tensorflow are not.
random.seed(0)

# Apply the "fivethirtyeight" matplotlib style sheet to every figure below.
plt.style.use('fivethirtyeight')
import warnings
# NOTE(review): this silences *all* warnings for the rest of the notebook,
# including pandas diagnostics such as SettingWithCopyWarning.
warnings.filterwarnings('ignore')
%matplotlib inline

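Download the MovieLens 100K dataset from:
https://grouplens.org/datasets/movielens/

Extract it so that the files are available under `ml-100k/` (for example `ml-100k/u.user`), which is where the cells below read them from.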

# user context extraction

In [None]:
# Load the user table (pipe-delimited, no header row) and name its five columns.
user_columns = ["user_id", "age", "gender", "occupation", "zipcode"]
user = pd.read_csv("ml-100k/u.user", sep="|", header=None)
user.columns = user_columns

# zipcode is not used by any later cell, so discard it right away.
user = user.drop(columns=["zipcode"])

In [None]:
# Bucket each user's age into a labelled age group.
#
# The labels describe left-closed ranges (e.g. '20-29' means 20 <= age < 30),
# so the bins must be built with right=False. pd.cut's default (right=True)
# produces (0, 20], (20, 30], ... which would file a 20-year-old under '<19'
# and a 30-year-old under '20-29'.
bins = [0, 20, 30, 40, 50, np.inf]
# '<19' covers ages 0-19 inclusive; the label text is kept unchanged.
names = ['<19', '20-29', '30-39', '40-49', '50+']

user['agegroup'] = pd.cut(user['age'], bins=bins, labels=names, right=False)

# The raw age is no longer needed once the categorical group exists.
# NOTE: this makes the cell non-idempotent -- re-running it without re-loading
# `user` fails because the 'age' column is already gone.
user = user.drop(["age"], axis=1)
user[['user_id', 'gender', 'agegroup']].head()

In [None]:
# Show how many users fall into each age group.
# Summing the frame per group is not meaningful here: it adds up user_id
# values and tries to aggregate the string columns (gender, occupation).
# The group size is the informative summary.
user.groupby('agegroup').size()

In [None]:
# One-hot encode the categorical user context (gender and age group).
# drop_first=True omits the first level of each variable from the dummies.
context_columns = ['gender', 'agegroup']
user_dummies = pd.get_dummies(user[context_columns], drop_first=True)

# Re-attach the user identifier alongside the encoded features.
user_features = user_dummies.assign(user_id=user['user_id'])
user_features['user_id'].shape

# movie context extraction

In [None]:
# Load the movie table: pipe-delimited, no header row, latin-1 encoded.
movie = pd.read_csv("ml-100k/u.item", sep="|", header=None, encoding='latin-1')

# Column names: five descriptive fields followed by nineteen genre columns.
movie_info_columns = [
    "movie_id", "movie_title", "release_date", "video_release_date", "IMDb_URL",
]
genre_columns = [
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
movie.columns = movie_info_columns + genre_columns
movie

In [None]:
# Keep only movie_id and the named genre columns: drop the descriptive
# fields and the catch-all "unknown" genre.
non_feature_columns = ["movie_title", "release_date", "video_release_date", "IMDb_URL", "unknown"]
movie_features = movie.drop(columns=non_feature_columns)

In [None]:
# Flag the genres that are attached to more than 10% of all movies.
# The threshold is derived from the actual row count rather than a hard-coded
# 1682, and movie_id is excluded because summing identifiers is meaningless
# (it would always show up as True).
genre_counts = movie_features.drop(columns=["movie_id"]).sum()
genre_counts > (len(movie_features) * 0.1)

In [None]:
# Narrow the movie features down to the identifier plus four genres.
# NOTE(review): presumably these are the genres that passed the >10% check
# in the previous cell -- verify against its output.
selected_columns = ['movie_id', 'Comedy', 'Drama', 'Romance', 'Thriller']
movie_features = movie_features[selected_columns]
movie_features

# Rating

In [None]:
# Load the ratings table (tab-delimited, no header row).
rating_columns = ["user_id", "movie_id", "rating", "timestamp"]
data = pd.read_csv("ml-100k/u.data", sep="\t", header=None, names=rating_columns)

# The timestamp is not used by any later cell.
data = data.drop(columns=["timestamp"])
data

In [None]:
# Number of most-rated movies to keep.
k = 2

# Count the ratings per movie, take the k most-rated movies and report how
# many ratings they account for in total.
ratings_per_movie = data.groupby("movie_id").count()
most_rated = ratings_per_movie.sort_values("user_id", ascending=False).head(k)
most_rated["rating"].sum()

In [None]:
# IDs of the k movies with the most ratings, as a Series indexed 0..k-1.
rating_counts = data.groupby("movie_id").count()
top_k_by_count = rating_counts.sort_values("user_id", ascending=False).head(k)
top_movies_index = top_k_by_count.reset_index()["movie_id"]
top_movies_index

In [None]:
# Genre features restricted to the top-k movies; show (rows, columns).
is_top_movie = movie_features.movie_id.isin(top_movies_index)
top_movies_features = movie_features[is_top_movie]
top_movies_features.shape

In [None]:
# Display the genre-feature rows of the selected top-k movies.
top_movies_features

# Making top K movie dataset

In [None]:
# Keep only the ratings that were given to one of the top-k movies.
rated_top_movie = data["movie_id"].isin(top_movies_index)
reward_history = data[rated_top_movie]

# Report the size of the filtered set, then preview its first rows.
print(reward_history.shape)
reward_history.head()

In [None]:
# Bar chart of how often each rating value occurs among the top-k movies.
sns.countplot(data=reward_history, x="rating")
plt.show()

In [None]:
# Derive a binary reward: 1 when the rating is 5, otherwise 0.
#
# reward_history was produced by boolean-filtering `data`, so assigning a new
# column to it directly is a write to a slice (pandas raises
# SettingWithCopyWarning, which the notebook-wide warning filter hides).
# .assign() builds a new frame instead, and the index is renumbered 0..n-1.
reward_history = (
    reward_history
    .assign(reward=np.where(reward_history["rating"] < 5, 0, 1))
    .reset_index(drop=True)
)

In [None]:
# Two side-by-side views of the reward per movie:
#   left  - mean reward for each movie
#   right - count of reward=0 / reward=1 ratings for each movie
# The data comes from `reward_history`; the previously referenced
# `filtered_data_original` is never defined in this notebook and raised a
# NameError on a fresh kernel.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
reward_history[['movie_id', 'reward']].groupby(['movie_id']).mean().plot.bar(ax=ax[0])
ax[0].set_title('reward vs movie_id')
sns.countplot(x=reward_history["movie_id"], hue=reward_history["reward"], ax=ax[1])
ax[1].set_title('movie_id:reward')
plt.show()

In [None]:
# Mean reward per movie as a flat table with movie_id and reward columns.
reward_history.groupby('movie_id', as_index=False)['reward'].mean()

In [None]:
# Persist the reward history so other notebooks can load it.
# The frame's index is written as the first (unnamed) CSV column.
# NOTE(review): the file name says "top10" while k is set to 2 earlier in this
# notebook -- confirm which one is intended before changing either.
output_path = 'reward_history_top10.csv'
reward_history.to_csv(output_path)