# Frappe dataset

## Import

In [None]:
import pandas as pd
import numpy as np
import os.path

## Load dataset

In [None]:
# Read the two tab-separated source files: the interaction log and the
# per-app metadata table.
left = pd.read_csv('Datasets/frappe dataset/frappe.csv', sep="\t")
right = pd.read_csv('Datasets/frappe dataset/meta.csv', sep="\t")

# Inner-join on the app id so every interaction row also carries the app's
# category and language; the other metadata columns are not brought over.
df = left.merge(right[['item', 'category', 'language']], on=["item"])
df

## Dataset preprocessing
Context features:
- daytime
- weekday
- isweekend
- weather

In [None]:
# Remove, in place, the columns that the rest of the preprocessing does not use.
df.drop(columns=['homework', 'city', 'country'], inplace=True)

In [None]:
# Drop every row that holds the placeholder string 'unknown' in any column.
# `axis` must be passed by keyword: the positional form `any(1)` was
# deprecated in pandas 1.5 and raises a TypeError on pandas 2.x.
df = df[~df.eq('unknown').any(axis=1)]
df = df.reset_index(drop=True)

# Re-encode user and item ids as consecutive integers starting from 0,
# numbered in order of first appearance.
df['user'] = pd.factorize(df['user'])[0]
df['item'] = pd.factorize(df['item'])[0]

print(f'n user: {df.user.nunique()} \t n item: {df.item.nunique()} \t n row: {len(df)}')

# Report the range and mean of the raw usage count before it is binarized.
print(f'min rating: {df.cnt.min()} \t max rating: {df.cnt.max()} \t mean rating: {df.cnt.mean()}')

In [None]:
# The usage count becomes the rating column ...
df = df.rename(columns={'cnt': 'rating'})

# ... and is binarized: 1 when the count is greater than 4, 0 otherwise.
df['rating'] = df['rating'].map(lambda count: int(count > 4))

# Collapse the fine-grained game categories into 'Games' and the
# book-like categories into 'Books'.
game_categories = [
    'Brain & Puzzle',
    'Arcade & Action',
    'Cards & Casino',
    'Racing',
    'Sports Games',
    'Brain',
    'Casual',
]
book_categories = ['Books & Reference', 'Comics', 'Libraries & Demo']
df['category'] = df['category'].mask(df['category'].isin(game_categories), 'Games')
df['category'] = df['category'].mask(df['category'].isin(book_categories), 'Books')

# Reduce the language code to two labels: 'english' for 'en', 'other' for
# every other value.
df['language'] = df['language'].map(lambda code: 'english' if code == 'en' else 'other')

### Add user features
For each user, find:
- their favourite app category
- whether they use paid apps
- the weekday on which they used the most applications
- the weather in which they used the most applications

In [None]:
def get_favourite_feature_by_rating(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return each user's favourite value of ``column``, judged by summed rating.

    For every user the ratings are summed per distinct value of ``column``;
    the value with the largest sum is that user's favourite. When several
    values share the largest sum, the one that sorts last among the values
    of ``column`` is chosen.

    Parameters
    ----------
    df : DataFrame with at least the columns ``'user'``, ``column`` and
        ``'rating'``.
    column : name of the column whose favourite value is wanted.

    Returns
    -------
    DataFrame of object dtype with columns ``['user', 'fav_' + column]`` and
    one row per distinct user, ordered by ascending user id. When the ids
    are exactly 0..n-1, row ``i`` therefore describes user ``i``.
    """
    fav_rows = []
    # One pass over the per-user groups instead of filtering the whole frame
    # once per user. Rows are collected by position, so user ids no longer
    # have to be usable as array indices (i.e. contiguous and zero-based).
    for user, user_rows in df.groupby('user'):
        rating_sums = user_rows.groupby(column)['rating'].sum()
        # A stable sort keeps tied values in key order, so taking the last
        # entry gives a deterministic choice.
        fav_val = rating_sums.sort_values(kind='stable').index.tolist()[-1]
        fav_rows.append((user, fav_val))
    return pd.DataFrame(fav_rows, columns=['user', 'fav_' + column], dtype=object)

# Attach one "fav_<feature>" column per feature, joined on the user id.
for feature in ('category', 'weekday', 'weather', 'daytime', 'isweekend'):
    favourites = get_favourite_feature_by_rating(df, feature)
    df = df.merge(favourites, on=['user'])
df

In [None]:
# Encode the cost column numerically: 1 where the value is 'paid', 0 elsewhere.
df['cost'] = (df['cost'] == 'paid') * 1

# A user "uses paid apps" when at least one of their rows is a paid app.
paid_apps = (
    df[['user', 'cost']]
    .groupby(['user'], as_index=False)
    .any()
    .rename(columns={'cost': 'uses_paid_apps'})
)
df = df.merge(paid_apps, on=['user'])

In [None]:
one_hot = [
    'daytime', 'weekday', 'isweekend', 'weather',
    'category', 'language', 'cost',
    'fav_category', 'fav_weekday', 'fav_weather', 'fav_daytime', 'fav_isweekend',
    'uses_paid_apps',
]

# Replace every listed column with its indicator columns, each named
# "<column>_<value>". The indicator groups are appended after the remaining
# columns, in the order given by `one_hot`.
df = pd.get_dummies(df, columns=one_hot, prefix=one_hot)

df

In [None]:
# Persist the fully preprocessed dataset.
df.to_csv('Datasets/frappe dataset/frappe_final.csv', index = False)

# Matrix-factorization dataset: only the distinct positive
# (user, item, rating) triples are kept.
df = df.loc[df.rating == 1, ['user', 'item', 'rating']].drop_duplicates()

# Users or items may have disappeared with the filtering above, so the ids
# are renumbered to be consecutive from 0 again.
df = df.assign(
    user=pd.factorize(df['user'])[0],
    item=pd.factorize(df['item'])[0],
)
df.to_csv('Datasets/frappe dataset/frappe_matrix_factorization.csv', index = False)