# Frappe dataset

## Import

In [4]:
import pandas as pd
import numpy as np
import requests # for downloading the dataset
import os.path

## Load dataset

In [8]:
# Read the tab-separated Frappe file into a DataFrame and preview it
df = pd.read_csv('frappe dataset/frappe.csv', delimiter="\t")
df

Unnamed: 0,user,item,cnt,daytime,weekday,isweekend,homework,cost,weather,country,city
0,0,0,1,morning,sunday,weekend,unknown,free,sunny,United States,0
1,1,1,7,afternoon,saturday,weekend,unknown,free,cloudy,Spain,0
2,2,2,6,evening,monday,workday,unknown,free,cloudy,Spain,369
3,3,3,1,sunset,thursday,workday,unknown,free,unknown,United States,1028
4,4,4,428,night,thursday,workday,home,free,sunny,Switzerland,147
...,...,...,...,...,...,...,...,...,...,...,...
96198,110,0,5,evening,sunday,weekend,unknown,free,sunny,United States,0
96199,37,16,101,sunset,sunday,weekend,unknown,free,cloudy,Canada,128
96200,181,33,243,afternoon,sunday,weekend,unknown,free,cloudy,Israel,454
96201,451,752,1,evening,sunday,weekend,unknown,free,sunny,United States,0


In [9]:
# Per-column summaries, each paired with the banner shown above it:
#   - number of distinct values
#   - number of cells equal to the string 'unknown'
#   - number of cells equal to 0 (for city, 0 == unknown)
column_summaries = [
    ("------ unique values ------", df.nunique()),
    ("------ unknown values ------", df.isin(['unknown']).sum(axis=0)),
    ("------ zero values ------", df.isin([0]).sum(axis=0)),
]

# Show every banner followed by its per-column counts
for banner, counts in column_summaries:
    display(banner)
    display(counts)

'------ unique values ------'

user          957
item         4082
cnt          1981
daytime         7
weekday         7
isweekend       2
homework        3
cost            2
weather         9
country        80
city          233
dtype: int64

'------ unknown values ------'

user             0
item             0
cnt              0
daytime          0
weekday          0
isweekend        0
homework     75670
cost             0
weather      12529
country       7025
city             0
dtype: int64

'------ zero values ------'

user            25
item           521
cnt              0
daytime          0
weekday          0
isweekend        0
homework         0
cost             0
weather          0
country          0
city         38052
dtype: int64

## Dataset preprocessing

We use the following contextual features:
- daytime
- weekday
- weather

These features are one-hot encoded.

In [10]:
# Log transformation (base 10) on the raw frequency numbers that represent the
# applications usage, applied to the whole column in one vectorised call.
# NOTE: this cell rewrites `df`; re-running it would apply the log a second time
# and then fail on the already-removed columns, so run it once per fresh load.
df['cnt'] = np.log10(df['cnt'])

# Report the resulting range. print() is needed here: a bare expression in the
# middle of a cell is evaluated and discarded, so it would never be displayed.
print(f"frequency range is {df['cnt'].min()} to {df['cnt'].max()}")

# Delete columns that are not needed
df = df.drop(columns=['homework', 'cost', 'city', 'isweekend', 'country'])

In [11]:
# keep only the rows whose weather is known, then renumber the row index
has_weather = df['weather'] != 'unknown'
df = df[has_weather].reset_index(drop=True)

# re-encode user and item ids as integer codes starting from 0
for id_column in ('user', 'item'):
    df[id_column] = pd.factorize(df[id_column])[0]

df

Unnamed: 0,user,item,cnt,daytime,weekday,weather
0,0,0,0.000000,morning,sunday,sunny
1,1,1,0.845098,afternoon,saturday,cloudy
2,2,2,0.778151,evening,monday,cloudy
3,3,3,2.631444,night,thursday,sunny
4,4,4,0.778151,sunset,saturday,sunny
...,...,...,...,...,...,...
83669,96,0,0.698970,evening,sunday,sunny
83670,30,33,2.004321,sunset,sunday,cloudy
83671,156,26,2.385606,afternoon,sunday,cloudy
83672,399,672,0.000000,evening,sunday,sunny


In [12]:
x_labels = ['user', 'item']
y_label = 'cnt'
context_labels = ['daytime', 'weekday', 'weather']

# Convert the categorical context columns to one-hot encoding in a single call;
# each new column is named "<original column>_<category>".
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version (pandas >= 2.0 defaults to bool, which would be saved as True/False).
df = pd.get_dummies(df, columns=context_labels, prefix=context_labels, dtype=np.uint8)

# New context labels after one-hot encoding: every column except the user/item
# ids and the target, selected by name rather than by position.
context_labels = df.columns.drop(x_labels + [y_label])
df

Unnamed: 0,user,item,cnt,daytime_afternoon,daytime_evening,daytime_morning,daytime_night,daytime_noon,daytime_sunrise,daytime_sunset,...,weekday_tuesday,weekday_wednesday,weather_cloudy,weather_drizzle,weather_foggy,weather_rainy,weather_sleet,weather_snowy,weather_stormy,weather_sunny
0,0,0,0.000000,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1,0.845098,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,2,2,0.778151,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,3,3,2.631444,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,4,4,0.778151,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83669,96,0,0.698970,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
83670,30,33,2.004321,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
83671,156,26,2.385606,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
83672,399,672,0.000000,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
save_folder = 'final datasets'
# exist_ok=True creates the folder only when it is missing, with no separate
# existence check beforehand
os.makedirs(save_folder, exist_ok=True)
# save dataset to CSV file (without the row index)
df.to_csv(os.path.join(save_folder, 'frappe_final.csv'), index=False)