#### import external libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

#### IPython extensions

In [2]:
%load_ext autoreload
%autoreload 2

## Mathematical functions

## Constants

In [3]:
PATH_FILES = '/home/franckml/Documents/repositories/movies_rating_model/ml/data/datasets/'
FILES_NAME = 'rating.csv'

In [4]:
# This variables will help to the test dataset
DROP_TIME_COLS = ['timestamp', 'weekday', 'month', 'year']
DROP_OTHER_COLS = ['userId', 'movieId', 'rating']

## Data gathering

In [5]:
df_rating = pd.read_csv(PATH_FILES + FILES_NAME) 
df_rating.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [6]:
# temporal action: select a subset of dataset
df_rating = df_rating.sample(frac=0.01)
df_rating.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp
5384206,36995,3526,4.5,2004-02-22 13:23:19
8296866,57244,1370,4.0,1999-12-22 19:21:38


## Feature Engineering

In [7]:
# Change the data type: from object to datetime
df_rating['timestamp'] = pd.to_datetime(df_rating['timestamp'], infer_datetime_format=True)
df_rating.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp
5384206,36995,3526,4.5,2004-02-22 13:23:19
8296866,57244,1370,4.0,1999-12-22 19:21:38


In [8]:
# Create new variables
# new feature: day of week
df_rating['weekday'] = df_rating['timestamp'].dt.dayofweek
# new feature: month
df_rating['month'] = df_rating['timestamp'].dt.month
# new feature: year
df_rating['year'] = df_rating['timestamp'].dt.year
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,weekday,month,year
5384206,36995,3526,4.5,2004-02-22 13:23:19,6,2,2004
8296866,57244,1370,4.0,1999-12-22 19:21:38,2,12,1999
14969735,103383,3664,2.0,2002-02-09 00:25:08,5,2,2002
12556359,86741,3257,3.0,2001-02-01 10:00:30,3,2,2001
5884653,40506,2355,3.0,2002-06-28 19:42:02,4,6,2002


In [9]:
# Dummy variables of the weekday 
weekday_dummy = pd.get_dummies(df_rating['weekday'], prefix='weekday', prefix_sep='_')
weekday_dummy.head(n=2)

Unnamed: 0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
5384206,0,0,0,0,0,0,1
8296866,0,0,1,0,0,0,0


In [10]:
# Dummy variables of the month
month_dummy = pd.get_dummies(df_rating['month'], prefix='month', prefix_sep='_')
month_dummy.head(n=2)

Unnamed: 0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
5384206,0,1,0,0,0,0,0,0,0,0,0,0
8296866,0,0,0,0,0,0,0,0,0,0,0,1


In [11]:
# Dummy variables of the year
year_dummy = pd.get_dummies(df_rating['month'], prefix='month', prefix_sep='_')
year_dummy.head(n=2)

Unnamed: 0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
5384206,0,1,0,0,0,0,0,0,0,0,0,0
8296866,0,0,0,0,0,0,0,0,0,0,0,1


In [12]:
# Join the dummy time variables
df_rating = pd.concat([df_rating, weekday_dummy, month_dummy, year_dummy], axis=1)
df_rating.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp,weekday,month,year,weekday_0,weekday_1,weekday_2,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
5384206,36995,3526,4.5,2004-02-22 13:23:19,6,2,2004,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8296866,57244,1370,4.0,1999-12-22 19:21:38,2,12,1999,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Drop time variables to avoid redundance
df_rating = df_rating.drop(DROP_TIME_COLS, axis = 1)
df_rating.head(n=5)

Unnamed: 0,userId,movieId,rating,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
5384206,36995,3526,4.5,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8296866,57244,1370,4.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14969735,103383,3664,2.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
12556359,86741,3257,3.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5884653,40506,2355,3.0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [14]:
# Create the target/response variable
df_rating['label'] = np.where(df_rating['rating'] >= 4.0, 1, 0 )
df_rating.head()

Unnamed: 0,userId,movieId,rating,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,label
5384206,36995,3526,4.5,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
8296866,57244,1370,4.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
14969735,103383,3664,2.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
12556359,86741,3257,3.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5884653,40506,2355,3.0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0


In [15]:
# Drop time variables to avoid redundance
df_rating = df_rating.drop(DROP_OTHER_COLS, axis = 1)
df_rating.head(n=5)

Unnamed: 0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,month_1,month_2,month_3,...,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,label
5384206,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
8296866,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
14969735,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
12556359,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5884653,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
