# Add timestamp context to ratings

Derives additional context features (day of week, weekday/weekend flag, season, part of day and holiday) from the rating timestamps of the MovieLens dataset.

In [1]:
from datetime import datetime
import pandas as pd
import json

In [2]:
# Depth of this file in the project, expressed as a relative path prefix.
# It is prepended to the config file path and to the paths read from the config.
file_depth = '../..'

In [3]:
# Load the dataset configuration. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open(file_depth + '/config/data_25m_config.json', encoding='utf-8') as config_file:
    config = json.load(config_file)

In [6]:
# Read the ratings table from the location configured under 'new_context_ratings'.
new_context_ratings = config['new_context_ratings']
ratings_source_path = file_depth + new_context_ratings
ratings = pd.read_csv(ratings_source_path, encoding="UTF-8")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000072,162541,50872,4.5,1240953372
25000073,162541,55768,2.5,1240951998
25000074,162541,56176,2.0,1240950697
25000075,162541,58559,4.0,1240953434


Create a `ratingId` column in the ratings dataset and save it

In [5]:
# Use the row index as the rating identifier.
ratings['ratingId'] = ratings.index

# Persist the ratings (now including ratingId) into the new-context directory.
new_context_path = config['new_context_path']
ratings_output_path = file_depth + new_context_path + 'ratings.csv'
ratings.to_csv(ratings_output_path, index=False)
ratings

Unnamed: 0,userId,movieId,rating,timestamp,ratingId
0,1,296,5.0,1147880044,0
1,1,306,3.5,1147868817,1
2,1,307,5.0,1147868828,2
3,1,665,5.0,1147878820,3
4,1,899,3.5,1147868510,4
...,...,...,...,...,...
25000072,162541,50872,4.5,1240953372,25000072
25000073,162541,55768,2.5,1240951998,25000073
25000074,162541,56176,2.0,1240950697,25000074
25000075,162541,58559,4.0,1240953434,25000075


#### Context from rating timestamp

In [9]:
# Start the context table from the two columns needed here: the rating
# identifier and its timestamp. The copy keeps later edits away from `ratings`.
rating_context = ratings[['ratingId', 'timestamp']].copy()
rating_context

Unnamed: 0,ratingId,timestamp
0,0,1147880044
1,1,1147868817
2,2,1147868828
3,3,1147878820
4,4,1147868510
...,...,...
25000072,25000072,1240953372
25000073,25000073,1240951998
25000074,25000074,1240950697
25000075,25000075,1240953434


In [10]:
# Create a datetime object from each timestamp.
# Note: datetime.fromtimestamp() without a tz argument returns local time, so
# the derived features depend on the timezone of the machine running this cell.
rating_context['date'] = rating_context['timestamp'].apply(datetime.fromtimestamp)

In [11]:
# create a week day column from timestamp

"""
Values come from datetime.isoweekday(), which numbers the days 1-7:

1: Monday
2: Tuesday
3: Wednesday
4: Thursday
5: Friday
6: Saturday
7: Sunday
"""
rating_context['day'] = rating_context['timestamp'].apply(lambda x: datetime.fromtimestamp(x).isoweekday())

In [12]:
# create a isWeekday column from day column

"""
0: false / weekend
1: true  / weekday
"""
def get_is_weekday(day):
    """Return 0 when `day` is 6 or 7 (the weekend values of the 'day' column), otherwise 1."""
    return 0 if day in (6, 7) else 1
    
# Flag every rating as weekday (1) or weekend (0) based on its 'day' value.
rating_context['isWeekday'] = rating_context['day'].map(get_is_weekday)

In [13]:
"""
1: Spring	
2: Summer
3: Fall
4: Winter
"""
def get_season(month):
    """Map a month number to a season code.

    Months 3-5 give 1, months 6-8 give 2, months 9-11 give 3 and every
    other value gives 4.
    """
    if 3 <= month <= 5:
        return 1
    if 6 <= month <= 8:
        return 2
    if 9 <= month <= 11:
        return 3
    return 4
    
# Derive the season code from the month of each rating's date.
rating_context['season'] = rating_context['date'].dt.month.map(get_season)

In [14]:
"""
1 - Morning
2 - Afternoon
3 - Evening
4 - Night
"""
def get_part_of_day(hour):
    """Map an hour of the day to a part-of-day code.

    Hours 5-11 give 1, hours 12-16 give 2, hours 17-20 give 3 and every
    other value gives 4.
    """
    if 5 <= hour < 12:
        return 1
    if 12 <= hour < 17:
        return 2
    if 17 <= hour < 21:
        return 3
    return 4
    
# Derive the part-of-day code from the hour of each rating's date.
rating_context['partOfDay'] = rating_context['date'].dt.hour.map(get_part_of_day)

In [15]:
# Load the holiday intervals. Each entry maps a holiday name to a dict with
# 'start' and 'end' strings in '%m-%d' format (see find_holiday).
# The encoding is explicit so parsing does not depend on the platform default.
with open('holidays.json', 'r', encoding='utf-8') as json_file:
    holiday_dates = json.load(json_file)

def is_date_in_interval(date, start_date, end_date, holiday_name):
    """Check whether `date` lies inside a yearly recurring interval.

    Only the month and day of the three dates are compared; years are ignored.
    Both interval boundaries are inclusive.

    Args:
        date: object with `month` and `day` attributes (e.g. a datetime).
        start_date: first day of the interval.
        end_date: last day of the interval.
        holiday_name: name of the holiday. Kept so existing callers keep
            working; an interval that wraps over the turn of the year (such
            as 'new_years') is detected from the dates themselves.

    Returns:
        True if `date` falls within the interval, False otherwise.
    """
    # Comparing (month, day) tuples orders dates within a year correctly,
    # including intervals that span several months with partial first/last months.
    current = (date.month, date.day)
    start = (start_date.month, start_date.day)
    end = (end_date.month, end_date.day)

    if start <= end:
        return start <= current <= end

    # The interval wraps over the turn of the year (e.g. late December to early January).
    return current >= start or current <= end


# Cache of already parsed '%m-%d' strings, so each interval boundary is parsed
# once instead of once per rating.
_parsed_month_days = {}


def _parse_month_day(text):
    """Return the datetime for a '%m-%d' string, parsing each distinct string only once."""
    parsed = _parsed_month_days.get(text)
    if parsed is None:
        parsed = datetime.strptime(text, '%m-%d')
        _parsed_month_days[text] = parsed
    return parsed


def find_holiday(date):
    """Return the name of the first holiday whose interval contains `date`.

    Holidays are checked in the iteration order of `holiday_dates`; each entry
    supplies 'start' and 'end' strings in '%m-%d' format.

    Args:
        date: the date to classify.

    Returns:
        The matching holiday name, or 'no_holiday' when no interval matches.
    """
    for holiday_name, interval in holiday_dates.items():
        start_date = _parse_month_day(interval['start'])
        end_date = _parse_month_day(interval['end'])
        if is_date_in_interval(date, start_date, end_date, holiday_name):
            return holiday_name
    return 'no_holiday'

In [16]:
# Label every rating with the holiday its date falls into ('no_holiday' otherwise).
rating_context['holiday'] = rating_context['date'].map(find_holiday)

In [17]:
# Count how many ratings carry each holiday label (including 'no_holiday').
holiday_counts = rating_context['holiday'].value_counts()
holiday_counts

holiday
no_holiday        18972878
summer_holiday     4247810
christmas           731008
thanksgiving        534591
new_years           245197
valentines          188197
veterans_day         80396
Name: count, dtype: int64

In [18]:
# The helper columns are no longer needed once the context features exist.
rating_context = rating_context.drop(columns=['date', 'timestamp'])

In [19]:
# Display the final context table: ratingId plus the derived context columns.
rating_context

Unnamed: 0,ratingId,day,isWeekday,season,partOfDay,holiday
0,0,3,1,1,3,no_holiday
1,1,3,1,1,2,no_holiday
2,2,3,1,1,2,no_holiday
3,3,3,1,1,3,no_holiday
4,4,3,1,1,2,no_holiday
...,...,...,...,...,...,...
25000072,25000072,2,1,1,4,no_holiday
25000073,25000073,2,1,1,4,no_holiday
25000074,25000074,2,1,1,4,no_holiday
25000075,25000075,2,1,1,4,no_holiday


In [20]:
# Write the derived context features next to the saved ratings file.
new_context_path = config['new_context_path']
rating_context_output_path = file_depth + new_context_path + 'rating_context_data.csv'
rating_context.to_csv(rating_context_output_path, index=False)