In [1]:
import pandas as pd
import os
from src.data.check_structure import check_existing_folder


In [2]:
# Folder (relative to this notebook) that receives the per-interval CSV files written below.
interim_data_relative_path = "../data/interim/"

In [3]:
# Load the raw ratings table from disk and preview its first rows

ratings = pd.read_csv(filepath_or_buffer="../data/raw/ratings.csv")
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [4]:
# Convert the Unix epoch seconds stored in "timestamp" into pandas datetimes
epoch_seconds = ratings["timestamp"]
ratings["timestamp"] = pd.to_datetime(epoch_seconds, unit="s")
ratings["timestamp"].describe()

count                         20000263
mean     2004-11-20 02:32:01.677113984
min                1995-01-09 11:46:44
25%                2000-08-20 18:55:45
50%                2004-12-20 15:18:06
75%         2008-11-02 16:11:57.500000
max                2015-03-31 06:40:02
Name: timestamp, dtype: object

In [10]:
# Split the ratings into 4 equal-width time intervals and write one CSV per interval.
#
# pd.cut with an integer `bins` divides the observed timestamp range into that
# many equal-width bins; each rating is tagged with the label of its bin.
# The added "time_interval" column is kept in the exported CSV files.

interval_labels = ['TI1', 'TI2', 'TI3', 'TI4']
ratings['time_interval'] = pd.cut(
    ratings['timestamp'], bins=len(interval_labels), labels=interval_labels
)

# Create the output folder once, before the loop. exist_ok=True makes the cell
# safe to re-run and avoids a separate "does it exist?" check.
# NOTE(review): this replaces the check_existing_folder(...) guard, whose
# semantics are not visible from this notebook - confirm nothing else relied on it.
os.makedirs(interim_data_relative_path, exist_ok=True)

all_df = []  # one DataFrame per time interval, in chronological order
for i, label in enumerate(interval_labels, start=1):
    # Filter once and reuse the result for both the summary line and the export.
    df = ratings[ratings['time_interval'] == label]
    all_df.append(df)
    print(f"Time interval {i}: {df['timestamp'].min()} - {df['timestamp'].max()} ({len(df)} ratings)")
    df.to_csv(os.path.join(interim_data_relative_path, f"ratings_TI{i}.csv"), index=False)

Time interval 1: 1995-01-09 11:46:44 - 2000-01-29 22:29:39 (3977249 ratings)
Time interval 2: 2000-01-29 22:30:13 - 2005-02-18 09:13:16 (6303895 ratings)
Time interval 3: 2005-02-18 09:13:30 - 2010-03-10 19:51:38 (6041701 ratings)
Time interval 4: 2010-03-10 19:58:05 - 2015-03-31 06:40:02 (3677418 ratings)


In [6]:
# Load the raw movie catalogue from disk and preview its first rows

movies = pd.read_csv(filepath_or_buffer="../data/raw/movies.csv")
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Pull the four-digit release year out of the "(YYYY)" part of each title.
# A raw string keeps the regex backslashes literal (a plain string makes Python
# warn about the invalid escape sequences \( \d \)), and expand=False makes
# str.extract return a Series for the single capture group.
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)", expand=False)
# Titles with no "(YYYY)" match become NaN, so the numeric column ends up float.
movies["year"] = pd.to_numeric(movies["year"])
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II (1995),Comedy,1995.0


In [9]:
# Split the movies into 5-year release windows and write one CSV per window.
#
# Each window is half-open: [year, year + 5). The printed "to" year is therefore
# exclusive.
# NOTE(review): movies released before 1995, or whose title has no parsable
# year (NaN), match no window and are written to no file - confirm intended.
# NOTE(review): this produces five windows (TI1..TI5) while the ratings are
# split into four intervals - confirm the fifth file is wanted.
first_year, last_year_exclusive, window_years = 1995, 2020, 5

# Make sure the output folder exists so this cell does not depend on an earlier
# cell having created it; exist_ok=True keeps re-runs safe.
os.makedirs(interim_data_relative_path, exist_ok=True)

for index, year in enumerate(range(first_year, last_year_exclusive, window_years)):
    in_window = (movies["year"] >= year) & (movies["year"] < year + window_years)
    df = movies[in_window]
    print(f"TI{index + 1}: Movies from {year} to {year + 5}: {len(df)}")
    df.to_csv(os.path.join(interim_data_relative_path, f"movies_TI{index + 1}.csv"), index=False)

TI1: Movies from 1995 to 2000: 2608
TI2: Movies from 2000 to 2005: 3285
TI3: Movies from 2005 to 2010: 4590
TI4: Movies from 2010 to 2015: 4751
TI5: Movies from 2015 to 2020: 120
