In [1]:
import pandas as pd
import os
from src.data_module_def.check_structure import check_existing_folder


In [2]:
# Relative path of the interim-data folder; the split cells below check/create it
# before writing their per-interval CSV files.
interim_data_relative_path = "../data/interim/"

In [3]:
# Read the raw ratings table from disk and preview its first rows

ratings = pd.read_csv(filepath_or_buffer="../data/raw/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [23]:
# Turn the epoch-second timestamps into a calendar year, then swap the
# `timestamp` column for the derived `year` column.
rating_year = pd.to_datetime(ratings["timestamp"], unit="s").dt.year
ratings = ratings.drop(columns=['timestamp'])
ratings['year'] = rating_year

# First and last rating year; later cells use these as the bounds of the split.
start_year, end_year = ratings['year'].min(), ratings['year'].max()

In [14]:
# Split the ratings into consecutive 4-year windows (e.g. 1995-1998), one CSV per window

# Create the output folder once, up front; exist_ok makes re-running this cell safe.
os.makedirs(interim_data_relative_path, exist_ok=True)

# Walk the windows with a dedicated loop variable. Advancing `start_year` itself
# would leave it greater than `end_year`, and the later
# `split_dataset(movies, start_year, end_year, ...)` call would then write nothing
# when the notebook is run top to bottom.
window_start = start_year
while window_start <= end_year:
    # Both bounds are inclusive, so each window covers four calendar years.
    window_end = window_start + 3

    tmp_df = ratings[(ratings['year'] >= window_start) & (ratings['year'] <= window_end)]

    tmp_df.to_csv(
        os.path.join(interim_data_relative_path, f"ratings_{window_start}_{window_end}.csv"),
        index=False,
    )
    print(f"Saving data from {window_start} to {window_end}")

    # The next window begins the year after this one ends.
    window_start += 4


Saving data from 1995 to 1998
Saving data from 1999 to 2002
Saving data from 2003 to 2006
Saving data from 2007 to 2010
Saving data from 2011 to 2014
Saving data from 2015 to 2018


In [26]:
def split_dataset(df, start_year, end_year, step, file_indicator, output_root=None):
    """Write `df` to disk as one CSV per consecutive year window.

    Each window covers the inclusive range ``[start, start + step]``; the next
    window starts at ``start + step + 1``. Windows are produced while their
    start is ``<= end_year``, so the last window may extend past ``end_year``.
    Rows whose ``year`` is missing or outside every window are not written.

    Files are saved as
    ``<output_root>/<file_indicator>/<file_indicator>_<start>_<end>.csv``
    and a progress line is printed for each one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; must contain a ``year`` column.
    start_year : int
        First year of the first window.
    end_year : int
        Last year for which a window is started.
    step : int
        Offset added to a window's start to get its (inclusive) end.
    file_indicator : str
        Name used for both the sub-folder and the file-name prefix.
    output_root : str, optional
        Folder under which the ``file_indicator`` sub-folder is created.
        Defaults to the notebook-level ``interim_data_relative_path``.

    Returns
    -------
    None
    """
    if output_root is None:
        output_root = interim_data_relative_path

    # The files go into a per-dataset sub-folder, so that sub-folder (not just
    # its parent) has to exist. Creating it once before the loop is enough, and
    # exist_ok keeps repeated calls from failing.
    output_dir = os.path.join(output_root, file_indicator)
    os.makedirs(output_dir, exist_ok=True)

    while start_year <= end_year:
        next_start_year = start_year + step

        # Inclusive on both ends; NaN years compare False and are left out.
        tmp_df = df[(df['year'] >= start_year) & (df['year'] <= next_start_year)]

        tmp_df.to_csv(
            os.path.join(output_dir, f"{file_indicator}_{start_year}_{next_start_year}.csv"),
            index=False,
        )
        print(f"Saving {file_indicator} data from {start_year} to {next_start_year}")

        # Skip past the inclusive end so consecutive windows do not overlap.
        start_year += step + 1

In [18]:
# Split the ratings over the full range of years present in the data.
split_dataset(
    df=ratings,
    start_year=ratings['year'].min(),
    end_year=ratings['year'].max(),
    step=3,
    file_indicator='ratings',
)

Saving ratings data from 1995 to 1998
Saving ratings data from 1999 to 2002
Saving ratings data from 2003 to 2006
Saving ratings data from 2007 to 2010
Saving ratings data from 2011 to 2014
Saving ratings data from 2015 to 2018


In [19]:
# Read the raw movies table from disk and preview its first rows

movies = pd.read_csv(filepath_or_buffer="../data/raw/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [20]:
# Pull the four-digit year in parentheses out of each title, e.g. "Toy Story (1995)".
# The pattern is a raw string because "\(" and "\d" are not valid Python string
# escapes and trigger a warning in a normal literal. expand=False makes
# str.extract return a Series (one capture group) rather than a one-column DataFrame.
movies["year"] = movies["title"].str.extract(r"\((\d{4})\)", expand=False)
# Titles without a match yield NaN, so the numeric column comes out as float.
movies["year"] = pd.to_numeric(movies["year"])
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II (1995),Comedy,1995.0


In [27]:
# Take the bounds straight from the ratings (as the ratings split does) rather than
# from the notebook-level `start_year` / `end_year`: an earlier cell advances
# `start_year` past `end_year` while looping, which would make this call write
# nothing when the notebook is run top to bottom.
split_dataset(movies, ratings['year'].min(), ratings['year'].max(), 3, 'movies')

Saving movies data from 1995 to 1998
Saving movies data from 1999 to 2002
Saving movies data from 2003 to 2006
Saving movies data from 2007 to 2010
Saving movies data from 2011 to 2014
Saving movies data from 2015 to 2018


In [29]:
# Concatenate all interval files whose end year is at or before a given year

def append_files(file_indicator, end_year, interim_dir=None):
    """Concatenate the interval CSVs of one dataset up to a given end year.

    Looks in ``<interim_dir>/<file_indicator>/`` for ``.csv`` files, reads the
    trailing year from each name (the text after the last ``_`` and before the
    first ``.``), keeps the files whose trailing year is ``<= end_year``, and
    writes their row-wise concatenation to
    ``<interim_dir>/<file_indicator>.csv``.

    Parameters
    ----------
    file_indicator : str
        Sub-folder to read from and base name of the combined output file.
    end_year : int
        Largest trailing year a file may have to be included.
    interim_dir : str, optional
        Folder holding the ``file_indicator`` sub-folder and receiving the
        combined file. Defaults to the notebook-level
        ``interim_data_relative_path``.

    Raises
    ------
    ValueError
        If no file qualifies, or if a CSV name does not end in an integer year.
    """
    if interim_dir is None:
        interim_dir = interim_data_relative_path
    source_dir = os.path.join(interim_dir, file_indicator)

    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the combined file reproducible from run to run.
    selected = []
    for file in sorted(os.listdir(source_dir)):
        if not file.endswith(".csv"):
            continue
        year = int(file.split("_")[-1].split(".")[0])
        if year <= end_year:
            selected.append(file)

    # pd.concat on an empty list fails with an unhelpful message; say what was
    # actually looked for instead.
    if not selected:
        raise ValueError(
            f"No '{file_indicator}' interval file ending in or before {end_year} "
            f"found in {source_dir}"
        )

    dfs = [pd.read_csv(os.path.join(source_dir, file)) for file in selected]
    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(os.path.join(interim_dir, f"{file_indicator}.csv"), index=False)

In [30]:
# Combine the ratings interval files whose end year is 1998 or earlier.
append_files(file_indicator='ratings', end_year=1998)

In [31]:
# Combine the movies interval files whose end year is 1998 or earlier.
append_files(file_indicator='movies', end_year=1998)