In [1]:
import pandas as pd

In [None]:
# Locations of the data folders and of the individual / combined scrape files.
# Adjust the folder paths below to match your machine. Each folder path must
# end with '/' because file names are appended to it directly.

raw_data_path = "C:/Users/User/Documents/MITB/MITB Term 5/Recommender Systems/Group Project/cs608-p2-experiments/data/01_raw/"
inter_data_path = "C:/Users/User/Documents/MITB/MITB Term 5/Recommender Systems/Group Project/cs608-p2-experiments/data/02_intermediate/"
primary_path = "C:/Users/User/Documents/MITB/MITB Term 5/Recommender Systems/Group Project/cs608-p2-experiments/data/03_primary/"

# Scraped partitions (inputs), one file per partition.
damien_scrape = f"{raw_data_path}anime_scrapy_damien.csv"
leroy_scrape = f"{raw_data_path}anime_scrapy_leroy.csv"
rosamund_scrape = f"{raw_data_path}anime_scrapy_rosamund.csv"
kenneth_scrape = f"{raw_data_path}anime_scrapy_kenneth.csv"

# Combined file holding all partitions (output of the concatenation step).
all_scrape_path = f"{inter_data_path}anime_scrapy_all.csv"

### Concatenating the 4 partitions together

In [None]:
def concatenate_and_write_scraped_partitions(output_path: str, *args: str) -> pd.DataFrame:
    """Concatenate the scraped partition files row-wise and write the result to disk.

    Each partition is read as a pipe-delimited CSV. The partitions are stacked
    in the order given with a fresh 0..n-1 index, and the merged frame is
    written to ``output_path`` as a pipe-delimited CSV without the index.

    Args:
        output_path (str): Path to write the merged DataFrame to.
        *args (str): One or more file paths of pipe-delimited CSV partitions.

    Returns:
        pd.DataFrame: DataFrame obtained by concatenating the partitions row-wise.

    Raises:
        ValueError: If no partition paths are supplied.

    """
    if not args:
        raise ValueError("at least one partition path is required")
    # Read every partition first and concatenate once. Testing a DataFrame for
    # truthiness (e.g. ``if not merged_df``) raises ValueError, and repeated
    # concatenation inside a loop copies the accumulated rows each time.
    partitions = [pd.read_csv(df_path, sep="|") for df_path in args]
    merged_df = pd.concat(partitions, axis=0, ignore_index=True)
    merged_df.to_csv(output_path, sep="|", index=False)
    return merged_df

In [None]:
# Merge the four scraped partitions and persist the combined file.
all_scrape_df = concatenate_and_write_scraped_partitions(
    all_scrape_path,
    damien_scrape,
    leroy_scrape,
    rosamund_scrape,
    kenneth_scrape,
)

In [None]:
# Column names, dtypes and non-null counts of the merged scrape data.
all_scrape_df.info()

In [None]:
# Descriptive statistics of the merged scrape data (by default pandas
# summarises the numeric columns when any are present).
all_scrape_df.describe()

### Join anime features

In [None]:
# Input: anime table from the raw folder. Output: joined table in the primary folder.
anime_path = f"{raw_data_path}anime.csv"
joined_path = f"{primary_path}primary_data.csv"

In [None]:
# Join settings for combining anime.csv with the scraped data.

# Key column shared by anime.csv and the scraped data.
on = "anime_id"

# The sparsity of the data was not known when this was written, so the join is
# preliminarily a left join; switch to "outer" or "inner" if that makes more sense.
how = "left"

In [None]:
# join the data we got from kaggle and the data we got from scraping 
def join_kaggle_and_scraped_data(
    kaggle_anime_data: str,
    scraped_data: str,
    how: str,
    on: str,
    joined_path: str,
) -> pd.DataFrame:
    """Joins Kaggle anime data and scraped anime data into a single DataFrame.

    The Kaggle file is read as a comma-delimited CSV and the scraped file as a
    pipe-delimited CSV. The two frames are merged with the Kaggle data on the
    left, and the result is written to ``joined_path`` as a pipe-delimited CSV
    without the index.

    Args:
        kaggle_anime_data (str): Path to the CSV file containing Kaggle anime data.
        scraped_data (str): Path to the pipe-delimited CSV file containing scraped anime data.
        how (str): Type of merge to be performed. Options are 'left', 'right', 'outer', 'inner'.
        on (str): Column or index level name to join on. It must be found in both DataFrames.
        joined_path (str): Path to save the resulting merged DataFrame as a CSV file.

    Returns:
        pd.DataFrame: The resulting merged DataFrame.

    """
    left_df = pd.read_csv(kaggle_anime_data)
    right_df = pd.read_csv(scraped_data, sep="|")
    # Overlapping non-key columns keep their name for the Kaggle side and get a
    # "_scrape" suffix for the scraped side.
    joined_df = pd.merge(left_df, right_df, how=how, on=on, suffixes=(None, "_scrape"))
    joined_df.to_csv(joined_path, sep="|", index=False)
    return joined_df


In [None]:
# Join the Kaggle anime table with the combined scrape file and write the result.
joined_data = join_kaggle_and_scraped_data(
    kaggle_anime_data=anime_path,
    scraped_data=all_scrape_path,
    how=how,
    on=on,
    joined_path=joined_path,
)

In [None]:
# Preview the first rows of the joined data.
joined_data.head()

In [None]:
# Column names, dtypes and non-null counts of the joined data.
joined_data.info()

In [None]:
# Descriptive statistics of the joined data (by default pandas summarises
# the numeric columns when any are present).
joined_data.describe()