In [1]:
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
from dateutil import parser
from pandarallel import pandarallel
import spacy 
import pgeocode
from sklearn.model_selection import train_test_split

pandarallel.initialize(progress_bar=True)
tqdm.pandas()

nlp = spacy.load("en_core_web_sm")
tokenizer = nlp.tokenizer

nomi = pgeocode.Nominatim('us')

data_path = Path(r"D:\Productivity\Studying\PMLDL_A2\data\raw\ml-100k")
interim_path = Path(r"D:\Productivity\Studying\PMLDL_A2\data\interim")


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


# Make a pipeline for processing subset of u.data

In [2]:
def encode_date(date):
    date = parser.parse(date)
    return np.sin(date.day), np.cos(date.day), np.sin(date.month), np.cos(date.month), date.year

def split_title(title):
    *actural_title, release = title.split()
    release = release.removeprefix("(").removesuffix(")")
    release = int(release) if release.isdigit() else np.nan
    actural_title = " ".join(actural_title)
    return actural_title, release

def embed(title):
    return nlp(tokenizer(title)).vector


def process_datasets(df):
    df.columns = ["user_id", "item_id", "rating", "timestamp"]
    
    df_user = pd.read_csv(data_path / "u.user", sep= "|", encoding='latin-1', header=None, 
                          names=["id", "age", "gender", "occupation", "zip_code"])
    item_df = pd.read_csv(data_path / "u.item", sep= "|", encoding='latin-1', header=None)
    df_occupation = pd.read_csv(data_path / "u.occupation", sep= "|", encoding='latin-1', header=None)

    df_user = df_user[df_user.id.isin(df.user_id)]


    df["timestamp"] = df["timestamp"].apply(lambda x: pd.Timestamp(x, unit="s"))


    item_df = item_df.drop(columns=[0, 3, 4])
    item_df = item_df.dropna()

    item_df[["title", "year1"]] = item_df[1].progress_apply(split_title).progress_apply(pd.Series)
    item_df.dropna(inplace=True)
    item_df.drop(columns=[1], inplace=True)

    embeddings = item_df["title"].progress_apply(embed).parallel_apply(pd.Series)
    item_df.drop(columns=["title"], inplace=True)
    item_df["year1"] = (item_df["year1"]-item_df["year1"].min())/(item_df["year1"].max()-item_df["year1"].min())
    item_df = pd.concat([item_df, embeddings], axis=1, ignore_index=True)

    date_features = item_df[0].progress_apply(encode_date).parallel_apply(pd.Series)
    date_features[4] = (date_features[4]-date_features[4].min())/(date_features[4].max()-date_features[4].min())

    item_df.drop(columns=[0], inplace=True)
    item_df = pd.concat([item_df, date_features], axis=1, ignore_index=True)

    occupation_dtype = pd.CategoricalDtype(categories=df_occupation[0].to_list())
    df_user["occupation"] = pd.Series(df_user["occupation"], dtype=occupation_dtype)
    df_occupation = pd.get_dummies(df_user["occupation"], dtype=float)

    df_geo = df_user.zip_code.parallel_apply(nomi.query_postal_code)
    coordinates = df_geo[["latitude", "longitude"]]

    df_user.age = (df_user.age-df_user.age.min())/(df_user.age.max()-df_user.age.min())
    df_user.gender = df_user.gender.map({"M": 0, "F": 1})

    df_user[["latitude", "longitude"]] = coordinates
    df_user.drop(columns=["zip_code", "occupation"], inplace = True)
    df_user = pd.concat([df_occupation, df_user], axis=1)
    df_user.rename(columns={"id": "user_id"}, inplace=True)
    df_user = df_user.set_index("user_id", drop=True)

    item_df["item_id"] = item_df.index + 1
    item_df.set_index("item_id", drop=True, inplace=True)

    df = df.join(item_df, on="item_id").join(df_user, on="user_id")
    df.drop(columns=["user_id", "item_id", "timestamp"], inplace=True)
    df.dropna(inplace=True)

    return df.drop(columns="rating"), df.rating / 5

In [3]:
df = pd.read_csv(data_path / "u.data", sep="\t", header=None)
df.columns = ["user_id", "item_id", "rating", "timestamp"]

# Train/test split

Simply by 0.2 ratio

In [4]:
df_train1, df_test1 = train_test_split(df, test_size=0.2, random_state=42)
train_x1, train_y1 = process_datasets(df_train1)
test_x1, test_y1 = process_datasets(df_test1)

  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=118), Label(value='0 / 118'))), HB…

  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=118), Label(value='0 / 118'))), HB…

Disjoint users such that: 
* they all are asked an adequate but resourceful number of times
* their answers' portion comprises to approximately 0.2 of total sample size 

In [5]:
test_user_vc = df.user_id.value_counts()[df.user_id.value_counts().between(50, 120)]
test_user_ids = test_user_vc.index
num_test = test_user_vc[test_user_ids].sum()
f"There're {num_test} ratings by the test users. {df.shape[0]-num_test} remain for training. The test ratio is {num_test / df.shape[0]}"

"There're 20677 ratings by the test users. 79323 remain for training. The test ratio is 0.20677"

In [6]:
df_train2 = df[~df.user_id.isin(test_user_ids)]
df_test2 = df[df.user_id.isin(test_user_ids)]

train_x2, train_y2 = process_datasets(df_train2)
test_x2, test_y2 = process_datasets(df_test2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["timestamp"] = df["timestamp"].apply(lambda x: pd.Timestamp(x, unit="s"))


  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=85), Label(value='0 / 85'))), HBox…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["timestamp"] = df["timestamp"].apply(lambda x: pd.Timestamp(x, unit="s"))


  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=34), Label(value='0 / 34'))), HBox…

In [7]:
train_x1.to_parquet(interim_path / "train_x1.parquet"), pd.DataFrame(train_y1).to_parquet(interim_path / "train_y1.parquet")
test_x1.to_parquet(interim_path / "test_x1.parquet"), pd.DataFrame(test_y1).to_parquet(interim_path / "test_y1.parquet")

train_x2.to_parquet(interim_path / "train_x2.parquet"), pd.DataFrame(train_y2).to_parquet(interim_path / "train_y2.parquet")
test_x2.to_parquet(interim_path / "test_x2.parquet"), pd.DataFrame(test_y2).to_parquet(interim_path / "test_y2.parquet")

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


(None, None)

In [10]:
pd.DataFrame(test_user_ids).to_parquet(interim_path / "test_user_ids.parquet")