# Train and Inference Models

In [15]:
import pathlib
from pathlib import Path
from typing import Optional

import pandas as pd
from geopy.geocoders import Nominatim
from lightfm import LightFM
from pandas import DataFrame, get_dummies, read_csv
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import LightFMWrapperModel
from rectools.models.base import ModelBase
from sklearn.preprocessing import MinMaxScaler

DATASET_DIR = Path("data/raw/ml-100k")
USER_COLUMNS = ["user_id", "age", "gender", "occupation", "zip_code"]
ITEM_COLUMNS = [
    "movie_id",
    "movie_title",
    "release_date",
    "video_release_date",
    "imdb_url",
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

In [2]:
# Change directory to repository root
%cd ../

/Users/danilandreev/Desktop/F23-PMLDL-Movie-Recommender-System


## Preprocess User Features

In [3]:
user_df = read_csv(
    DATASET_DIR / "u.user",
    sep="|",
    header=None,
    names=USER_COLUMNS,
    encoding="latin-1",
)
user_df.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
geolocator = Nominatim(user_agent="geoapiEx")

from geopy.extra.rate_limiter import RateLimiter

geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)


def get_latitude_longitude(zip_code):
    latitude = None
    longitude = None

    location = geocode(zip_code)
    if location:
        latitude = location.latitude
        longitude = location.longitude

    return latitude, longitude


user_df[["latitude", "longitude"]] = user_df["zip_code"].apply(
    lambda x: pd.Series(get_latitude_longitude(x))
)
user_df.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code,latitude,longitude
0,1,24,M,technician,85711,32.220898,-110.878343
1,2,53,F,other,94043,22.368609,120.589932
2,3,23,M,writer,32067,24.974408,121.259507
3,4,24,M,technician,43537,41.577418,-83.682557
4,5,33,F,other,15213,37.346706,126.813669


In [59]:
user_df_prep = user_df.copy()

In [60]:
user_df_prep["age"].fillna(user_df_prep["age"].mean(), inplace=True)
user_df_prep["gender"].fillna("M", inplace=True)
user_df_prep["occupation"].fillna("other", inplace=True)
user_df_prep["latitude"].fillna(user_df_prep["latitude"].mean(), inplace=True)
user_df_prep["longitude"].fillna(user_df_prep["longitude"].mean(), inplace=True)

In [61]:
age_scaler = MinMaxScaler(feature_range=(0, 100))
lat_scaler = MinMaxScaler(feature_range=(-90, 90))
lon_scaler = MinMaxScaler(feature_range=(-180, 180))

user_df_prep["age"] = age_scaler.fit_transform(user_df_prep[["age"]])
user_df_prep["latitude"] = age_scaler.fit_transform(user_df_prep[["latitude"]])
user_df_prep["longitude"] = age_scaler.fit_transform(user_df_prep[["longitude"]])

In [62]:
user_df_prep = get_dummies(user_df_prep, columns=["gender", "occupation"])

In [41]:
user_df_prep.drop(columns=["zip_code"], inplace=True)
user_df_prep.head()

Unnamed: 0,user_id,age,latitude,longitude,gender_F,gender_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,1,0.257576,0.653887,0.148909,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,0.69697,0.55565,0.85926,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3,0.242424,0.581633,0.861315,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,0.257576,0.747181,0.23237,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,0.393939,0.704997,0.87836,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


## Preprocess Item Features

In [82]:
item_df = read_csv(
    DATASET_DIR / "u.item",
    sep="|",
    header=None,
    names=ITEM_COLUMNS,
    encoding="latin-1",
)
item_df.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [83]:
item_df_prep = item_df.copy()

In [84]:
item_df_prep["release_date"] = pd.to_datetime(
    item_df_prep["release_date"], format="%d-%b-%Y"
)
item_df_prep["release_date"] = item_df_prep["release_date"].astype(int) // 10**9

item_df_prep["video_release_date"] = pd.to_datetime(
    item_df_prep["video_release_date"], format="%d-%b-%Y"
)
item_df_prep["video_release_date"] = (
    item_df_prep["video_release_date"].astype(int) // 10**9
)

In [85]:
time_scaler = MinMaxScaler(feature_range=(0, 2147483647))
item_df_prep[["release_date", "video_release_date"]] = time_scaler.fit_transform(
    item_df_prep[["release_date", "video_release_date"]]
)

In [86]:
item_df_prep["release_date"].fillna(item_df_prep["release_date"].mean(), inplace=True)
item_df_prep["video_release_date"].fillna(
    item_df_prep["video_release_date"].mean(), inplace=True
)

In [87]:
item_df_prep.drop(columns=["movie_title", "imdb_url"], inplace=True)

In [88]:
item_df_prep.head()

Unnamed: 0,movie_id,release_date,video_release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,2122012000.0,0.0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,2122012000.0,0.0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,2122012000.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,2122012000.0,0.0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,2122012000.0,0.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


## Fit and Recommend

In [89]:
def convert_features_to_rectools(df, id_column, feature_names):
    features_frames = []

    for feature in feature_names:
        feature_frame = df.reindex(columns=[id_column, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        features_frames.append(feature_frame)

    features_df = pd.concat(features_frames)
    return features_df

In [90]:
def get_ml_100k_df(
    dataset_path: pathlib.Path, split: str = "train", include_features=False
) -> Optional[pd.DataFrame]:
    """
    Read a MovieLens 100k dataset and return a DataFrame.

    Args:
        dataset_path (str): The path to the dataset file.
        split (str, optional): Specifies whether to read 'train' or 'test' split. Default is 'train'.

    Returns:
        Optional[pd.DataFrame]: Returns a DataFrame containing the specified split of the dataset.
        Returns None if 'split' is neither 'train' nor 'test'.

    Raises:
        FileNotFoundError: If the dataset file is not found.
        ValueError: If an invalid 'split' value is provided.

    Examples:
        # Read 'train' split of the dataset
        train_data = get_ml_100k_df('path/to/train_data.csv', split='train')

        # Read 'test' split of the dataset
        test_data = get_ml_100k_df('path/to/test_data.csv', split='test')
    """
    df: Optional[pd.DataFrame] = None
    user_features_df = None
    item_features_df = None

    if split == "train":
        df = read_csv(
            dataset_path,
            sep="\t",
            header=None,
            names=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
        )
    elif split == "test":
        df = read_csv(
            dataset_path,
            sep="\t",
            header=None,
            names=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
            usecols=[Columns.User, Columns.Item],
        )
    else:
        raise ValueError("Invalid split. Use 'train' or 'test'.")

    if include_features:
        user_features_df = convert_features_to_rectools(
            user_df_prep, "user_id", user_df_prep.columns[1:]
        )
        item_features_df = convert_features_to_rectools(
            item_df_prep, "movie_id", item_df_prep.columns[1:]
        )

    return df, user_features_df, item_features_df

## Fit and Recommend

In [91]:
def fit_recommend(
    model: ModelBase,
    train_df: DataFrame,
    user_features_df=None,
    item_features_df=None,
    k: int = 10,
    output_path: Optional[str] = None,
) -> DataFrame:
    """
    Fits the model and generates recommendations.

    Args:
        model (ModelBase): An instance of the model to be fitted and used for recommendations.
        train_df (DataFrame): DataFrame representing the training data.
        k (int, optional): Number of recommendations to generate per user. Defaults to 10.
        output_path (str, optional): Path to save the generated recommendations as a CSV file.
                                     Defaults to None.

    Returns:
        DataFrame: DataFrame containing the generated recommendations.

    Raises:
        Any specific exceptions that may occur during model fitting or recommendation generation.

    Examples:
        # Create an instance of the model (assuming it's already imported)
        model_instance = ModelBase()

        # Fit the model and generate recommendations
        recommendations = fit_recommend(
            model=model_instance,
            train_df=train_data,
            k=15,
            output_path='path/to/recommendations.csv'
        )
    """
    train_dataset = Dataset.construct(
        train_df,
        user_features_df=user_features_df,
        item_features_df=item_features_df,
        cat_user_features=["gender", "occupation", "zipcode"],
        cat_item_features=[
            "movie_title",
            # "release_date",
            # "video_release_date",
            "imdb_url",
        ],
    )

    model.fit(train_dataset)
    recos = model.recommend(
        users=train_df[Columns.User].unique(),
        dataset=train_dataset,
        k=k,
        filter_viewed=True,  # Remove items that the user has already interacted with
    )

    if output_path is not None:
        recos.to_csv(output_path, index=False)

    return recos

In [92]:
dataset_dir = Path("data/raw/ml-100k")
dataset_splits = {}

for i in range(5):
    dataset_splits[f"u{i + 1}"] = (
        get_ml_100k_df(
            dataset_dir / f"u{i + 1}.base", split="train", include_features=True
        ),
        get_ml_100k_df(
            dataset_dir / f"u{i + 1}.test", split="test", include_features=True
        ),
    )

In [93]:
models = {}

models["light-fm-wrapper-model"] = LightFMWrapperModel(
    model=LightFM(no_components=30)  # Specify the embeddings dimensions
)

In [94]:
dataset_splits["u1"][1]

(       user_id  item_id
 0            1        6
 1            1       10
 2            1       12
 3            1       14
 4            1       17
 ...        ...      ...
 19995      458      648
 19996      458     1101
 19997      459      934
 19998      460       10
 19999      462      682
 
 [20000 rows x 2 columns],
       id      value            feature
 0      1  25.757576                age
 1      2   69.69697                age
 2      3  24.242424                age
 3      4  25.757576                age
 4      5  39.393939                age
 ..   ...        ...                ...
 938  939          0  occupation_writer
 939  940          0  occupation_writer
 940  941          0  occupation_writer
 941  942          0  occupation_writer
 942  943          0  occupation_writer
 
 [25461 rows x 3 columns],
         id         value       feature
 0        1  2.122012e+09  release_date
 1        2  2.122012e+09  release_date
 2        3  2.122012e+09  release_date
 3

In [99]:
for model_name, model in models.items():
    for split_name, (train_df, test_df) in dataset_splits.items():
        output_path = f"data/interim/{model_name}_{split_name}.csv"
        _ = fit_recommend(
            model,
            train_df[0],
            # user_features_df=train_df[1],
            # item_features_df=train_df[2],
            output_path=output_path,
        )