## Implement a Recommender System based on linear regression (ElasticNet) and evaluate its performance using Hit Ratio @ 10

In [16]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import ElasticNet
import numpy as np

In [17]:
from benchmark.calculate_hit_ratio_lr import calculate_hit_ratio_lr

#### Load the data

In [18]:
# File paths
data_file = '../data/raw/u.data'
user_file = '../data/raw/u.user'
item_file = '../data/raw/u.item'
genre_file = '../data/raw/u.genre'
occupation_file = '../data/raw/u.occupation'

In [19]:
# Ratings Data
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(data_file, sep='\t', names=ratings_columns, encoding='latin-1')

In [20]:
# User Data
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv(user_file, sep='|', names=user_columns, encoding='latin-1')

In [21]:
# Movie Data
item_columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
                'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
                'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                'Thriller', 'War', 'Western']
movies = pd.read_csv(item_file, sep='|', names=item_columns, encoding='latin-1')

In [22]:
# Occupation Data
occupations = pd.read_csv(occupation_file, names=['occupation'], encoding='latin-1')

In [23]:
def aggregate_genres(row):
    """
    Aggregate genres into a single column
    
    Parameters:
    row (Series): A row of the DataFrame
    
    Returns:
    str: A string of genres separated by a pipe (|)
    """
    return '|'.join([genre for genre in item_columns[6:] if row[genre] == 1])

In [24]:
# Aggregate genres for each movie
movies['genres'] = movies.apply(aggregate_genres, axis=1)

#### Calculate Hit Ratio @ 10 for each folder from (u1.base, u1.test) to (u5.base, u5.test) and (ua.base, ua.test) and (ub.base, ub.test)

In [25]:
def predict_ratings(model, X, user_ids, movie_ids):
    """
    Predict ratings for all user-movie pairs
    
    Parameters:
    model (sklearn.linear_model.ElasticNet): Trained model
    X (DataFrame): DataFrame of features
    user_ids (list): List of user IDs
    movie_ids (list): List of movie IDs
    
    Returns:
    DataFrame: DataFrame of predicted ratings for all user-movie pairs
    """
    # Generate predictions
    predictions = model.predict(X)

    # Create a DataFrame for predictions
    predicted_ratings_df = pd.DataFrame({
        'user_id': X['user_id'],
        'movie_id': X['movie_id'],
        'predicted_rating': predictions
    })

    # Convert to MultiIndex DataFrame
    predicted_ratings_df.set_index(['user_id', 'movie_id'], inplace=True)

    # Reindex to include all user-movie pairs, filling missing ones with average prediction
    full_index = pd.MultiIndex.from_product([user_ids, movie_ids], names=['user_id', 'movie_id'])
    predicted_ratings_df = predicted_ratings_df.reindex(full_index, fill_value=np.mean(predictions))

    return predicted_ratings_df

In [26]:
# Get all movie IDs
all_movies = ratings['movie_id'].unique()

In [27]:
# Initialize results list
fold_hit_ratio_results = []

In [28]:
# File paths for cross-validation sets
cross_validation_sets = [('u1.base', 'u1.test'), ('u2.base', 'u2.test'), ('u3.base', 'u3.test'),
                         ('u4.base', 'u4.test'), ('u5.base', 'u5.test'), ('ua.base', 'ua.test'),
                         ('ub.base', 'ub.test')]

In [29]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

In [32]:
for train_file, test_file in cross_validation_sets:
    # Load training and test data
    train_data = pd.read_csv(f'../data/raw/{train_file}', sep='\t', names=ratings_columns, encoding='latin-1')
    test_data = pd.read_csv(f'../data/raw/{test_file}', sep='\t', names=ratings_columns, encoding='latin-1')

    # Merge with user and movie data
    train_merged = pd.merge(train_data, users, on='user_id')
    train_merged = pd.merge(train_merged, movies[['movie_id', 'genres']], on='movie_id')
    test_merged = pd.merge(test_data, users, on='user_id')
    test_merged = pd.merge(test_merged, movies[['movie_id', 'genres']], on='movie_id')

    # One-Hot Encoding
    train_merged = train_merged.join(train_merged['genres'].str.get_dummies('|'))
    train_merged = pd.get_dummies(train_merged, columns=['gender', 'occupation'])
    test_merged = test_merged.join(test_merged['genres'].str.get_dummies('|'))
    test_merged = pd.get_dummies(test_merged, columns=['gender', 'occupation'])

    # Normalize age
    train_merged['age_normalized'] = scaler.fit_transform(train_merged['age'].values.reshape(-1, 1))
    test_merged['age_normalized'] = scaler.transform(test_merged['age'].values.reshape(-1, 1))

    # Prepare features and target
    X_train = train_merged.drop(['rating', 'timestamp', 'zip_code', 'age', 'genres'], axis=1)
    y_train = train_merged['rating']
    X_test = test_merged.drop(['rating', 'timestamp', 'zip_code', 'age', 'genres'], axis=1)
    y_test = test_merged['rating']

    # Train the model
    model = ElasticNet(alpha=1, l1_ratio=0.5)
    model.fit(X_train, y_train)
    
    # Get user IDs and movie IDs from the test set
    test_user_ids = test_data['user_id'].unique()
    test_movie_ids = test_data['movie_id'].unique()
    
    # Predict ratings for the test set
    predicted_ratings_df = predict_ratings(model, X_test, test_user_ids, test_movie_ids)
    
    # Calculate Hit Ratio @ 10
    hit_ratio = calculate_hit_ratio_lr(test_merged, predicted_ratings_df, all_movies)
    fold_hit_ratio_results.append(hit_ratio)
    print(f'Hit Ratio @ 10 for {train_file}: {hit_ratio:.4f}')


# Calculate the average Hit Ratio @ 10 across all folds
average_hit_ratio = np.mean(fold_hit_ratio_results)
print(f'Average Hit Ratio @ 10 across all folds: {average_hit_ratio:.4f}')


Hit Ratio @ 10 for u1.base: 0.6798
Hit Ratio @ 10 for u2.base: 0.7252
Hit Ratio @ 10 for u3.base: 0.6773
Hit Ratio @ 10 for u4.base: 0.6685
Hit Ratio @ 10 for u5.base: 0.6720
Hit Ratio @ 10 for ua.base: 0.6777
Hit Ratio @ 10 for ub.base: 0.6652
Average Hit Ratio @ 10 across all folds: 0.6794
