### Description
This is a movie recommendation competition: the task is to recommend movies to users based on their past ratings and the information available about them.

In [1]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l- \ | done
[?25h  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=464211 sha256=c1e68e06d2e5587c51d51071f67f62477362ce523282014b34af47b6603178d8
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rec-sys-challenge-task-course-2023/train.csv
/kaggle/input/rec-sys-challenge-task-course-2023/kaggle_baseline.csv


In [3]:
# Load the training data and preview the first rows.
TRAIN_CSV = '/kaggle/input/rec-sys-challenge-task-course-2023/train.csv'
df = pd.read_csv(TRAIN_CSV)
df.head()

Unnamed: 0,user_id,title,movie_id,rating,release_date,sex,age
0,2592,Top Gun (1986),1101,4,Action|Romance,50,M
1,4318,12 Angry Men (1957),1203,4,Drama,25,M
2,2756,Robocop 2 (1990),2986,2,Action|Crime|Sci-Fi,18,M
3,1706,Modern Times (1936),3462,5,Comedy,25,M
4,4813,Milk Money (1994),276,3,Comedy|Romance,35,F


In [4]:
# The CSV header has the 'sex' and 'age' labels swapped: the column labelled
# 'sex' holds numbers (50, 25, ...) and the one labelled 'age' holds 'M'/'F'.
# Swap the two labels in a single rename, and only while 'sex' is still
# numeric, so that re-running this cell does not swap them back again.
if pd.api.types.is_numeric_dtype(df['sex']):
    df = df.rename(columns={'sex': 'age', 'age': 'sex'})
df.head()

Unnamed: 0,user_id,title,movie_id,rating,release_date,age,sex
0,2592,Top Gun (1986),1101,4,Action|Romance,50,M
1,4318,12 Angry Men (1957),1203,4,Drama,25,M
2,2756,Robocop 2 (1990),2986,2,Action|Crime|Sci-Fi,18,M
3,1706,Modern Times (1936),3462,5,Comedy,25,M
4,4813,Milk Money (1994),276,3,Comedy|Romance,35,F


### Prepare data and fit model with train dataset

In [5]:
from lightfm import LightFM
from lightfm.data import Dataset

# Fixed seed so the stochastic WARP training is repeatable between runs.
SEED = 42

# One row per (user, sex) and per (movie, genre string): the feature builders
# only need each pair once, not once per rating row.
# NOTE: the 'release_date' column actually holds the pipe-separated genre
# string (e.g. 'Action|Romance'); each distinct string is one item feature.
user_sex = df[['user_id', 'sex']].drop_duplicates()
movie_genres = df[['movie_id', 'release_date']].drop_duplicates()

dataset = Dataset()
dataset.fit(
    users=df['user_id'].unique(),
    items=df['movie_id'].unique(),
    user_features=df['sex'].unique(),
    item_features=df['release_date'].unique(),
)

# Column-wise zip instead of DataFrame.iterrows(): same tuples, far less
# per-row overhead.
interactions, weights = dataset.build_interactions(
    zip(df['user_id'], df['movie_id'], df['rating'])
)
user_features = dataset.build_user_features(
    (user_id, [sex]) for user_id, sex in zip(user_sex['user_id'], user_sex['sex'])
)
item_features = dataset.build_item_features(
    (movie_id, [genres])
    for movie_id, genres in zip(movie_genres['movie_id'], movie_genres['release_date'])
)

model = LightFM(loss='warp', random_state=SEED)
# `interactions` is binary; the ratings live in `weights`, so they must be
# passed as sample weights or every rating (1 or 5) counts the same.
model.fit(
    interactions,
    user_features=user_features,
    item_features=item_features,
    sample_weight=weights,
)

<lightfm.lightfm.LightFM at 0x7fc21762e770>

### Generate predictions for the test dataset and create a submission file

In [6]:
testset = pd.read_csv('/kaggle/input/rec-sys-challenge-task-course-2023/kaggle_baseline.csv')

# Take the external-id -> internal-index mappings from the fitted Dataset
# instead of rebuilding them by hand, so they cannot drift from the indices
# the model was trained on.
user_id_map, _, item_id_map, _ = dataset.mapping()


def get_top_n_recommendations(model, testset, user_id_map, item_id_map, n=25,
                              user_features=None, item_features=None):
    """Return the `n` highest-scoring items for every user in `testset`.

    Args:
        model: fitted model exposing `predict(user_id, item_ids, ...)`.
        testset: DataFrame with a 'user_id' column listing the users to score.
        user_id_map: mapping of external user id -> internal user index.
        item_id_map: mapping of external item id -> internal item index.
        n: number of items to return per user.
        user_features: user feature matrix to forward to `model.predict`;
            should be the one the model was fitted with.
        item_features: item feature matrix to forward to `model.predict`;
            should be the one the model was fitted with.

    Returns:
        dict mapping each external user id to a list of `n` external item
        ids, ordered from highest to lowest predicted score.

    Raises:
        KeyError: if a user in `testset` is missing from `user_id_map`.
    """
    # Loop-invariant: build the item id lists once, not once per user.
    original_item_ids = list(item_id_map.keys())
    internal_item_ids = np.fromiter(
        item_id_map.values(), dtype=np.int32, count=len(item_id_map)
    )

    # NOTE(review): items the user already rated are not filtered out;
    # confirm whether the competition expects them to be excluded.
    top_n = {}
    for user_id in testset['user_id'].unique():
        internal_user_id = user_id_map[user_id]
        scores = model.predict(
            internal_user_id,
            internal_item_ids,
            user_features=user_features,
            item_features=item_features,
        )
        # Negate so argsort yields indices in descending score order.
        top_items_indices = np.argsort(-scores)[:n]
        top_n[user_id] = [original_item_ids[idx] for idx in top_items_indices]
    return top_n


# The model was fitted with user and item feature matrices, so the same
# matrices must be supplied at prediction time; otherwise the scores are
# computed without the feature embeddings.
top_n_recommendations = get_top_n_recommendations(
    model,
    testset,
    user_id_map,
    item_id_map,
    n=25,
    user_features=user_features,
    item_features=item_features,
)

# Create submission file
with open('submission.csv', 'w', encoding='UTF8') as f:
    f.write('user_id,prediction\n')
    for user_id, recommendations in top_n_recommendations.items():
        f.write(f"{user_id},{' '.join(map(str, recommendations))}\n")