# Recommender Project

## Import Librairies

In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, SVD, accuracy
from sklearn.metrics import mean_absolute_error
from collections import defaultdict
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.model_selection import cross_validate
from tqdm import tqdm

## Get Movielens Datas

In [2]:
# Paths to the files
movies_path = 'ml-1m/movies.dat'
ratings_path = 'ml-1m/ratings.dat'
users_path = 'ml-1m/users.dat'

#movies_path = '/kaggle/input/moviesetdataset/movies.dat'
#ratings_path = '/kaggle/input/moviesetdataset/ratings.dat'
#users_path = '/kaggle/input/moviesetdataset/users.dat'

movies = pd.read_csv(movies_path, sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='latin1')
ratings = pd.read_csv(ratings_path, sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python', encoding='latin1')
users = pd.read_csv(users_path, sep='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python', encoding='latin1')

print("Movies Data:")
print(movies.head())

print("\nRatings Data:")
print(ratings.head())

print("\nUsers Data:")
print(users.head())

Movies Data:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy

Ratings Data:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

Users Data:
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455


### Drop unnecessary columns

In [3]:
users.drop(columns=['Zip-code'], inplace=True)
users.drop(columns=['Occupation'], inplace=True)
ratings.drop(columns=['Timestamp'], inplace=True)

### Merge the datas

In [4]:
movies_ratings = pd.merge(movies, ratings, on='MovieID')
movielens_data = pd.merge(movies_ratings, users, on='UserID')

print(movielens_data)

         MovieID                  Title                       Genres  UserID  \
0              1       Toy Story (1995)  Animation|Children's|Comedy       1   
1              1       Toy Story (1995)  Animation|Children's|Comedy       6   
2              1       Toy Story (1995)  Animation|Children's|Comedy       8   
3              1       Toy Story (1995)  Animation|Children's|Comedy       9   
4              1       Toy Story (1995)  Animation|Children's|Comedy      10   
...          ...                    ...                          ...     ...   
1000204     3952  Contender, The (2000)               Drama|Thriller    5812   
1000205     3952  Contender, The (2000)               Drama|Thriller    5831   
1000206     3952  Contender, The (2000)               Drama|Thriller    5837   
1000207     3952  Contender, The (2000)               Drama|Thriller    5927   
1000208     3952  Contender, The (2000)               Drama|Thriller    5998   

         Rating Gender  Age  
0        

## Get IMDB datas

In [5]:
basics_path = 'title.basics.tsv'
ratings_path = 'title.ratings.tsv'

#basics_path = '/kaggle/input/movies/title.basics.tsv'
#ratings_path = '/kaggle/input/movies/title.ratings.tsv'

imdb_basics =  pd.read_csv(basics_path, delimiter='\t', low_memory=False)
imdb_ratings = pd.read_csv(ratings_path, delimiter='\t', low_memory=False)

print("basics :")
print(imdb_basics.head())

print("\nratings :")
print(imdb_ratings.head())

basics :
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  
0       0      1894      \N              1         Documentary,Short  
1       0      1892      \N              5           Animation,Short  
2       0      1892      \N              5  Animation,Comedy,Romance  
3       0      1892      \N             12           Animation,Short  
4       0      1893      \N              1              Comedy,Short  

ratings :
      tconst  averageRating  numVotes
0  tt0000001            5.7      2059
1  tt0000002            5.6      

### Drop unnecessary columns

In [6]:
imdb_basics.drop(columns=['endYear'], inplace=True)
imdb_basics.drop(columns=['runtimeMinutes'], inplace=True)
imdb_basics.drop(columns=['isAdult'], inplace=True)
imdb_ratings.drop(columns=['numVotes'], inplace=True)

### Merge the datas

In [7]:
imdb_data = imdb_basics.merge(imdb_ratings, on='tconst')

print(imdb_data)

            tconst  titleType                primaryTitle  \
0        tt0000001      short                  Carmencita   
1        tt0000002      short      Le clown et ses chiens   
2        tt0000003      short              Pauvre Pierrot   
3        tt0000004      short                 Un bon bock   
4        tt0000005      short            Blacksmith Scene   
...            ...        ...                         ...   
1451827  tt9916730      movie                      6 Gunn   
1451828  tt9916766  tvEpisode              Episode #10.15   
1451829  tt9916778  tvEpisode                      Escape   
1451830  tt9916840  tvEpisode  Horrid Henry's Comic Caper   
1451831  tt9916880  tvEpisode   Horrid Henry Knows It All   

                      originalTitle startYear                       genres  \
0                        Carmencita      1894            Documentary,Short   
1            Le clown et ses chiens      1892              Animation,Short   
2                    Pauvre Pierr

## Preprocessing

### Data Cleaning and Preparation

In [8]:
# Replace '\\N' values with NaN for numeric columns
imdb_data['startYear'] = imdb_data['startYear'].replace('\\N', np.nan).astype(float)
imdb_data['averageRating'] = imdb_data['averageRating'].replace('\\N', np.nan).astype(float)

# Replace NaN values with the mean of the column for 'averageRating'
imdb_data['averageRating'].fillna(imdb_data['averageRating'].mean(), inplace=True)

# Replace '\\N' values with 'Unknown' for categorical columns
imdb_data['genres'] = imdb_data['genres'].replace('\\N', 'Unknown')
imdb_data['titleType'] = imdb_data['titleType'].replace('\\N', 'Unknown')
imdb_data['primaryTitle'] = imdb_data['primaryTitle'].replace('\\N', 'Unknown')
imdb_data['originalTitle'] = imdb_data['originalTitle'].replace('\\N', 'Unknown')

# Fill NaNs in 'startYear' with a default value
imdb_data['startYear'].fillna(0, inplace=True)

# Convert the 'startYear' column to an integer data type
imdb_data['startYear'] = imdb_data['startYear'].astype(int)

print(imdb_data[['startYear', 'averageRating', 'genres', 'titleType', 'primaryTitle', 'originalTitle']].head())
print(imdb_data.dtypes)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imdb_data['averageRating'].fillna(imdb_data['averageRating'].mean(), inplace=True)


   startYear  averageRating                    genres titleType  \
0       1894            5.7         Documentary,Short     short   
1       1892            5.6           Animation,Short     short   
2       1892            6.5  Animation,Comedy,Romance     short   
3       1892            5.4           Animation,Short     short   
4       1893            6.2              Comedy,Short     short   

             primaryTitle           originalTitle  
0              Carmencita              Carmencita  
1  Le clown et ses chiens  Le clown et ses chiens  
2          Pauvre Pierrot          Pauvre Pierrot  
3             Un bon bock             Un bon bock  
4        Blacksmith Scene        Blacksmith Scene  
tconst            object
titleType         object
primaryTitle      object
originalTitle     object
startYear          int64
genres            object
averageRating    float64
dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imdb_data['startYear'].fillna(0, inplace=True)


### Create Clean Title and Year

In [9]:
# Strip and lowercase titles for better matching
movielens_data['CleanTitle'] = movielens_data['Title'].str.extract(r'^(.*?) \(\d{4}\)$')[0].str.lower().str.strip()
imdb_data['CleanTitle'] = imdb_data['primaryTitle'].str.lower().str.strip()

# Convert years to integers for exact matching
movielens_data['Year'] = movielens_data['Title'].str.extract(r'\((\d{4})\)')[0].astype(int)
imdb_data['Year'] = imdb_data['startYear'].astype(int)

### Fuse MovieLens Dataset and Imdb Dataset

In [10]:
merged_data = pd.merge(movielens_data, imdb_data, left_on=['CleanTitle', 'Year'], right_on=['CleanTitle', 'Year'])

### Data Cleanup and Rating Normalization

In [11]:
# Scaling MovieLens ratings from 1-5 to 1-10
merged_data['ScaledRating'] = merged_data['Rating'] * 2
merged_data['CompositeRating'] = (merged_data['ScaledRating'] + merged_data['averageRating']) / 2

merged_data.drop(columns=['tconst', 'primaryTitle', 'originalTitle', 'startYear', 'Title', 'Genres'], inplace=True)
merged_data.drop(columns=['averageRating', 'ScaledRating', 'Rating'], inplace=True)

print(merged_data)

        MovieID  UserID Gender  Age        CleanTitle  Year titleType  \
0             1       1      F    1         toy story  1995     movie   
1             1       6      F   50         toy story  1995     movie   
2             1       8      M   25         toy story  1995     movie   
3             1       9      M   25         toy story  1995     movie   
4             1      10      F   35         toy story  1995     movie   
...         ...     ...    ...  ...               ...   ...       ...   
805605     3951    3940      M   35  two family house  2000     movie   
805606     3951    3985      M   45  two family house  2000     movie   
805607     3951    4025      F   56  two family house  2000     movie   
805608     3951    4727      F   56  two family house  2000     movie   
805609     3951    5333      F   25  two family house  2000     movie   

                            genres  CompositeRating  
0       Adventure,Animation,Comedy             9.15  
1       Adventu

## Feature Engineering

In [12]:
# Encode genres
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(merged_data['genres'].str.split(','))
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_, index=merged_data.index)

# Encode age groups
age_bins = [0, 18, 25, 35, 45, 50, 56, np.inf]
age_labels = ['0-18', '18-25', '25-35', '35-45', '45-50', '50-56', '56+']
merged_data['age_group'] = pd.cut(merged_data['Age'], bins=age_bins, labels=age_labels)
age_dummies = pd.get_dummies(merged_data['age_group'], prefix='age')

# Encode gender
gender_dummies = pd.get_dummies(merged_data['Gender'], prefix='gender')

# Combine all features
merged_data = pd.concat([merged_data, genres_df, age_dummies, gender_dummies], axis=1)

# Drop original columns that are not numerical
merged_data.drop(columns=['genres', 'age_group', 'Gender'], inplace=True)

print(merged_data)

        MovieID  UserID  Age        CleanTitle  Year titleType  \
0             1       1    1         toy story  1995     movie   
1             1       6   50         toy story  1995     movie   
2             1       8   25         toy story  1995     movie   
3             1       9   25         toy story  1995     movie   
4             1      10   35         toy story  1995     movie   
...         ...     ...  ...               ...   ...       ...   
805605     3951    3940   35  two family house  2000     movie   
805606     3951    3985   45  two family house  2000     movie   
805607     3951    4025   56  two family house  2000     movie   
805608     3951    4727   56  two family house  2000     movie   
805609     3951    5333   25  two family house  2000     movie   

        CompositeRating  Action  Adult  Adventure  ...  Western  age_0-18  \
0                  9.15       0      0          1  ...        0      True   
1                  8.15       0      0          1  ..

## Creating User-Item Interaction Matrix

In [13]:
# Aggregate ratings to ensure unique combinations of UserID and MovieID
ratings_aggregated = merged_data.groupby(['UserID', 'MovieID'], as_index=False)['CompositeRating'].mean()

# Create user-item interaction matrix
interaction_matrix = ratings_aggregated.pivot(index='UserID', columns='MovieID', values='CompositeRating').fillna(0)

print(interaction_matrix)

MovieID  1     2     3     4     5     6     7     8     9     10    ...  \
UserID                                                               ...   
1        9.15   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
2        0.00   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
3        0.00   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4        0.00   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5        0.00   0.0   0.0   0.0   0.0   5.9   0.0   0.0   0.0   0.0  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
6036     0.00   0.0   0.0   5.0   0.0   6.9   0.0   0.0   0.0   0.0  ...   
6037     0.00   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
6038     0.00   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
6039     0.00   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
6040     7.15   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

MovieID  39

## Data Splitting for Model

In [42]:
# Convert the merged data to the format required by Surprise
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(pd.DataFrame({
    'userID': ratings_aggregated['UserID'], 
    'itemID': ratings_aggregated['MovieID'], 
    'rating': ratings_aggregated['CompositeRating']
}), reader)

# Split the data into training and test sets
trainset, testset = surprise_train_test_split(data, test_size=0.2)

## Model Configuration and Evaluation

In [75]:
# Initialize the SVD algorithm
svd = SVD(n_factors=50) 

# Train the algorithm on the training set
svd.fit(trainset)

# Predict ratings for the test set
predictions = svd.test(testset)

# Compute and print RMSE
rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse}')

# Extract predicted and actual ratings from the predictions
predicted_ratings = np.array([pred.est for pred in predictions])
actual_ratings = np.array([pred.r_ui for pred in predictions])

# Calculate MAE
mae = mean_absolute_error(actual_ratings, predicted_ratings)
print(f'MAE: {mae}')

RMSE: 0.8799
RMSE: 0.8799287059736176
MAE: 0.6934325514378185


In [80]:
actual_ratings = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    actual_ratings[uid].append((iid, true_r))
    
def precision_recall_at_k(predictions, actual_ratings, k=10, threshold=4.0):
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()

    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in actual_ratings[uid])
        
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    avg_prec = sum(prec for prec in precisions.values()) / len(precisions)
    avg_rec = sum(rec for rec in recalls.values()) / len(recalls)

    return avg_prec, avg_rec

average_precision, average_recall = precision_recall_at_k(predictions, actual_ratings, k=10, threshold=4.0)
print(f'Average Precision: {average_precision}')
print(f'Average Recall: {average_recall}')

Average Precision: 0.9933100155451922
Average Recall: 0.6740192666043744


In [81]:
def f1_score(precision, recall):
    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

f1 = f1_score(average_precision, average_recall)
print(f'F1 Score at k: {f1}')

F1 Score at k: 0.803092820783906


## Recommender Algorithm

In [79]:
def get_top_recommendations(user1_id, user2_id, model, interaction_matrix, data):
    
    # Retrieve indices of movies that user1 and user2 has not rated yet.
    user1_unrated_movies = interaction_matrix.loc[user1_id][interaction_matrix.loc[user1_id] == 0].index.tolist()
    user2_unrated_movies = interaction_matrix.loc[user2_id][interaction_matrix.loc[user2_id] == 0].index.tolist()
    
    # Find the common movies that neither user1 nor user2 has rated.
    common_movies = set(user1_unrated_movies).intersection(set(user2_unrated_movies))
    
    averaged_predictions = []
    
    # Calculate the average predicted ratings for each movie that both users have not rated.
    for movie in common_movies:
        prediction1 = model.predict(user1_id, movie).est
        prediction2 = model.predict(user2_id, movie).est
        avg_prediction = (prediction1 + prediction2) / 2
        
        # Retrieve the movie title if it exists
        movie_titles = data[data['MovieID'] == movie]['CleanTitle'].values
        
        # Append the title and average prediction to the list
        if len(movie_titles) > 0:
            movie_title = movie_titles[0]
            averaged_predictions.append((movie_title, avg_prediction))
        else:
            print(f"Movie ID {movie} not found in data")

    averaged_predictions_df = pd.DataFrame(averaged_predictions, columns=['MovieTitle', 'Average_Prediction'])

    # You can change head value to get more or less movies recommendations
    top_predictions = averaged_predictions_df.sort_values(by='Average_Prediction', ascending=False).head(10)
    top_predictions.reset_index(drop=True, inplace=True)
    return top_predictions

# Example usage
# TOP 10 RECOMMENDATION FOR USER 1 AND USER 2 !
user1_id = 1
user2_id = 4
top_recommendations = get_top_recommendations(user1_id, user2_id, svd, interaction_matrix, merged_data)
print(top_recommendations)

              MovieTitle  Average_Prediction
0  it's a wonderful life            8.845860
1     american history x            8.734271
2           12 angry men            8.664325
3        cinema paradiso            8.663128
4                  alien            8.642846
5            rear window            8.612820
6         paths of glory            8.603327
7            city lights            8.596598
8           pulp fiction            8.570728
9                 aliens            8.570118
