# Recommender Project

## Project Overview
This project develops a movie pairing recommender system that suggests films based on complementary genres, themes, or viewer preferences. It integrates data from MovieLens and IMDb to provide recommendations.

## Import Librairies

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import accuracy

## Get Movielens Datas

In [2]:
# Paths to the files
movies_path = 'ml-1m/movies.dat'
ratings_path = 'ml-1m/ratings.dat'
users_path = 'ml-1m/users.dat'

movies = pd.read_csv(movies_path, sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='latin1')
ratings = pd.read_csv(ratings_path, sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python', encoding='latin1')
users = pd.read_csv(users_path, sep='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python', encoding='latin1')

print("Movies Data:")
print(movies.head())

print("\nRatings Data:")
print(ratings.head())

print("\nUsers Data:")
print(users.head())

Movies Data:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy

Ratings Data:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

Users Data:
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455


### Drop unnecessary columns

In [3]:
users.drop(columns=['Zip-code'], inplace=True)
users.drop(columns=['Occupation'], inplace=True)
ratings.drop(columns=['Timestamp'], inplace=True)

### Merge the datas

In [4]:
movies_ratings = pd.merge(movies, ratings, on='MovieID')
movielens_data = pd.merge(movies_ratings, users, on='UserID')

print(movielens_data.head())
print("\nlength of the dataframe : ", len(movielens_data))

   MovieID             Title                       Genres  UserID  Rating  \
0        1  Toy Story (1995)  Animation|Children's|Comedy       1       5   
1        1  Toy Story (1995)  Animation|Children's|Comedy       6       4   
2        1  Toy Story (1995)  Animation|Children's|Comedy       8       4   
3        1  Toy Story (1995)  Animation|Children's|Comedy       9       5   
4        1  Toy Story (1995)  Animation|Children's|Comedy      10       5   

  Gender  Age  
0      F    1  
1      F   50  
2      M   25  
3      M   25  
4      F   35  

length of the dataframe :  1000209


## Get IMDB datas

In [5]:
imdb_basics =  pd.read_csv('title.basics.tsv', delimiter='\t', low_memory=False)
imdb_ratings = pd.read_csv('title.ratings.tsv', delimiter='\t', low_memory=False)

print("basics :")
print(imdb_basics.head())

print("\nratings :")
print(imdb_ratings.head())

basics :
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  
0       0      1894      \N              1         Documentary,Short  
1       0      1892      \N              5           Animation,Short  
2       0      1892      \N              5  Animation,Comedy,Romance  
3       0      1892      \N             12           Animation,Short  
4       0      1893      \N              1              Comedy,Short  

ratings :
      tconst  averageRating  numVotes
0  tt0000001            5.7      2059
1  tt0000002            5.6      

### Drop unnecessary columns

In [6]:
imdb_basics.drop(columns=['endYear'], inplace=True)
imdb_basics.drop(columns=['runtimeMinutes'], inplace=True)
imdb_basics.drop(columns=['isAdult'], inplace=True)
imdb_ratings.drop(columns=['numVotes'], inplace=True)

### Merge the datas

In [7]:
imdb_data = imdb_basics.merge(imdb_ratings, on='tconst')

print(imdb_data.head())
print("\nlength of the dataframe : ", len(imdb_data))

      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  startYear                    genres  averageRating  
0      1894         Documentary,Short            5.7  
1      1892           Animation,Short            5.6  
2      1892  Animation,Comedy,Romance            6.5  
3      1892           Animation,Short            5.4  
4      1893              Comedy,Short            6.2  

length of the dataframe :  1451832


## Preprocessing

### Data Cleaning and Preparation

In [8]:
# Replace '\\N' values with NaN for numeric columns
imdb_data['startYear'] = imdb_data['startYear'].replace('\\N', np.nan).astype(float)
imdb_data['averageRating'] = imdb_data['averageRating'].replace('\\N', np.nan).astype(float)

# Replace NaN values with the mean of the column for 'averageRating'
imdb_data['averageRating'].fillna(imdb_data['averageRating'].mean(), inplace=True)

# Replace '\\N' values with 'Unknown' for categorical columns
imdb_data['genres'] = imdb_data['genres'].replace('\\N', 'Unknown')
imdb_data['titleType'] = imdb_data['titleType'].replace('\\N', 'Unknown')
imdb_data['primaryTitle'] = imdb_data['primaryTitle'].replace('\\N', 'Unknown')
imdb_data['originalTitle'] = imdb_data['originalTitle'].replace('\\N', 'Unknown')

# Fill NaNs in 'startYear' with a default value (e.g., 0)
imdb_data['startYear'].fillna(0, inplace=True)

# Convert the 'startYear' column to an integer data type
imdb_data['startYear'] = imdb_data['startYear'].astype(int)

# Print a sample of the data to verify that replacements were made successfully
print(imdb_data[['startYear', 'averageRating', 'genres', 'titleType', 'primaryTitle', 'originalTitle']].head())

# Print the data types of the columns to ensure they are correct for analysis
print(imdb_data.dtypes)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imdb_data['averageRating'].fillna(imdb_data['averageRating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  imdb_data['startYear'].fillna(0, inplace=True)


   startYear  averageRating                    genres titleType  \
0       1894            5.7         Documentary,Short     short   
1       1892            5.6           Animation,Short     short   
2       1892            6.5  Animation,Comedy,Romance     short   
3       1892            5.4           Animation,Short     short   
4       1893            6.2              Comedy,Short     short   

             primaryTitle           originalTitle  
0              Carmencita              Carmencita  
1  Le clown et ses chiens  Le clown et ses chiens  
2          Pauvre Pierrot          Pauvre Pierrot  
3             Un bon bock             Un bon bock  
4        Blacksmith Scene        Blacksmith Scene  
tconst            object
titleType         object
primaryTitle      object
originalTitle     object
startYear          int64
genres            object
averageRating    float64
dtype: object


### Create Clean Title and Year

In [9]:
# Strip and lowercase titles for better matching
movielens_data['CleanTitle'] = movielens_data['Title'].str.extract(r'^(.*?) \(\d{4}\)$')[0].str.lower().str.strip()
imdb_data['CleanTitle'] = imdb_data['primaryTitle'].str.lower().str.strip()

# Convert years to integers for exact matching
movielens_data['Year'] = movielens_data['Title'].str.extract(r'\((\d{4})\)')[0].astype(int)
imdb_data['Year'] = imdb_data['startYear'].astype(int)

### Fuse MovieLens Dataset and Imdb Dataset

In [10]:
merged_data = pd.merge(movielens_data, imdb_data, left_on=['CleanTitle', 'Year'], right_on=['CleanTitle', 'Year'])

### Data Cleanup and Rating Normalization

In [11]:
# Scaling MovieLens ratings from 1-5 to 1-10
merged_data['ScaledRating'] = merged_data['Rating'] * 2
merged_data['CompositeRating'] = (merged_data['ScaledRating'] + merged_data['averageRating']) / 2

merged_data.drop(columns=['tconst', 'primaryTitle', 'originalTitle', 'startYear', 'Title', 'Genres'], inplace=True)
merged_data.drop(columns=['averageRating', 'ScaledRating', 'Rating'], inplace=True)

print(merged_data.head())
print("\nlength of the dataframe: ", len(merged_data))

   MovieID  UserID Gender  Age CleanTitle  Year titleType  \
0        1       1      F    1  toy story  1995     movie   
1        1       6      F   50  toy story  1995     movie   
2        1       8      M   25  toy story  1995     movie   
3        1       9      M   25  toy story  1995     movie   
4        1      10      F   35  toy story  1995     movie   

                       genres  CompositeRating  
0  Adventure,Animation,Comedy             9.15  
1  Adventure,Animation,Comedy             8.15  
2  Adventure,Animation,Comedy             8.15  
3  Adventure,Animation,Comedy             9.15  
4  Adventure,Animation,Comedy             9.15  

length of the dataframe:  805610


## Feature Engineering

In [12]:
# Encoding genres
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(merged_data['genres'].str.split(','))
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_, index=merged_data.index)

# Encoding age
age_bins = [0, 18, 25, 35, 45, 50, 56, np.inf]
age_labels = ['0-18', '18-25', '25-35', '35-45', '45-50', '50-56', '56+']
merged_data['age_group'] = pd.cut(merged_data['Age'], bins=age_bins, labels=age_labels)
age_dummies = pd.get_dummies(merged_data['age_group'], prefix='age')

# Encoding gender
gender_dummies = pd.get_dummies(merged_data['Gender'], prefix='gender')

# Concatenating new encoded columns with original data
merged_data = pd.concat([merged_data, genres_df, age_dummies, gender_dummies], axis=1)

# Dropping original categorical columns that are now redundant
merged_data.drop(columns=['genres', 'age_group', 'Gender'], inplace=True)

In [13]:
print(merged_data.head())
print("\nlength of the dataframe: ", len(merged_data))

   MovieID  UserID  Age CleanTitle  Year titleType  CompositeRating  Action  \
0        1       1    1  toy story  1995     movie             9.15       0   
1        1       6   50  toy story  1995     movie             8.15       0   
2        1       8   25  toy story  1995     movie             8.15       0   
3        1       9   25  toy story  1995     movie             9.15       0   
4        1      10   35  toy story  1995     movie             9.15       0   

   Adult  Adventure  ...  Western  age_0-18  age_18-25  age_25-35  age_35-45  \
0      0          1  ...        0      True      False      False      False   
1      0          1  ...        0     False      False      False      False   
2      0          1  ...        0     False       True      False      False   
3      0          1  ...        0     False       True      False      False   
4      0          1  ...        0     False      False       True      False   

   age_45-50  age_50-56  age_56+  gender_F  

## Creating User-Item Interaction Matrix

In [14]:
user_item = merged_data.pivot_table(index='UserID', columns='CleanTitle', values='CompositeRating')
print("Size of user_item matrix: ", user_item.shape)

Size of user_item matrix:  (6040, 2341)


### Handling Missing Values in the User-Item Matrix

In [15]:
user_item_filled = user_item.apply(lambda x: x.fillna(x.mean()), axis=0)

global_mean = user_item_filled.stack().mean() 
user_item_filled = user_item_filled.fillna(global_mean)

print("Size of user_item matrix: \n", user_item_filled.head())

Size of user_item matrix: 
 CleanTitle  'til there was you  1-900  10 things i hate about you  \
UserID                                                              
1                     5.092308    5.6                    7.072857   
2                     5.092308    5.6                    7.072857   
3                     5.092308    5.6                    7.072857   
4                     5.092308    5.6                    7.072857   
5                     5.092308    5.6                    7.072857   

CleanTitle  101 dalmatians  12 angry men  2 days in the valley  20 dates  \
UserID                                                                     
1                 5.896703      8.795455              6.533217  5.656115   
2                 5.896703      8.795455              6.533217  5.656115   
3                 5.896703      8.795455              6.533217  5.656115   
4                 5.896703      8.795455              6.533217  5.656115   
5                 5.896703      

## Data Splitting for Model Evaluation

In [16]:
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

## Model Configuration and Evaluation

In [17]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(train_data[['UserID', 'MovieID', 'CompositeRating']], reader)
algo = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)

# Training and evaluating the model using 5-fold cross-validation
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8727  0.8783  0.8785  0.8733  0.8764  0.8758  0.0025  
MAE (testset)     0.6858  0.6916  0.6914  0.6872  0.6891  0.6890  0.0023  
Fit time          10.20   9.58    10.79   10.87   8.81    10.05   0.77    
Test time         1.16    1.74    1.63    1.64    1.51    1.54    0.20    


{'test_rmse': array([0.87265639, 0.87833627, 0.87848166, 0.87330021, 0.8763585 ]),
 'test_mae': array([0.68582327, 0.69160953, 0.69137669, 0.68715816, 0.68906172]),
 'fit_time': (10.203290700912476,
  9.581296682357788,
  10.790733575820923,
  10.872620582580566,
  8.81301760673523),
 'test_time': (1.1573431491851807,
  1.7381346225738525,
  1.6338632106781006,
  1.636031150817871,
  1.5122034549713135)}

## Training and Testing the Model

In [18]:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=0.25)

# Train the SVD algorithm on the training set
algo.fit(trainset)

# Test the trained algorithm on the testing set
predictions = algo.test(testset)

# Calculate and print the RMSE to evaluate the model's accuracy
rmse = accuracy.rmse(predictions)
print(f'RMSE on test set is : {rmse}')

RMSE: 0.8815
RMSE on test set is : 0.8814854130452326


## Calculating Couple Scores

In [19]:
def couple_score(user1, user2, movie_id):
    
    # Predict the rating for each user
    score1 = algo.predict(user1, movie_id).est
    score2 = algo.predict(user2, movie_id).est

    # Return the average of both scores
    return (score1 + score2) / 2

## Generating Movie Recommendations for Couples

In [20]:
def recommend_for_couple(user1, user2, merged_data):
    
    # Extract the IDs of all unique movies in the data
    movies = set(merged_data['MovieID'])
    scores = []

    for movie in movies:

        # Calculate the couple score for each movie
        score = couple_score(user1, user2, movie)

        # Get the movie title from the DataFrame
        title = merged_data[merged_data['MovieID'] == movie]['CleanTitle'].values[0]
        scores.append((title, score))

    # Sort the movies by score in descending order
    scores.sort(key=lambda x: x[1], reverse=True)

    # Return the top 10 recommended movies
    return scores[:10]

recommended_movies = recommend_for_couple(1, 2, merged_data)

for title, score in recommended_movies:
    print(f"('{title}', {score})")


('schindler's list', 9.209888025207805)
('forrest gump', 8.998143340222164)
('12 angry men', 8.858101281429853)
('saving private ryan', 8.825859000170862)
('to kill a mockingbird', 8.720936104715706)
('rear window', 8.689598398280772)
('it's a wonderful life', 8.65550308028416)
('stop making sense', 8.648099606981534)
('one flew over the cuckoo's nest', 8.64466817673021)
('back to the future', 8.635209834421662)
