In [1]:
# type: ignore
import pandas as pd 
from spotlight.datasets.movielens import get_movielens_dataset
from faker import Faker
import random
from collections import Counter

## A) Load

In [5]:
# Fetch the MovieLens 100K interactions through spotlight
dataset = get_movielens_dataset(variant='100K')

# Build one row per rating, then turn the second-resolution timestamps into datetimes
ratings = pd.DataFrame(
    dict(
        user_id=dataset.user_ids,
        movie_id=dataset.item_ids,
        rating=dataset.ratings,
        timestamp=dataset.timestamps,
    )
).assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'))

# User attributes: pipe-delimited file read without a header row
users_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv(
    'datasets/ml-100k/u.user',
    names=users_columns,
    header=None,
    sep='|',
    encoding='ISO-8859-1',
)

# Movie metadata: five descriptive fields followed by the 19 genre_<i> columns
column_names = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    *(f'genre_{i}' for i in range(19)),
]
movies_metadata = pd.read_csv(
    'datasets/ml-100k/u.item',
    names=column_names,
    header=None,
    sep='|',
    encoding='ISO-8859-1',
)

# Genre lookup table: genre name paired with its numeric id
genres = pd.read_csv(
    'datasets/ml-100k/u.genre',
    names=['genre', 'genre_id'],
    header=None,
    sep='|',
)

## B) Merge all

In [6]:
# Collect each movie's 19 genre_<i> indicator columns into a single list-valued column
movies_metadata['genres'] = movies_metadata[[f'genre_{i}' for i in range(19)]].values.tolist()

# Attach title, release date and genre indicators to every rating
full_data = pd.merge(ratings, movies_metadata[['movie_id', 'movie_title', 'release_date', 'genres']], on='movie_id')

# Translate the indicator lists into genre names; an indicator's position in the
# list is used as the genre_id to look up
genre_dict = dict(zip(genres['genre_id'], genres['genre']))
full_data['genres_name'] = full_data['genres'].apply(lambda flags: [genre_dict[i] for i, flag in enumerate(flags) if flag == 1])
# Reassign instead of inplace=True so re-running the cell starts from a fresh frame
full_data = full_data.drop(columns=['genres'])

# Attach the user attributes to every rating (ratings without a matching user are kept)
full_data = full_data.merge(users, on='user_id', how='left')

# `timestamp` was already converted to datetime when `ratings` was built, so only
# release_date needs parsing here; values that cannot be parsed become NaT
full_data['release_date'] = pd.to_datetime(full_data['release_date'], errors='coerce')

full_data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,genres_name,age,gender,occupation,zip_code
0,196,242,3.0,1997-12-04 15:55:49,Kolya (1996),1997-01-24,[Comedy],49,M,writer,55105
1,186,302,3.0,1998-04-04 19:22:22,L.A. Confidential (1997),1997-01-01,"[Crime, Film-Noir, Mystery, Thriller]",39,F,executive,0
2,22,377,1.0,1997-11-07 07:18:36,Heavyweights (1994),1994-01-01,"[Children's, Comedy]",25,M,writer,40206
3,244,51,2.0,1997-11-27 05:02:03,Legends of the Fall (1994),1994-01-01,"[Drama, Romance, War, Western]",28,M,technician,80525
4,166,346,1.0,1998-02-02 05:33:16,Jackie Brown (1997),1997-01-01,"[Crime, Drama]",47,M,educator,55113


## C) Movies database without additional data

In [7]:
# Work on an independent copy of the movie metadata
movies_metadata_complet = movies_metadata.copy()

# Gather the 19 genre_<i> indicator columns row by row and translate them to genre names
genre_flag_columns = [f'genre_{i}' for i in range(19)]
genre_dict = dict(zip(genres['genre_id'], genres['genre']))
movies_metadata_complet['genres'] = movies_metadata_complet[genre_flag_columns].values.tolist()
movies_metadata_complet['genres_name'] = [
    [genre_dict[position] for position, flag in enumerate(flags) if flag == 1]
    for flags in movies_metadata_complet['genres']
]

# Per-movie rating statistics computed from the merged ratings table
ratings_data = full_data.groupby('movie_id')['rating'].agg(['mean', 'min', 'max'])

# Drop the raw indicator columns, parse release_date (unparseable values become NaT)
# and append the rating statistics looked up by movie_id
movies_metadata_complet = (
    movies_metadata_complet
    .drop(columns=['genres'] + genre_flag_columns)
    .assign(
        release_date=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce'),
        avg_rating=lambda frame: frame['movie_id'].map(ratings_data['mean']),
        min_rating=lambda frame: frame['movie_id'].map(ratings_data['min']),
        max_rating=lambda frame: frame['movie_id'].map(ratings_data['max']),
    )
)

# Display the resulting movie table
movies_metadata_complet

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,genres_name,avg_rating,min_rating,max_rating
0,1,Toy Story (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,"[Animation, Children's, Comedy]",3.878319,1.0,5.0
1,2,GoldenEye (1995),1995-01-01,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,"[Action, Adventure, Thriller]",3.206107,1.0,5.0
2,3,Four Rooms (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,[Thriller],3.033333,1.0,5.0
3,4,Get Shorty (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,"[Action, Comedy, Drama]",3.550239,1.0,5.0
4,5,Copycat (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Copycat%20(1995),"[Crime, Drama, Thriller]",3.302325,1.0,5.0
...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),1998-02-06,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,[Drama],1.000000,1.0,1.0
1678,1679,B. Monkey (1998),1998-02-06,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,"[Romance, Thriller]",3.000000,3.0,3.0
1679,1680,Sliding Doors (1998),1998-01-01,,http://us.imdb.com/Title?Sliding+Doors+(1998),"[Drama, Romance]",2.000000,2.0,2.0
1680,1681,You So Crazy (1994),1994-01-01,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,[Comedy],3.000000,3.0,3.0


## D) Users database

In [8]:
# Seed both random sources used by generate_user_data (Faker for the names, the
# stdlib `random` module for the email domain) so that a fresh
# "Restart Kernel -> Run All" reproduces the same synthetic identities.
RANDOM_SEED = 42
fake = Faker()
Faker.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

def generate_user_data():
    """Create one synthetic identity.

    Returns:
        tuple: (first name, last name, email), where the email is the
        lower-cased "<first>.<last>" followed by a randomly chosen domain.
    """
    first_name = fake.first_name()
    last_name = fake.last_name()
    email_domain = random.choice(["gmail.com", "yahoo.com", "outlook.com"])
    return (
        first_name,
        last_name,
        f"{first_name.lower()}.{last_name.lower()}@{email_domain}",
    )

# Generate one synthetic identity per user row and store it in three new columns
users[['prenom', 'nom', 'email']] = pd.DataFrame(
    [generate_user_data() for _ in range(len(users))],
    index=users.index,
)

# Working copy of the merged ratings table; despite its name it holds every
# rating, not only the highest ones
top_rated_movies = full_data.copy()

# Flatten each row's list of genre names into one comma-separated string
top_rated_movies['best_movie_genres'] = top_rated_movies['genres_name'].map(', '.join)

# Get the most frequent genre for each user
def get_preferred_genre(user_id, ratings_df=None):
    """Return the genre that occurs most often among the movies a user rated.

    Args:
        user_id: Identifier matched against the ``user_id`` column.
        ratings_df: Optional frame with ``user_id`` and ``genres_name``
            (list of genre names per row) columns. Defaults to the
            module-level ``top_rated_movies`` frame.

    Returns:
        The most frequent genre name, or None when the user has no rated
        movies or none of them carries a genre. On ties, the genre that is
        encountered first in row order wins.
    """
    if ratings_df is None:
        ratings_df = top_rated_movies
    user_genre_lists = ratings_df.loc[ratings_df['user_id'] == user_id, 'genres_name']
    # Count the genre names directly from the lists: joining and re-splitting
    # strings would turn "no genres" into a bogus '' entry and make the None
    # fallback below unreachable.
    genre_counts = Counter(
        genre for genre_list in user_genre_lists for genre in genre_list
    )
    return genre_counts.most_common(1)[0][0] if genre_counts else None

# Get the best-rated movie for each user
def get_best_rated_movie(user_id, ratings_df=None):
    """Return the title of the movie a user rated highest.

    Args:
        user_id: Identifier matched against the ``user_id`` column.
        ratings_df: Optional frame with ``user_id``, ``rating`` and
            ``movie_title`` columns. Defaults to the module-level
            ``top_rated_movies`` frame.

    Returns:
        The title of the highest-rated movie (the first one in row order
        when several share the top rating), or None when the user has no
        ratings.
    """
    if ratings_df is None:
        ratings_df = top_rated_movies
    user_movies = ratings_df[ratings_df['user_id'] == user_id]
    # idxmax raises on an empty selection, so handle users without ratings explicitly
    if user_movies.empty:
        return None
    return user_movies.loc[user_movies['rating'].idxmax(), 'movie_title']

# Evaluate both per-user summaries once for every distinct user id
unique_user_ids = users['user_id'].unique()
user_genres = dict(zip(unique_user_ids, map(get_preferred_genre, unique_user_ids)))
user_best_rated_movie = dict(zip(unique_user_ids, map(get_best_rated_movie, unique_user_ids)))

# Look the summaries up by user_id and store them as new columns on users
users['preferred_genre'] = users['user_id'].map(user_genres)
users['best_rated_movie'] = users['user_id'].map(user_best_rated_movie)

In [9]:
# Preview the first rows of users, which now carry the Faker-generated
# prenom/nom/email columns plus preferred_genre and best_rated_movie
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code,prenom,nom,email,preferred_genre,best_rated_movie
0,1,24,M,technician,85711,Lauren,Schmitt,lauren.schmitt@gmail.com,Drama,Groundhog Day (1993)
1,2,53,F,other,94043,James,Lester,james.lester@yahoo.com,Drama,Shall We Dance? (1996)
2,3,23,M,writer,32067,Raven,Wise,raven.wise@yahoo.com,Drama,Conspiracy Theory (1997)
3,4,24,M,technician,43537,Natalie,Randall,natalie.randall@outlook.com,Thriller,Ulee's Gold (1997)
4,5,33,F,other,15213,Timothy,Collins,timothy.collins@yahoo.com,Comedy,"Adventures of Priscilla, Queen of the Desert, ..."


## E) Combined database with additional data

In [10]:
# Enrich every rating with its movie's metadata and then with its user's
# attributes; left joins keep all ratings even when a lookup finds no match
df_combined = (
    ratings
    .merge(movies_metadata_complet, how='left', on='movie_id')
    .merge(users, how='left', on='user_id')
)
df_combined.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,video_release_date,IMDb_URL,genres_name,avg_rating,...,max_rating,age,gender,occupation,zip_code,prenom,nom,email,preferred_genre,best_rated_movie
0,196,242,3.0,1997-12-04 15:55:49,Kolya (1996),1997-01-24,,http://us.imdb.com/M/title-exact?Kolya%20(1996),[Comedy],3.991453,...,5.0,49,M,writer,55105,Jay,Knight,jay.knight@outlook.com,Comedy,Stand by Me (1986)
1,186,302,3.0,1998-04-04 19:22:22,L.A. Confidential (1997),1997-01-01,,http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...,"[Crime, Film-Noir, Mystery, Thriller]",4.161616,...,5.0,39,F,executive,0,Eugene,Allen,eugene.allen@outlook.com,Thriller,Clear and Present Danger (1994)
2,22,377,1.0,1997-11-07 07:18:36,Heavyweights (1994),1994-01-01,,http://us.imdb.com/M/title-exact?Heavyweights%...,"[Children's, Comedy]",2.153846,...,4.0,25,M,writer,40206,Nathan,Snyder,nathan.snyder@gmail.com,Comedy,Supercop (1992)
3,244,51,2.0,1997-11-27 05:02:03,Legends of the Fall (1994),1994-01-01,,http://us.imdb.com/M/title-exact?Legends%20of%...,"[Drama, Romance, War, Western]",3.45679,...,5.0,28,M,technician,80525,Stephen,Kelley,stephen.kelley@yahoo.com,Comedy,Monty Python's Life of Brian (1979)
4,166,346,1.0,1998-02-02 05:33:16,Jackie Brown (1997),1997-01-01,,http://us.imdb.com/M/title-exact?imdb-title-11...,"[Crime, Drama]",3.642857,...,5.0,47,M,educator,55113,Jorge,Villanueva,jorge.villanueva@yahoo.com,Thriller,Conspiracy Theory (1997)


## F) Export to CSV

In [11]:
# Write each table to its own CSV file in the working directory, without the row index
for csv_path, frame in (
    ('ratings.csv', full_data),
    ('movies_metadata_complet.csv', movies_metadata_complet),
    ('users_metadata_complet.csv', users),
    ('ratings_complet.csv', df_combined),
):
    frame.to_csv(csv_path, index=False)