In [1]:
import pandas as pd

Original MovieLens data

In [2]:
ratings = pd.read_csv('ml-25m/ratings.csv')
movies = pd.read_csv('ml-25m/movies.csv')
tags = pd.read_csv('ml-25m/tags.csv')

In [3]:
#merge all into one dataframe
df_watches = ratings.merge(movies, on='movieId', how='left')
df_watches = df_watches.merge(tags.drop(columns='timestamp'), on=['userId', 'movieId'], how='inner')
df_watches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 834731 entries, 0 to 834730
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     834731 non-null  int64  
 1   movieId    834731 non-null  int64  
 2   rating     834731 non-null  float64
 3   timestamp  834731 non-null  int64  
 4   title      834731 non-null  object 
 5   genres     834731 non-null  object 
 6   tag        834730 non-null  object 
dtypes: float64(1), int64(3), object(3)
memory usage: 44.6+ MB


In [4]:
#leave only watches with reviews higher than 3
df_watches = df_watches[df_watches['rating'] > 3]

In [5]:
df_watches.to_csv('processed_data/watches.csv')

Load overviews data

In [6]:
metadata = pd.read_csv('ml-meta/movies_metadata.csv')
links_df = pd.read_csv('ml-meta/links.csv')

  metadata = pd.read_csv('ml-meta/movies_metadata.csv')


In [7]:
#drop unnecessary columns
metadata = metadata[['id', 'overview']]

In [8]:
#get equivalent ids
links_df = links_df.dropna(subset=['tmdbId'])
links_df['tmdbId'] = links_df['tmdbId'].astype('int')
metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce')
metadata = metadata.dropna(subset=['id'])
metadata['id'] = metadata['id'].astype('int')

In [9]:
#put them all together
overviews = metadata.merge(links_df[['movieId', 'tmdbId']], left_on='id', right_on='tmdbId', how='inner')
overviews = overviews.dropna(subset=['overview'])
overviews.drop(columns=['tmdbId', 'id'], inplace=True)
overviews = overviews[['movieId', 'overview']]
overviews.head()

Unnamed: 0,movieId,overview
0,1,"Led by Woody, Andy's toys live happily in his ..."
1,2,When siblings Judy and Peter discover an encha...
2,3,A family wedding reignites the ancient feud be...
3,4,"Cheated on, mistreated and stepped on, the wom..."
4,5,Just when George Banks has recovered from his ...


In [10]:
overviews.to_csv('processed_data/overviews.csv')

Leaving only common movies

In [16]:
common_movies = list(set(overviews['movieId'].unique()) & set(df_watches['movieId'].unique()))
len(common_movies)

14533

In [17]:
overviews_common = overviews[overviews['movieId'].isin(common_movies)]
watches_common = df_watches[df_watches['movieId'].isin(common_movies)]

In [18]:
overviews_common.head()

Unnamed: 0,movieId,overview
0,1,"Led by Woody, Andy's toys live happily in his ..."
1,2,When siblings Judy and Peter discover an encha...
2,3,A family wedding reignites the ancient feud be...
3,4,"Cheated on, mistreated and stepped on, the wom..."
4,5,Just when George Banks has recovered from his ...


In [19]:
watches_common.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,tag
0,3,260,4.0,1439472239,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,classic
1,3,260,4.0,1439472239,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,sci-fi
2,4,1732,4.5,1573943590,"Big Lebowski, The (1998)",Comedy|Crime,dark comedy
3,4,1732,4.5,1573943590,"Big Lebowski, The (1998)",Comedy|Crime,great dialogue
4,4,7569,3.5,1573943431,You Only Live Twice (1967),Action|Adventure|Sci-Fi|Thriller,so bad it's good


In [20]:
overviews_common.to_csv('processed_data/overviews_common.csv')
watches_common.to_csv('processed_data/watches_common.csv')

Exploring number of interactions

In [22]:
import numpy as np

In [25]:
np.median(watches_common['userId'].value_counts())

5.0

In [26]:
len(watches_common['userId'].unique())

12140

In [30]:
len(watches_common[watches_common['userId'].map(watches_common['userId'].value_counts()) >= 15]['userId'].unique())

3613

In [32]:
watches_over15 = watches_common[watches_common['userId'].map(watches_common['userId'].value_counts()) >= 15]
watches_over15.to_csv('processed_data/watches_over15.csv')

In [35]:
common_movies_over15 = list(set(overviews['movieId'].unique()) & set(watches_over15['movieId'].unique()))
len(common_movies_over15)

13746

In [36]:
overviews_common_over15 = overviews[overviews['movieId'].isin(common_movies_over15)]

In [38]:
overviews_common_over15.to_csv('processed_data/overviews_common_over15.csv')