<a class="anchor" id='import'>
<font color = '#006400'>
    
# **1. Data Integration** </font>
</a>

<a class="anchor" id='lib'></a>
<font color = '#008000'>

## **1.1. Import the needed libraries** </font>

In [1]:
import polars as pl
import requests
import zipfile
import io
import pandas as pd

<a class="anchor" id='integrate'></a>
<font color = '#008000'>

## **1.2. Integrate the datasets into the notebook** </font>

In [2]:
#Movies
# Download the MovieLens 32M archive and keep it in memory as `zip_file`.
url_data = "https://files.grouplens.org/datasets/movielens/ml-32m.zip"

# Certificate verification is left at the requests default (enabled): with
# `verify=False` a tampered or intercepted download would be accepted silently.
# The timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(url_data, timeout=60)
# Stop here on an HTTP error instead of handing an error page to ZipFile.
response.raise_for_status()
zip_file = zipfile.ZipFile(io.BytesIO(response.content))

with zip_file.open("ml-32m/movies.csv") as f:
    movies = pd.read_csv(f, sep=',')

print(movies.head())




   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [3]:
#Ratings
# Read from the archive already held in `zip_file` (downloaded by the Movies
# cell). Fetching the same zip again for every table only repeats a large
# transfer, and the previous request also disabled TLS verification.
with zip_file.open("ml-32m/ratings.csv") as f:
    ratings = pd.read_csv(f)

print(ratings.head())



   userId  movieId  rating  timestamp
0       1       17     4.0  944249077
1       1       25     1.0  944250228
2       1       29     2.0  943230976
3       1       30     5.0  944249077
4       1       32     5.0  943228858


In [4]:
#Links
# Read from the archive already held in `zip_file` (downloaded by the Movies
# cell). Fetching the same zip again for every table only repeats a large
# transfer, and the previous request also disabled TLS verification.
with zip_file.open("ml-32m/links.csv") as f:
    links = pd.read_csv(f)

print(links.head())



   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


In [5]:
#Tags
# Read from the archive already held in `zip_file` (downloaded by the Movies
# cell). Fetching the same zip again for every table only repeats a large
# transfer, and the previous request also disabled TLS verification.
with zip_file.open("ml-32m/tags.csv") as f:
    tags = pd.read_csv(f)

print(tags.head())



   userId  movieId          tag   timestamp
0      22    26479  Kevin Kline  1583038886
1      22    79592     misogyny  1581476297
2      22   247150   acrophobia  1622483469
3      34     2174        music  1249808064
4      34     2174        weird  1249808102


<a class="anchor" id='explore'>
<font color = '#006400'>
    
# **2. Data Access, Exploration and Understanding** </font>
</a>

<a class="anchor" id='ratings'></a>
<font color = '#008000'>

## **2.1. Ratings** </font>

In [6]:
def cleaning_ratings_data(df):
    """
    Clean a ratings table and add calendar features derived from the timestamp.

    The steps run in this order and each prints a one-line report:

    1. Drop columns that hold no non-null value.
    2. Drop rows with a missing userId, movieId, rating or timestamp.
    3. Keep only ratings on the 0.5-5.0 scale in 0.5 steps.
    4. Keep the first row of every (userId, movieId) pair.
    5. Drop rows whose userId or movieId is not strictly positive.
    6. Add rating_date, rating_year and rating_month; the timestamp is read
       as seconds since the Unix epoch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns userId, movieId, rating and timestamp.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows (original index labels kept)
        plus the three date columns. The input frame is not modified.

    Raises
    ------
    KeyError
        If one of the four required columns is absent, including the case
        where it was dropped in step 1 because it was entirely null.
    """
    required_cols = ['userId', 'movieId', 'rating', 'timestamp']

    # No up-front copy: every step below builds a new frame instead of
    # mutating its input, so the caller's frame is never touched.
    df_clean = df

    print("DATA CLEANING PROCESS")
    print("=" * 50)

    # 1. Remove empty columns (a column is empty when every value is null)
    empty_cols = [col for col in df_clean.columns if df_clean[col].isna().all()]
    if empty_cols:
        df_clean = df_clean.drop(columns=empty_cols)
        print(f"✓ Removed empty columns: {empty_cols}")
    else:
        print("✓ No empty columns found")

    # 2. Check and remove missing values
    missing_before = df_clean[required_cols].isnull().sum()
    if missing_before.sum() > 0:
        print(f"✓ Found missing values: {dict(missing_before)}")
        rows_before = len(df_clean)
        df_clean = df_clean.dropna(subset=required_cols)
        # Report rows actually dropped; a row with two nulls counts once.
        print(f"✓ Removed {rows_before - len(df_clean)} rows with missing values")
    else:
        print("✓ No missing values found")

    # 3. Validate ratings (0.5 to 5.0 with 0.5 increments)
    valid_ratings = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
    is_valid_rating = df_clean['rating'].isin(valid_ratings)
    n_invalid_ratings = int((~is_valid_rating).sum())
    if n_invalid_ratings > 0:
        invalid_values = sorted(df_clean.loc[~is_valid_rating, 'rating'].unique())
        print(f"✓ Found {n_invalid_ratings} invalid ratings")
        print(f"✓ Invalid rating values: {invalid_values}")
        df_clean = df_clean[is_valid_rating]
        print(f"✓ Removed {n_invalid_ratings} rows with invalid ratings")
    else:
        print("✓ All ratings are valid (0.5 to 5.0 with 0.5 increments)")

    # 4. Remove duplicate user-movie ratings (the first occurrence is kept)
    duplicates = df_clean.duplicated(subset=['userId', 'movieId']).sum()
    if duplicates > 0:
        print(f"✓ Found {duplicates} duplicate user-movie ratings")
        df_clean = df_clean.drop_duplicates(subset=['userId', 'movieId'])
        print(f"✓ Removed {duplicates} duplicate ratings")
    else:
        print("✓ No duplicate ratings found")

    # 5. Validate user and movie IDs
    invalid_users = int((df_clean['userId'] <= 0).sum())
    invalid_movies = int((df_clean['movieId'] <= 0).sum())
    if invalid_users > 0 or invalid_movies > 0:
        print(f"✓ Found {invalid_users} invalid user IDs and {invalid_movies} invalid movie IDs")
        rows_before = len(df_clean)
        df_clean = df_clean[(df_clean['userId'] > 0) & (df_clean['movieId'] > 0)]
        # A row with both IDs invalid is removed once, so count rows rather
        # than adding the two per-column counts.
        print(f"✓ Removed {rows_before - len(df_clean)} rows with invalid IDs")
    else:
        print("✓ All user and movie IDs are valid")

    # 6. Process timestamps
    # `assign` returns a new frame, so adding columns after the filters above
    # cannot write into a slice of another frame.
    rating_date = pd.to_datetime(df_clean['timestamp'], unit='s')
    df_clean = df_clean.assign(
        rating_date=rating_date,
        rating_year=rating_date.dt.year,
        rating_month=rating_date.dt.month,
    )

    print("✓ Added date features: rating_date, rating_year, rating_month")

    return df_clean

# Run the cleaning pipeline and rebind `ratings` to its result
ratings = cleaning_ratings_data(ratings)

# Preview of the cleaned table, framed by a 50-character rule
print("\n" + "=" * 50, "PROCESSED RATINGS TABLE", "=" * 50, sep="\n")
print(ratings.head())

# Headline figures for the cleaned table
print("\n" + "=" * 50, "PROCESSING SUMMARY", "=" * 50, sep="\n")
print(
    f"Total ratings: {len(ratings)}",
    f"Columns: {list(ratings.columns)}",
    f"Rating range: {ratings['rating'].min()} - {ratings['rating'].max()}",
    f"Date range: {ratings['rating_year'].min()} - {ratings['rating_year'].max()}",
    f"Unique users: {ratings['userId'].nunique()}",
    f"Unique movies: {ratings['movieId'].nunique()}",
    sep="\n",
)

DATA CLEANING PROCESS
✓ No empty columns found
✓ No missing values found
✓ All ratings are valid (0.5 to 5.0 with 0.5 increments)
✓ No duplicate ratings found
✓ All user and movie IDs are valid
✓ Added date features: rating_date, rating_year, rating_month

PROCESSED RATINGS TABLE
   userId  movieId  rating  timestamp         rating_date  rating_year  \
0       1       17     4.0  944249077 1999-12-03 19:24:37         1999   
1       1       25     1.0  944250228 1999-12-03 19:43:48         1999   
2       1       29     2.0  943230976 1999-11-22 00:36:16         1999   
3       1       30     5.0  944249077 1999-12-03 19:24:37         1999   
4       1       32     5.0  943228858 1999-11-22 00:00:58         1999   

   rating_month  
0            12  
1            12  
2            11  
3            12  
4            11  

PROCESSING SUMMARY
Total ratings: 32000204
Columns: ['userId', 'movieId', 'rating', 'timestamp', 'rating_date', 'rating_year', 'rating_month']
Rating range: 0.5 - 5.

In [7]:
# Pull out every rating submitted by user 196 and display the result
is_user_196 = ratings['userId'] == 196
user_ratings = ratings.loc[is_user_196]
user_ratings

Unnamed: 0,userId,movieId,rating,timestamp,rating_date,rating_year,rating_month
29828,196,50,4.5,1553755714,2019-03-28 06:48:34,2019,3
29829,196,318,5.0,1553626106,2019-03-26 18:48:26,2019,3
29830,196,356,4.5,1553626441,2019-03-26 18:54:01,2019,3
29831,196,593,3.5,1553755692,2019-03-28 06:48:12,2019,3
29832,196,858,5.0,1553626435,2019-03-26 18:53:55,2019,3
...,...,...,...,...,...,...,...
29918,196,174055,4.0,1553626722,2019-03-26 18:58:42,2019,3
29919,196,183837,2.0,1553626788,2019-03-26 18:59:48,2019,3
29920,196,187593,4.0,1553626303,2019-03-26 18:51:43,2019,3
29921,196,192803,4.5,1553626786,2019-03-26 18:59:46,2019,3


In [8]:
# Did user 196 rate any movie more than once? Every repeat of a movieId after
# its first appearance is listed; an empty table means there are none.
user_ratings.loc[user_ratings['movieId'].duplicated()]

Unnamed: 0,userId,movieId,rating,timestamp,rating_date,rating_year,rating_month


In [9]:
# Summary statistics for the columns of the cleaned ratings frame
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp,rating_date,rating_year,rating_month
count,32000200.0,32000200.0,32000200.0,32000200.0,32000204,32000200.0,32000200.0
mean,100278.5,29318.61,3.540396,1275241000.0,2010-05-30 17:39:59.573265152,2009.908,6.548105
min,1.0,1.0,0.5,789652000.0,1995-01-09 11:46:44,1995.0,1.0
25%,50053.0,1233.0,3.0,1051012000.0,2003-04-22 11:53:50,2003.0,3.0
50%,100297.0,3452.0,3.5,1272622000.0,2010-04-30 10:03:32.500000,2010.0,7.0
75%,150451.0,44199.0,4.0,1503158000.0,2017-08-19 15:59:05.249999872,2017.0,10.0
max,200948.0,292757.0,5.0,1697164000.0,2023-10-13 02:29:07,2023.0,12.0
std,57949.05,50958.16,1.058986,256163000.0,,8.140929,3.504734


In [10]:
# Index range, column dtypes and memory footprint of the cleaned ratings frame
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 7 columns):
 #   Column        Dtype         
---  ------        -----         
 0   userId        int64         
 1   movieId       int64         
 2   rating        float64       
 3   timestamp     int64         
 4   rating_date   datetime64[ns]
 5   rating_year   int32         
 6   rating_month  int32         
dtypes: datetime64[ns](1), float64(1), int32(2), int64(3)
memory usage: 1.4 GB


<a class="anchor" id='movies'></a>
<font color = '#008000'>

## **2.2. Movies** </font>

In [11]:
# First rows of the movies table as loaded, before any cleaning
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
def cleaning_movies_data(movies):
    """
    Clean a movies table, split the year out of the title and one-hot encode genres.

    The steps run in this order and print a short report as they go:

    1. Drop columns that hold no non-null value.
    2. If any null is present, drop rows missing movieId, title or genres.
    3. Drop rows that repeat an earlier row in every column except movieId
       (the first occurrence is kept).
    4. Put the first parenthesised four-digit number of the title into a
       numeric `year` column (NaN when there is none) and strip every such
       "(dddd)" group from the title.
    5. Add one 0/1 integer column per genre in the fixed list below; the
       `genres` string is expected to be "|"-separated.
    6. Drop the `genres` column.

    Parameters
    ----------
    movies : pandas.DataFrame
        Must contain the columns movieId, title and genres.

    Returns
    -------
    pandas.DataFrame
        A new frame with movieId, the cleaned title, year and the genre flag
        columns (original index labels kept). The input frame is not modified.
    """
    # No up-front copy: every step below builds a new frame instead of
    # mutating its input, so the caller's frame is never touched.
    movies_clean = movies

    print("MOVIES DATA CLEANING PROCESS")
    print("=" * 50)
    print(f"Original dataset: {len(movies_clean)} movies, {len(movies_clean.columns)} columns")

    # 1. Remove empty columns (a column is empty when every value is null)
    empty_cols = [col for col in movies_clean.columns if movies_clean[col].isna().all()]
    if empty_cols:
        movies_clean = movies_clean.drop(columns=empty_cols)
        print(f"✓ Removed empty columns: {empty_cols}")

    # 2. Check for missing values
    missing_before = movies_clean.isnull().sum()
    if missing_before.sum() > 0:
        print(f"✓ Found missing values: {dict(missing_before)}")
        movies_clean = movies_clean.dropna(subset=['movieId', 'title', 'genres'])
        print("✓ Removed rows with missing critical data")

    # 3. Remove duplicates - keep only rows that are unique across all columns except movieId
    # NOTE(review): the movieId of each dropped row disappears from this table,
    # so rows in other tables that reference it will presumably find no match
    # here - confirm that is intended.
    subset_cols = [col for col in movies_clean.columns if col != 'movieId']
    duplicates = movies_clean.duplicated(subset=subset_cols).sum()

    if duplicates > 0:
        print(f"✓ Found {duplicates} duplicate rows (identical in all columns except movieId)")
        movies_clean = movies_clean.drop_duplicates(subset=subset_cols)
        print(f"✓ Removed {duplicates} duplicate rows")
    else:
        print("✓ No duplicates found (across all columns except movieId)")

    # 4. Extract year from title and REPLACE title column with the year-free title
    raw_title = movies_clean['title']
    movies_clean = movies_clean.assign(
        year=pd.to_numeric(raw_title.str.extract(r'\((\d{4})\)')[0], errors='coerce'),
        title=raw_title.str.replace(r'\s*\(\d{4}\)', '', regex=True).str.strip(),
    )

    # Check for invalid years (but don't remove them)
    invalid_years = movies_clean['year'].isna().sum()
    if invalid_years > 0:
        print(f"✓ Found {invalid_years} movies with invalid years")

    # 5. Process genres and convert to binary columns
    # NOTE(review): a genre token that is not in this list gets no column and
    # is lost once `genres` is dropped - confirm the list covers the dataset.
    all_genres = [
        'Action', 'Adventure', 'Animation', "Children", 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
        'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western'
    ]

    # Split every genres string once and build all indicator columns together;
    # reindex keeps exactly the listed genres, in list order, and fills a
    # genre that never occurs with zeros.
    genre_flags = (
        movies_clean['genres']
        .str.get_dummies(sep='|')
        .reindex(columns=all_genres, fill_value=0)
        .astype('int64')
    )
    movies_clean = movies_clean.assign(
        **{genre: genre_flags[genre].to_numpy() for genre in all_genres}
    )

    # Count movies with no genres
    no_genres_listed = (movies_clean['genres'] == '(no genres listed)').sum()
    if no_genres_listed > 0:
        print(f"✓ Found {no_genres_listed} movies with 'no genres listed'")

    # 6. REMOVE genres column
    movies_clean = movies_clean.drop('genres', axis=1)
    print("✓ Removed 'genres' column")

    # Show genre distribution (genres with no movie are skipped)
    print("\n✓ GENRE DISTRIBUTION:")
    print("-" * 25)
    for genre in all_genres:
        count = movies_clean[genre].sum()
        if count > 0:
            print(f"  {genre}: {count} movies")

    return movies_clean

# Run the cleaning pipeline and rebind `movies` to its result
movies = cleaning_movies_data(movies)

# Preview of the cleaned table, framed by a 50-character rule
print("\n" + "=" * 50, "PROCESSED MOVIES TABLE", "=" * 50, sep="\n")
print(movies.head())

# Headline figures for the cleaned table
print("\n" + "=" * 50, "FINAL PROCESSING SUMMARY", "=" * 50, sep="\n")
print(
    f"Total movies: {len(movies)}",
    f"Columns: {list(movies.columns)}",
    f"Year range: {movies['year'].min()} - {movies['year'].max()}",
    sep="\n",
)

# Number of movies carrying each genre flag
print("\nGENRE COUNTS:", "-" * 20, sep="\n")
genre_cols = [
    'Action', 'Adventure', 'Animation', "Children", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
    'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western',
]

for genre in genre_cols:
    # Genres without a column in the cleaned frame are skipped
    if genre not in movies.columns:
        continue
    count = movies[genre].sum()
    print(f"{genre}: {count} movies")

MOVIES DATA CLEANING PROCESS
Original dataset: 87585 movies, 3 columns
✓ Found 35 duplicate rows (identical in all columns except movieId)
✓ Removed 35 duplicate rows
✓ Found 615 movies with invalid years
✓ Found 7080 movies with 'no genres listed'
✓ Removed 'genres' column

✓ GENRE DISTRIBUTION:
-------------------------
  Action: 9668 movies
  Adventure: 5402 movies
  Animation: 4616 movies
  Children: 4520 movies
  Comedy: 23115 movies
  Crime: 6976 movies
  Documentary: 9356 movies
  Drama: 34164 movies
  Fantasy: 3850 movies
  Film-Noir: 353 movies
  Horror: 8651 movies
  Musical: 1059 movies
  Mystery: 4013 movies
  Romance: 10368 movies
  Sci-Fi: 4907 movies
  Thriller: 11818 movies
  War: 2325 movies
  Western: 1696 movies

PROCESSED MOVIES TABLE
   movieId                        title    year  Action  Adventure  Animation  \
0        1                    Toy Story  1995.0       0          1          1   
1        2                      Jumanji  1995.0       0          1       

In [13]:
# Display the cleaned movies frame
movies

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995.0,0,1,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995.0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995.0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995.0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87580,292731,The Monroy Affaire,2022.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87581,292737,Shelter in Solitude,2023.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
87582,292753,Orca,2023.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87583,292755,The Angry Breed,1968.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Summary statistics, transposed so that each column of `movies` becomes a row
movies.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
movieId,87550.0,157629.452393,79016.519962,1.0,112635.5,165718.0,213182.5,292757.0
year,86935.0,1995.350296,26.010308,1874.0,1981.0,2006.0,2015.0,2023.0
Action,87550.0,0.110428,0.313425,0.0,0.0,0.0,0.0,1.0
Adventure,87550.0,0.061702,0.240615,0.0,0.0,0.0,0.0,1.0
Animation,87550.0,0.052724,0.223484,0.0,0.0,0.0,0.0,1.0
Children,87550.0,0.051628,0.221275,0.0,0.0,0.0,0.0,1.0
Comedy,87550.0,0.264021,0.440813,0.0,0.0,0.0,1.0,1.0
Crime,87550.0,0.07968,0.270799,0.0,0.0,0.0,0.0,1.0
Documentary,87550.0,0.106865,0.308943,0.0,0.0,0.0,0.0,1.0
Drama,87550.0,0.390223,0.487803,0.0,0.0,0.0,1.0,1.0


In [15]:
# Column dtypes and non-null counts of the cleaned movies frame
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87550 entries, 0 to 87584
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movieId      87550 non-null  int64  
 1   title        87550 non-null  object 
 2   year         86935 non-null  float64
 3   Action       87550 non-null  int64  
 4   Adventure    87550 non-null  int64  
 5   Animation    87550 non-null  int64  
 6   Children     87550 non-null  int64  
 7   Comedy       87550 non-null  int64  
 8   Crime        87550 non-null  int64  
 9   Documentary  87550 non-null  int64  
 10  Drama        87550 non-null  int64  
 11  Fantasy      87550 non-null  int64  
 12  Film-Noir    87550 non-null  int64  
 13  Horror       87550 non-null  int64  
 14  Musical      87550 non-null  int64  
 15  Mystery      87550 non-null  int64  
 16  Romance      87550 non-null  int64  
 17  Sci-Fi       87550 non-null  int64  
 18  Thriller     87550 non-null  int64  
 19  War      

<a class="anchor" id='tags'></a>
<font color = '#008000'>

## **2.3. Tags** </font>

In [16]:
# First rows of the tags table as loaded, before any cleaning
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


In [17]:
def cleaning_tags_data(tags):
    """
    Clean a tags table and add calendar features derived from the timestamp.

    Rows missing userId, movieId or tag are dropped, tag text is lowercased
    and stripped of surrounding whitespace, fully identical rows are removed
    (first occurrence kept), and tag_date, tag_year and tag_month are derived
    from the timestamp, which is read as seconds since the Unix epoch.

    Parameters
    ----------
    tags : pandas.DataFrame
        Must contain the columns userId, movieId, tag and timestamp.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows (original index labels kept) and
        the three date columns appended. The input frame is not modified.
    """
    # Basic cleaning: rows without a user, movie or tag carry no information
    tags_clean = tags.dropna(subset=['userId', 'movieId', 'tag'])

    # Clean tags (lowercase, strip whitespace)
    tags_clean = tags_clean.assign(tag=tags_clean['tag'].str.lower().str.strip())

    # De-duplicate only after normalising the text: rows that differ merely in
    # letter case or surrounding whitespace become identical at this point and
    # would otherwise both survive.
    tags_clean = tags_clean.drop_duplicates()

    # Process timestamps
    tag_date = pd.to_datetime(tags_clean['timestamp'], unit='s')
    tags_clean = tags_clean.assign(
        tag_date=tag_date,
        tag_year=tag_date.dt.year,
        tag_month=tag_date.dt.month,
    )

    return tags_clean

# Run the cleaning function and rebind `tags` to its result
tags = cleaning_tags_data(tags)

In [18]:
# Display the cleaned tags frame
tags

Unnamed: 0,userId,movieId,tag,timestamp,tag_date,tag_year,tag_month
0,22,26479,kevin kline,1583038886,2020-03-01 05:01:26,2020,3
1,22,79592,misogyny,1581476297,2020-02-12 02:58:17,2020,2
2,22,247150,acrophobia,1622483469,2021-05-31 17:51:09,2021,5
3,34,2174,music,1249808064,2009-08-09 08:54:24,2009,8
4,34,2174,weird,1249808102,2009-08-09 08:55:02,2009,8
...,...,...,...,...,...,...,...
2000067,162279,90645,rafe spall,1320817734,2011-11-09 05:48:54,2011,11
2000068,162279,91079,anton yelchin,1322337407,2011-11-26 19:56:47,2011,11
2000069,162279,91079,felicity jones,1322337400,2011-11-26 19:56:40,2011,11
2000070,162279,91658,rooney mara,1325828398,2012-01-06 05:39:58,2012,1


<a class="anchor" id='links'></a>
<font color = '#008000'>

## **2.4. Links** </font>

In [19]:
# First rows of the links table as loaded
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


<a class="anchor" id='parquet'>
<font color = '#006400'>
    
# **3. Convert to Parquet** </font>
</a>

In [20]:
# Destination folder for the exported tables
output_dir = "../results"

# Create the folder when it does not exist yet
import os
os.makedirs(output_dir, exist_ok=True)

# Write each table to its own Parquet file, leaving the DataFrame index out
for frame, filename in (
    (ratings, "ratings32M.parquet"),
    (movies, "movies32M.parquet"),
    (tags, "tags32M.parquet"),
    (links, "links32M.parquet"),
):
    frame.to_parquet(os.path.join(output_dir, filename), index=False)