<a class="anchor" id='import'>
<font color = '#006400'>
    
# **1. Data Integration** </font>
</a>

<a class="anchor" id='lib'></a>
<font color = '#008000'>

## **1.1. Import the needed libraries** </font>

In [1]:
import polars as pl
import requests
import zipfile
import io
import pandas as pd

<a class="anchor" id='lib'></a>
<font color = '#008000'>

## **1.2. Integrate the datasets into the notebook** </font>

In [2]:
url_data = "https://files.grouplens.org/datasets/movielens/ml-32m.zip"

response = requests.get(url_data, verify=False)
zip_file = zipfile.ZipFile(io.BytesIO(response.content))

with zip_file.open("ml-32m/movies.csv") as f:
    movies = pd.read_csv(f)




In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
url_data = "https://files.grouplens.org/datasets/movielens/ml-32m.zip"
response = requests.get(url_data, verify=False)
zip_file = zipfile.ZipFile(io.BytesIO(response.content))

with zip_file.open("ml-32m/ratings.csv") as f:
    ratings = pd.read_csv(f)



In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [None]:
url_data = "https://files.grouplens.org/datasets/movielens/ml-32m.zip"
response = requests.get(url_data, verify=False)
zip_file = zipfile.ZipFile(io.BytesIO(response.content))

with zip_file.open("ml-32m/links.csv") as f:
    links = pd.read_csv(f)



In [None]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [None]:
url_data = "https://files.grouplens.org/datasets/movielens/ml-32m.zip"
response = requests.get(url_data, verify=False)
zip_file = zipfile.ZipFile(io.BytesIO(response.content))

with zip_file.open("ml-32m/tags.csv") as f:
    tags = pd.read_csv(f)



In [None]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


<a class="anchor" id='import'>
<font color = '#006400'>
    
# **2. Data Access, Exploration and Understanding** </font>
</a>

<a class="anchor" id='lib'></a>
<font color = '#008000'>

## **2.1. Ratings** </font>

In [None]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,32000200.0,32000200.0,32000200.0,32000200.0
mean,100278.5,29318.61,3.540396,1275241000.0
std,57949.05,50958.16,1.058986,256163000.0
min,1.0,1.0,0.5,789652000.0
25%,50053.0,1233.0,3.0,1051012000.0
50%,100297.0,3452.0,3.5,1272622000.0
75%,150451.0,44199.0,4.0,1503158000.0
max,200948.0,292757.0,5.0,1697164000.0


In [None]:
ratings["movieId"].unique()
ratings["userId"].unique()
ratings["rating"].unique()

array([4. , 1. , 2. , 5. , 3. , 3.5, 0.5, 4.5, 2.5, 1.5])

In [None]:
ratings = pl.from_pandas(ratings)

<a class="anchor" id='lib'></a>
<font color = '#008000'>

## **2.2. Movies** </font>

In [None]:
movies.describe()

Unnamed: 0,movieId
count,87585.0
mean,157651.365519
std,79013.402099
min,1.0
25%,112657.0
50%,165741.0
75%,213203.0
max,292757.0


In [None]:
movies = pl.from_pandas(movies)


In [None]:
movies.filter(pl.col('movieId') == 29).select('genres')

genres
str
"""Adventure|Drama|Fantasy|Myster…"


In [None]:
movies.filter(pl.col('genres') == 'no genres listed')

movieId,title,genres
i64,str,str


In [None]:
valid_genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "(no genres listed)"
]


invalid_genres = (
    movies
    .select(pl.col("genres").str.split('|').explode())
    .filter(~pl.col("genres").is_in(valid_genres))
    .unique()
)

print(invalid_genres)


shape: (2, 1)
┌──────────┐
│ genres   │
│ ---      │
│ str      │
╞══════════╡
│ Children │
│ IMAX     │
└──────────┘


In [None]:
movies.filter(pl.col('genres') == 'IMAX')

movieId,title,genres
i64,str,str
4460,"""Encounter in the Third Dimensi…","""IMAX"""


In [None]:
invalid_genres = ["IMAX",  "(no genres listed)"]
movies = movies.filter(~pl.col("genres").is_in(invalid_genres))

In [None]:
movies.describe()

statistic,movieId,title,genres
str,f64,str,str
"""count""",80504.0,"""80504""","""80504"""
"""null_count""",0.0,"""0""","""0"""
"""mean""",155501.531576,,
"""std""",81084.239109,,
"""min""",1.0,""" (2019)""","""Action"""
"""25%""",104285.0,,
"""50%""",164739.0,,
"""75%""",213756.0,,
"""max""",292757.0,"""貞子3D (2012)""","""Western"""


In [None]:
movies = movies.with_columns(
    pl.col("title").str.replace(r" \(\d{4}\)$", "", literal=False).alias("title")
)

In [None]:
movies.head(1000)

movieId,title,genres
i64,str,str
1,"""Toy Story""","""Adventure|Animation|Children|C…"
2,"""Jumanji""","""Adventure|Children|Fantasy"""
3,"""Grumpier Old Men""","""Comedy|Romance"""
4,"""Waiting to Exhale""","""Comedy|Drama|Romance"""
5,"""Father of the Bride Part II""","""Comedy"""
…,…,…
1018,"""That Darn Cat!""","""Children|Comedy|Mystery"""
1019,"""20,000 Leagues Under the Sea""","""Adventure|Drama|Sci-Fi"""
1020,"""Cool Runnings""","""Comedy"""
1021,"""Angels in the Outfield""","""Children|Comedy"""


In [None]:
movies.describe()

statistic,movieId,title,genres
str,f64,str,str
"""count""",80504.0,"""80504""","""80504"""
"""null_count""",0.0,"""0""","""0"""
"""mean""",155501.531576,,
"""std""",81084.239109,,
"""min""",1.0,"""""","""Action"""
"""25%""",104285.0,,
"""50%""",164739.0,,
"""75%""",213756.0,,
"""max""",292757.0,"""貞子3D""","""Western"""


In [None]:
movies = movies.unique()

In [None]:
movies.describe()

statistic,movieId,title,genres
str,f64,str,str
"""count""",80504.0,"""80504""","""80504"""
"""null_count""",0.0,"""0""","""0"""
"""mean""",155501.531576,,
"""std""",81084.239109,,
"""min""",1.0,"""""","""Action"""
"""25%""",104285.0,,
"""50%""",164739.0,,
"""75%""",213756.0,,
"""max""",292757.0,"""貞子3D""","""Western"""


In [None]:
movies = movies.with_columns(
    pl.col("genres").str.split("|")
).explode("genres")

# Agora cada linha tem apenas um género
print(movies.head(10))


shape: (10, 3)
┌─────────┬───────────────────────────────┬───────────┐
│ movieId ┆ title                         ┆ genres    │
│ ---     ┆ ---                           ┆ ---       │
│ i64     ┆ str                           ┆ str       │
╞═════════╪═══════════════════════════════╪═══════════╡
│ 143029  ┆ Stuck in the Suburbs          ┆ Children  │
│ 143029  ┆ Stuck in the Suburbs          ┆ Comedy    │
│ 149610  ┆ Hot Summer in Barefoot County ┆ Action    │
│ 149610  ┆ Hot Summer in Barefoot County ┆ Comedy    │
│ 7111    ┆ Ryan's Daughter               ┆ Drama     │
│ 7111    ┆ Ryan's Daughter               ┆ Romance   │
│ 154917  ┆ Back To The Sea               ┆ Animation │
│ 154917  ┆ Back To The Sea               ┆ Children  │
│ 74916   ┆ Greenberg                     ┆ Comedy    │
│ 74916   ┆ Greenberg                     ┆ Drama     │
└─────────┴───────────────────────────────┴───────────┘


<a class="anchor" id='lib'></a>
<font color = '#008000'>

## **2.3. Links** </font>

In [None]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [None]:
links.describe()

Unnamed: 0,movieId,imdbId,tmdbId
count,87585.0,87585.0,87461.0
mean,157651.365519,2792840.0,241382.3
std,79013.402099,4278866.0,247146.7
min,1.0,1.0,2.0
25%,112657.0,94642.0,46836.0
50%,165741.0,492996.0,139272.0
75%,213203.0,3877296.0,381693.0
max,292757.0,29081100.0,1186337.0


In [None]:
links = pl.from_pandas(links)

In [None]:
links = links.unique()

In [None]:
links.describe()

statistic,movieId,imdbId,tmdbId
str,f64,f64,f64
"""count""",87585.0,87585.0,87461.0
"""null_count""",0.0,0.0,124.0
"""mean""",157651.365519,2792800.0,241382.280422
"""std""",79013.402099,4278900.0,247146.667043
"""min""",1.0,1.0,2.0
"""25%""",112657.0,94642.0,46836.0
"""50%""",165741.0,492996.0,139272.0
"""75%""",213203.0,3877296.0,381693.0
"""max""",292757.0,29081098.0,1186337.0


<a class="anchor" id='lib'></a>
<font color = '#008000'>

## **2.4. Tags** </font>

In [None]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


In [None]:
tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,2000072.0,2000072.0,2000072.0
mean,81928.59,71893.26,1528914000.0
std,38106.5,74803.79,129083500.0
min,22.0,1.0,1135429000.0
25%,68413.0,4011.0,1473615000.0
50%,78213.0,52328.0,1574071000.0
75%,103698.0,122294.0,1614740000.0
max,162279.0,292629.0,1697155000.0


In [None]:
tags = pl.from_pandas(tags)


In [None]:
tags = tags.unique()

In [None]:
tags.describe()

statistic,userId,movieId,tag,timestamp
str,f64,f64,str,f64
"""count""",2000072.0,2000072.0,"""2000055""",2000072.0
"""null_count""",0.0,0.0,"""17""",0.0
"""mean""",81928.586291,71893.261729,,1528900000.0
"""std""",38106.498431,74803.79499,,129080000.0
"""min""",22.0,1.0,""" The Asylum""",1135400000.0
"""25%""",68413.0,4011.0,,1473600000.0
"""50%""",78213.0,52328.0,,1574100000.0
"""75%""",103698.0,122294.0,,1614700000.0
"""max""",162279.0,292629.0,"""카운트다운""",1697200000.0


<a class="anchor" id='import'>
<font color = '#006400'>
    
# **3. Convert to Parquet** </font>
</a>

In [None]:
# Define paths
output_dir = "~/Downloads"

# Make sure the directory exists
import os
os.makedirs(output_dir, exist_ok=True)

# Convert and save
ratings.write_parquet(os.path.join(output_dir, "ratings_32M.parquet"))
movies.write_parquet(os.path.join(output_dir, "movies_32M.parquet"))
links.write_parquet(os.path.join(output_dir, "links_32M.parquet"))
tags.write_parquet(os.path.join(output_dir, "tags_32M.parquet"))