In [2]:
import pandas as pd
import os
import zipfile
import requests

# 1. Locate the dataset: extraction folder, local archive name, and download URL
data_folder = "ml-latest-small"
zip_file = "ml-latest-small.zip"
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Download and unpack the archive only when the extracted folder is missing,
# so re-running this cell does not hit the network again.
if not os.path.exists(data_folder):
    print("Downloading MovieLens dataset...")
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    r = requests.get(url, timeout=60)
    # Fail loudly on a 4xx/5xx response instead of saving an error page as the
    # zip file and crashing later with an unrelated BadZipFile error.
    r.raise_for_status()
    with open(zip_file, "wb") as f:
        f.write(r.content)
    # Extract into the current working directory; the reads that follow expect
    # the archive to unpack into `data_folder`.
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall()
    print("Download and extraction complete!")

# 2. Load the movies and ratings tables from the extracted dataset folder
movies = pd.read_csv(os.path.join(data_folder, "movies.csv"))
ratings = pd.read_csv(os.path.join(data_folder, "ratings.csv"))

# 3. Basic information about each table.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Movies info:")
movies.info()
print(movies.head())

print("\nRatings info:")
ratings.info()
print(ratings.head())

# 4. Attach movie metadata to every rating row (left join keyed on movieId)
movie_ratings = ratings.merge(movies, how='left', on='movieId')

# 5. Report how many missing values each column contains (if any)
print("\nMissing values per column:")
print(movie_ratings.isna().sum())

# 6. Basic statistics: the five titles with the highest mean rating
print("\nTop 5 movies by average rating:")
mean_rating_by_title = movie_ratings.groupby('title')['rating'].mean()
top_movies = mean_rating_by_title.sort_values(ascending=False).head()
print(top_movies)

# 7. Convert the timestamp column from epoch seconds to datetime values
rated_at = pd.to_datetime(movie_ratings['timestamp'], unit='s')
movie_ratings['timestamp'] = rated_at
print("\nSample timestamps:")
timestamp_preview = movie_ratings[['title', 'timestamp']].head()
print(timestamp_preview)


Downloading MovieLens dataset...
Download and extraction complete!
Movies info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                         