In [1]:
# Credential setup (Colab only): open Colab's browser upload prompt so the
# user can provide their Kaggle API token. The shell commands below expect
# the uploaded file to be named exactly `kaggle.json` in the working directory.
from google.colab import files
files.upload()

# Install the token under ~/.kaggle/ (where the Kaggle client is expected to
# look for it) and restrict it to owner read/write (mode 600) since it is a
# secret.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


Saving kaggle.json to kaggle.json


In [2]:
from pyspark import SparkContext
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import zipfile

def parse_line(line):
    """Parse one CSV line of the ratings file into ``(movie_id, rating)``.

    The expected column layout is ``userId,movieId,rating,timestamp``.

    Args:
        line: A single raw text line from the ratings CSV.

    Returns:
        A ``(int, float)`` tuple of movie id and rating, or ``None`` when the
        line is the CSV header, is blank, has fewer than three fields, or
        carries a non-numeric movie id / rating. Callers are expected to
        filter out the ``None`` results.
    """
    if line.startswith("userId,movieId,rating,timestamp"):
        return None
    parts = line.split(",")
    # A blank or truncated line would otherwise raise IndexError and abort
    # the entire Spark job; treat it like the header and skip it.
    if len(parts) < 3:
        return None
    try:
        return (int(parts[1]), float(parts[2]))
    except ValueError:
        # Non-numeric movie id or rating: skip the row instead of failing.
        return None

def main():
    """Download the Kaggle ratings data and print average ratings per movie.

    Steps:
      1. Fetch ``rounakbanik/the-movies-dataset`` into ``kaggle_data/`` via the
         Kaggle API (skipped when ``ratings.csv`` is already present).
      2. Load ``ratings.csv`` into a Spark RDD and parse each line with
         ``parse_line``.
      3. Compute the mean rating per movie id.
      4. Print ten of the resulting ``(movie_id, average)`` pairs.

    The printed rows are an arbitrary sample of ten movies, not a ranking:
    the RDD is never sorted.
    """
    download_path = "kaggle_data"
    dataset_file = os.path.join(download_path, "ratings.csv")

    # Only hit the Kaggle API when the file is missing, so re-running the
    # cell does not re-download and re-extract the whole dataset.
    if not os.path.exists(dataset_file):
        api = KaggleApi()
        api.authenticate()
        os.makedirs(download_path, exist_ok=True)
        api.dataset_download_files('rounakbanik/the-movies-dataset', path=download_path, unzip=True)

    # Set up Spark
    sc = SparkContext("local", "MovieRatings")
    try:
        # Read the input data
        input_rdd = sc.textFile(dataset_file)

        # parse_line returns None for the header and for unusable rows, so a
        # single map + filter is enough (no separate header filter needed).
        mapped_rdd = input_rdd.map(parse_line).filter(lambda x: x is not None)

        # Average per movie using a running (sum, count) pair. Unlike
        # groupByKey, this combines within each partition before the shuffle
        # and never materialises the full list of ratings for a movie.
        sum_count_rdd = mapped_rdd.aggregateByKey(
            (0.0, 0),
            lambda acc, rating: (acc[0] + rating, acc[1] + 1),
            lambda left, right: (left[0] + right[0], left[1] + right[1]),
        )
        reduced_rdd = sum_count_rdd.mapValues(lambda sc_pair: sc_pair[0] / sc_pair[1])

        # Fetch only the rows that are printed instead of collecting every
        # movie's result onto the driver.
        for movie_id, avg_rating in reduced_rdd.take(10):  # arbitrary 10 for demo
            print(f"Movie {movie_id} has an average rating of {avg_rating:.2f}")
    finally:
        # Always release the context: a leaked SparkContext makes the next
        # run of this cell fail when it tries to create a second one.
        sc.stop()

# Entry point: run the pipeline when this cell/script is executed directly;
# the guard keeps main() from running if this code is imported as a module.
if __name__ == "__main__":
    main()


Dataset URL: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset
Movie 110 has an average rating of 4.02
Movie 858 has an average rating of 4.34
Movie 91542 has an average rating of 3.74
Movie 112552 has an average rating of 4.12
Movie 1210 has an average rating of 3.99
Movie 2706 has an average rating of 3.31
Movie 3476 has an average rating of 3.68
Movie 1254 has an average rating of 4.13
Movie 106920 has an average rating of 3.89
Movie 44 has an average rating of 2.63
