# In [1]:
import pandas as pd
import os
import requests
import zipfile
import io

def download_movielens():
    """Return the MovieLens ratings and movies tables, fetching them if needed.

    Resolution order:

    1. If both ``movies.csv`` and ``ratings.csv`` exist in the current
       working directory, the result of ``load_data()`` is returned.
    2. Otherwise, if ``data/ml-100k/u.item`` and ``data/ml-100k/u.data`` are
       already on disk (extracted by an earlier call), they are parsed
       directly and no network request is made.
    3. Otherwise the MovieLens 100K archive is downloaded, those two members
       are extracted under ``data/`` and then parsed.

    Returns:
        A ``(ratings, movies)`` tuple. When parsed from the 100K files,
        ``ratings`` has the columns ``userId, movieId, rating, timestamp``
        and ``movies`` has the columns ``movieId, title, genres``, where
        ``genres`` is the ``|``-joined list of genre names whose flag is 1.
        ``(None, None)`` is returned when the HTTP status is not 200 or when
        any exception is raised (the error is printed, not propagated).
    """
    item_path = 'data/ml-100k/u.item'
    data_path = 'data/ml-100k/u.data'
    genre_columns = ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                     'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                     'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    try:
        if os.path.exists('movies.csv') and os.path.exists('ratings.csv'):
            return load_data()

        # Only hit the network when the extracted files are not already there;
        # this keeps repeated calls fast and lets them work offline.
        if not (os.path.exists(item_path) and os.path.exists(data_path)):
            print("Downloading MovieLens 100K dataset...")
            url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=60)
            if response.status_code != 200:
                print("Failed to download dataset.")
                return None, None
            # The context manager guarantees the archive handle is closed.
            with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
                zip_file.extract('ml-100k/u.item', 'data/')
                zip_file.extract('ml-100k/u.data', 'data/')

        movies = pd.read_csv(
            item_path, sep='|', encoding='latin-1',
            names=['movieId', 'title', 'release_date', 'video_release', 'imdb_url'] + genre_columns,
        )
        # Collapse the one-hot genre flags into a single '|'-separated string.
        # A plain list comprehension also behaves correctly on an empty frame,
        # where a row-wise DataFrame.apply would not produce a Series.
        genre_flags = movies[genre_columns].to_numpy() == 1
        movies['genres'] = [
            '|'.join(name for name, is_set in zip(genre_columns, row) if is_set)
            for row in genre_flags
        ]
        # .copy() detaches the result from the wide frame so later column
        # assignments by callers do not raise SettingWithCopyWarning.
        movies = movies[['movieId', 'title', 'genres']].copy()

        ratings = pd.read_csv(data_path, sep='\t',
                              names=['userId', 'movieId', 'rating', 'timestamp'])
        return ratings, movies
    except Exception as e:
        print(f"Error downloading dataset: {str(e)}")
        return None, None

def load_data():
    """Read ``ratings.csv`` and ``movies.csv`` from the working directory.

    The first line of each file is skipped and the columns are named
    explicitly (``userId, movieId, rating, timestamp`` for ratings and
    ``movieId, title, genres`` for movies). ``movies.csv`` is decoded as
    latin-1.

    A diagnostic line is printed when any title contains one of a handful of
    genre-like words; the data is still returned in that case.

    Returns:
        ``(ratings, movies)`` as DataFrames, or ``(None, None)`` when either
        table is empty or any exception occurs while loading (the error is
        printed rather than raised).
    """
    rating_columns = ['userId', 'movieId', 'rating', 'timestamp']
    movie_columns = ['movieId', 'title', 'genres']
    try:
        ratings = pd.read_csv('ratings.csv', sep=',', skiprows=1, names=rating_columns)
        movies = pd.read_csv('movies.csv', sep=',', skiprows=1, names=movie_columns,
                             encoding='latin-1')

        if ratings.empty or movies.empty:
            print("Error: One or both data files are empty.")
            return None, None

        # Titles that look like genre names are reported but not rejected.
        suspicious = movies['title'].str.contains(
            'Animation|Children|Romance|Drama|Unknown|Crime', case=False, na=False
        )
        if suspicious.any():
            print(f"Found {suspicious.sum()} potentially invalid movie titles. Sample: {movies['title'][suspicious].head().tolist()}")
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None, None
    else:
        return ratings, movies

def preprocess_data(ratings, movies):
    """Clean the movie table and add a normalized rating column.

    Both DataFrames are modified in place and also returned:

    * missing ``title`` values become ``'Unknown'``;
    * ``|`` separators in ``genres`` become spaces and missing genres become
      the empty string;
    * ``ratings`` gains ``normalized_rating = (rating - 0.5) / (5 - 0.5)``.

    A diagnostic line is printed when no genre string contains "Comedy".

    Args:
        ratings: DataFrame with at least ``rating`` and ``userId`` columns,
            or ``None``.
        movies: DataFrame with at least ``title`` and ``genres`` columns,
            or ``None``.

    Returns:
        ``(ratings, movies, stats)`` where ``stats`` holds ``movie_count``,
        ``rating_count`` and ``user_count``. ``(None, None, None)`` is
        returned when either input is ``None``, when ``rating`` is not a
        numeric column, or when any exception occurs (the error is printed).
    """
    failure = (None, None, None)
    if ratings is None or movies is None:
        return failure

    rating_floor = 0.5
    rating_ceiling = 5

    try:
        movies['title'] = movies['title'].fillna('Unknown')
        spaced_genres = movies['genres'].str.replace('|', ' ', regex=False)
        movies['genres'] = spaced_genres.fillna('')

        has_comedy = movies['genres'].str.contains('Comedy', case=False, na=False)
        if not has_comedy.any():
            print(f"No movies found with 'Comedy' genre. Sample genres: {movies['genres'].head().tolist()}")

        # Bail out before touching `ratings` if the scores are not numbers.
        if not pd.api.types.is_numeric_dtype(ratings['rating']):
            print(f"Ratings contain non-numeric values: {ratings['rating'].head().tolist()}")
            return failure

        ratings['normalized_rating'] = (
            (ratings['rating'] - rating_floor) / (rating_ceiling - rating_floor)
        )

        stats = {
            'movie_count': len(movies),
            'rating_count': len(ratings),
            'user_count': ratings['userId'].nunique(),
        }
        return ratings, movies, stats
    except Exception as e:
        print(f"Error preprocessing data: {str(e)}")
        return failure