In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (collect_set, avg, percentile_approx, col, split, explode, when, lit, 
                                  log, regexp_replace, desc, asc, count, max as spark_max, min as spark_min)
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pandas as pd

class MovieRecommendationSystem:
    def __init__(self, spark_session=None):
        if spark_session is None:
            self.spark = SparkSession.builder \
                         .appName("MovieRecommendationSystem") \
                         .config("spark.driver.memory", "6g") \
                         .config("spark.driver.maxResultSize", "3g") \
                         .config("spark.executor.memory", "6g") \
                         .config("spark.executor.memoryFraction", "0.8") \
                         .config("spark.sql.adaptive.enabled", "true") \
                         .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
                         .getOrCreate() 
        else:
            self.spark = spark_session
        
        self.kmeans_model = None
        self.knn_model = None
        self.scaler_model = None
        self.feature_vectors = None
        self.movies_df = None
        self.clusters_df = None
        self.genre_columns = []
        
    def load_data(self, path="hdfs://namenode:8020/user/data/movie_rank.csv"):
        """Read the movie CSV and keep only clean, well-voted feature films.

        The result is stored in ``self.movies_df`` with the columns
        ``tconst, title, duration, average_rating, vote_count, year, genres``.

        Args:
            path: Location of the CSV file (with header). Defaults to the
                HDFS path the pipeline was originally written against.
        """
        raw_df = self.spark.read.csv(path, header=True, inferSchema=True)

        # Non-adult entries of type "movie" only.
        feature_films = raw_df.filter(
            (col("titleType") == "movie") & (col("isAdult") == "0")
        )

        typed = feature_films.select(
            col("tconst"),
            col("primaryTitle").alias("title"),
            col("runtimeMinutes").cast("int").alias("duration"),
            col("averageRating").cast("double").alias("average_rating"),
            col("numVotes").cast("int").alias("vote_count"),
            col("startYear").cast("int").alias("year"),
            # The regex matches a literal backslash followed by "N" and
            # blanks it out, so such rows fail the non-empty genre check below.
            regexp_replace(col("genres"), "\\\\N", "").alias("genres"),
        )

        self.movies_df = typed.filter(
            (col("duration") >= 60) & (col("duration") <= 300) &  # Realistic duration
            (col("vote_count") >= 1000) &  # Higher threshold for reliability
            (col("average_rating") >= 1.0) & (col("average_rating") <= 10.0) &
            (col("year") >= 1920) & (col("year") <= 2025) &
            (col("genres") != "") & (col("genres").isNotNull()) &
            (col("title").isNotNull()) & (col("title") != "")
        )

        print(f"Data loaded and cleaned: {self.movies_df.count()} movies")
        print("Data sample:")
        self.movies_df.show(10, truncate=False)
        
    def create_advanced_features(self):
        
        # DataFrame with a single row containing global stats
        stats_df = self.movies_df.agg(
            avg("average_rating").alias("global_avg_rating"),
            avg("vote_count").alias("global_avg_votes"),
            avg("duration").alias("global_avg_duration"),
            percentile_approx("vote_count", 0.9).alias("vote_threshold_90p"),
            spark_max("vote_count").alias("max_votes"),
            spark_min("vote_count").alias("min_votes")
        )
    
        # crossJoin to enrich each row of the DataFrame with these stats
        self.movies_df = self.movies_df.crossJoin(stats_df)
    
        # 1. Bayesian quality score
        self.movies_df = self.movies_df.withColumn(
            "bayesian_rating",
            (col("vote_count") * col("average_rating") + col("vote_threshold_90p") * col("global_avg_rating")) /
            (col("vote_count") + col("vote_threshold_90p"))
        )
    
        # 2. Normalized popularity score
        self.movies_df = self.movies_df.withColumn(
            "popularity_score",
            ((log(col("vote_count") + lit(1)) - log(col("min_votes") + lit(1))) /
            (log(col("max_votes") + lit(1)) - log(col("min_votes") + lit(1))))
        )
    
        # 3. Final quality score combining rating and popularity
        self.movies_df = self.movies_df.withColumn(
            "quality_score",
            col("bayesian_rating") * lit(0.7) + col("popularity_score") * lit(0.3)
        )
    
        # 4. Duration category
        self.movies_df = self.movies_df.withColumn(
            "duration_category",
            when(col("duration") < 90, "Short")
            .when(col("duration") < 120, "Medium")
            .when(col("duration") < 150, "Long")
            .otherwise("Very Long")
        )
    
        # 5. Movie era
        self.movies_df = self.movies_df.withColumn(
            "era",
            when(col("year") < 1960, "Classic")
            .when(col("year") < 1980, "Vintage")
            .when(col("year") < 2000, "Modern")
            .when(col("year") < 2010, "Contemporary")
            .otherwise("Recent")
        )
    
        print("Advanced features created")
        
    def preprocess_data(self):
        # Create advanced features
        self.create_advanced_features()
    
        # Extract and count exploded genres
        genres_exploded = self.movies_df.select(
            explode(split(col("genres"), ",")).alias("genre")
        ).groupBy("genre").count()
    
        # Filter genres with at least 50 movies
        main_genres_df = genres_exploded.filter(col("count") >= 50)
    
        # Explode genres in the main DataFrame
        movies_genres = self.movies_df.withColumn("genre", explode(split(col("genres"), ",")))
    
        # Keep only main genres
        filtered_genres = movies_genres.join(main_genres_df.select("genre"), on="genre", how="inner")
    
        # Group main genres by all movie columns (to avoid duplicates)
        # We group on all original columns to keep a unique movie with its filtered genres
    
        group_cols = [c for c in self.movies_df.columns]  # all original columns
        movies_with_main_genres = filtered_genres.groupBy(group_cols).agg(
            collect_set("genre").alias("main_genres")
        )
    
        # Normalization of numerical features
        movies_with_main_genres = movies_with_main_genres.withColumn(
            "duration_normalized",
            (col("duration") - lit(90)) / lit(60)
        ).withColumn(
            "year_normalized",
            (col("year") - lit(1990)) / lit(20)
        ).withColumn(
            "rating_normalized",
            (col("average_rating") - lit(6.0)) / lit(2.0)
        )
    
        feature_cols = [
            "duration_normalized", "year_normalized", "rating_normalized",
            "bayesian_rating", "popularity_score", "quality_score"
        ]
    
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")
        movies_with_features = assembler.transform(movies_with_main_genres)
    
        scaler = StandardScaler(inputCol="features_raw", outputCol="features",
                                withMean=True, withStd=True)
        self.scaler_model = scaler.fit(movies_with_features)
        self.feature_vectors = self.scaler_model.transform(movies_with_features)
    
        print("Enhanced preprocessing completed")
        print(f"Features used: {feature_cols}")
        print("Main genres available in the 'main_genres' column (array)")
    
        return self.feature_vectors
    
    def train_kmeans(self, k=6):
        kmeans = KMeans(k=k, seed=42, featuresCol="features", predictionCol="cluster",
                       maxIter=50, tol=1e-4)
        self.kmeans_model = kmeans.fit(self.feature_vectors)
        
        # Apply predictions
        self.clusters_df = self.kmeans_model.transform(self.feature_vectors)
        
        print(f"K-means trained with {k} clusters")
        
        # Analyze cluster distribution
        cluster_analysis = self.clusters_df.groupBy("cluster").agg(
            count("*").alias("count"),
            avg("average_rating").alias("avg_rating"),
            avg("quality_score").alias("avg_quality"),
            avg("year").alias("avg_year")
        ).orderBy("cluster")
        
        print("Cluster distribution and characteristics:")
        cluster_analysis.show()
        
        return self.clusters_df

    def prepare_knn(self):
        """
        KNN preparation
        """
        # Convert to pandas with all necessary information
        features_pd = self.clusters_df.select(
            "tconst", "title", "features", "cluster", "average_rating", 
            "vote_count", "quality_score", "bayesian_rating", "genres", 
            "year", "duration", "era", "popularity_score", "main_genres"
        ).toPandas()
        
        # Extract numerical features
        self.features_array = np.array([row.toArray() for row in features_pd['features']])
        
        # Create a unique list of all genres
        all_genres = set()
        for genres_list in features_pd['main_genres']:
            if genres_list:  # Check that the list is not None
                all_genres.update(genres_list)
        
        self.genre_list = sorted(list(all_genres))
        self.genre_to_idx = {genre: idx for idx, genre in enumerate(self.genre_list)}
        
        # Create one-hot vectors for genres
        genre_features = []
        for movie_genres in features_pd['main_genres']:
            genre_vector = np.zeros(len(self.genre_list))
            if movie_genres: 
                for genre in movie_genres:
                    if genre in self.genre_to_idx:
                        genre_vector[self.genre_to_idx[genre]] = 1.0
            genre_features.append(genre_vector)
        
        self.genre_features = np.array(genre_features)
        
        # Normalize numerical features
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        normalized_numerical = scaler.fit_transform(self.features_array)
        
        # Combine numerical features and genres with weighting
        # More importance to genres to avoid off-topic recommendations
        numerical_weight = 0.3
        genre_weight = 0.7
        
        self.hybrid_features = np.concatenate([
            normalized_numerical * numerical_weight,
            self.genre_features * genre_weight
        ], axis=1)
        
        # Train KNN on hybrid features
        self.knn_model = NearestNeighbors(n_neighbors=50, metric='cosine', algorithm='brute')
        self.knn_model.fit(self.hybrid_features)
        
        # Store metadata
        self.movie_metadata = features_pd.to_dict('records')
        self.title_to_idx = {movie['title']: idx for idx, movie in enumerate(self.movie_metadata)}
        
        print("✅ Hybrid KNN model prepared")
        print(f"📊 Features: {self.hybrid_features.shape[1]} (numerical: {normalized_numerical.shape[1]}, genres: {self.genre_features.shape[1]})")
        print(f"🎭 Weighting: {numerical_weight*100}% numerical, {genre_weight*100}% genres")
    
    def get_advanced_movie_recommendations(self, movie_title, n_recommendations=5, min_genre_overlap=0.2):
        """
        Recommendations with simple genre overlap filtering
        """
        if movie_title not in self.title_to_idx:
            print(f"❌ Movie '{movie_title}' not found in the database")
            available_titles = [title for title in self.title_to_idx.keys() 
                              if movie_title.lower() in title.lower()]
            if available_titles:
                print(f"🔍 Similar available movies: {available_titles[:5]}")
            return []
        
        movie_idx = self.title_to_idx[movie_title]
        movie_info = self.movie_metadata[movie_idx]
        movie_genres = set(movie_info['main_genres']) if movie_info['main_genres'] else set()
        
        print(f"\n🎬 Reference movie: {movie_info['title']} ({movie_info['year']})")
        print(f"🎭 Genres: {', '.join(movie_genres) if movie_genres else 'Not specified'}")
        print(f"⭐ Rating: {movie_info['average_rating']:.1f} | 👥 Votes: {movie_info['vote_count']:,}")
        
        # Find nearest neighbors
        movie_features = self.hybrid_features[movie_idx].reshape(1, -1)
        distances, indices = self.knn_model.kneighbors(movie_features, n_neighbors=100)
        
        # Filter and score candidates
        candidates = []
        
        for distance, idx in zip(distances[0], indices[0]):
            if idx == movie_idx:  # Exclude the movie itself
                continue
            
            candidate = self.movie_metadata[idx]
            candidate_genres = set(candidate['main_genres']) if candidate['main_genres'] else set()
            
            # Calculate genre overlap (Jaccard similarity)
            if len(movie_genres) == 0 and len(candidate_genres) == 0:
                genre_overlap = 1.0  # Two movies without genres = similar
            elif len(movie_genres) == 0 or len(candidate_genres) == 0:
                genre_overlap = 0.0  # A movie without genre ≠ movie with genres
            else:
                intersection = len(movie_genres.intersection(candidate_genres))
                union = len(movie_genres.union(candidate_genres))
                genre_overlap = intersection / union if union > 0 else 0.0
            
            # Filter by minimum overlap
            if genre_overlap < min_genre_overlap:
                continue
            
            # Calculate composite score
            similarity = 1 - distance  # Convert cosine distance to similarity
            
            # Weighted final score
            composite_score = (
                similarity * 0.6 +                    # Global similarity
                genre_overlap * 0.3 +                 # Genre overlap
                (candidate['quality_score'] / 10) * 0.1  # Quality bonus
            )
            
            candidates.append({
                'movie': candidate,
                'composite_score': composite_score,
                'genre_overlap': genre_overlap,
                'similarity': similarity
            })
        
        # Sort by composite score
        candidates.sort(key=lambda x: x['composite_score'], reverse=True)
        
        # Take the best ones
        final_recommendations = candidates[:n_recommendations]
        
        # Display results
        print(f"\n🎯 TOP {len(final_recommendations)} RECOMMENDATIONS:")
        print("=" * 80)
        
        for i, rec in enumerate(final_recommendations, 1):
            movie = rec['movie']
            movie_genres_str = ', '.join(movie['main_genres']) if movie['main_genres'] else 'Not specified'
            
            print(f"{i}. 🎬 {movie['title']} ({movie['year']})")
            print(f"   ⭐ Rating: {movie['average_rating']:.1f} | 👥 {movie['vote_count']:,} votes")
            print(f"   🎭 Genres: {movie_genres_str}")
            print(f"   📊 Scores: Global: {rec['composite_score']:.3f} | "
                  f"Genres: {rec['genre_overlap']:.3f} | Similarity: {rec['similarity']:.3f}")
            print(f"   🏆 Quality: {movie['quality_score']:.2f}")
            print()
        
        return [rec['movie'] for rec in final_recommendations]
    
    def interactive_advanced_system(self):
        """
        Advanced interactive interface
        """
        while True:
            print("\n" + "="*60)
            print("🎬 ADVANCED MOVIE RECOMMENDATION SYSTEM")
            print("="*60)
            print("1. 🎯 Smart recommendations by genre")
            print("2. 🔍 Recommendations based on a movie")  
            print("3. 🏆 Top movies by genre and era")
            print("4. 🔧 Custom search")
            print("5. ❌ Exit")
            try:
                choice = input("\n➤ Choose an option (1-5): ").strip()
                
                if choice == "1":
                    genre = input("🎭 Preferred genre: ").strip()
                    min_rating = float(input("⭐ Minimum rating (ex: 7.5): ") or "7.0")
                    n_recs = int(input("📝 Number of recommendations (ex: 5): ") or "5")
                    self.get_smart_genre_recommendations(genre, n_recs, min_rating)
                    
                elif choice == "2":
                    movie_title = input("🎬 Reference movie name: ").strip()
                    n_recs = int(input("📝 Number of recommendations (ex: 5): ") or "5")
                    self.get_advanced_movie_recommendations(movie_title, n_recs)
                    
                elif choice == "3":
                    self.show_top_movies_by_category()
                    
                elif choice == "4":
                    self.custom_search()
                    
                elif choice == "5":
                    print("👋 Thank you for using the recommendation system!")
                    break
                    
                else:
                    print("❌ Invalid option. Choose between 1 and 5.")
                    
            except ValueError:
                print("❌ Input error. Please enter valid values.")
            except KeyboardInterrupt:
                print("\n👋 Goodbye!")
                break
            except Exception as e:
                print(f"❌ Error: {e}")
                
    def get_smart_genre_recommendations(self, genre_preference, n_recommendations=5, min_rating=7.0):
        """
        Simple recommendations by genre
        """
        recommendations = self.clusters_df.filter(
            col("genres").contains(genre_preference) &
            (col("average_rating") >= min_rating)
        ).select(
            "tconst", "title", "duration", "genres", "average_rating", 
            "vote_count", "quality_score", "year", "era"
        ).orderBy(desc("quality_score")).limit(n_recommendations)
        
        if recommendations.count() == 0:
            print(f"No movie found for genre '{genre_preference}' with rating >= {min_rating}")
            return []
        
        # Nice display
        print(f"\nRecommendations for genre '{genre_preference}':")
        print(f"Criteria: Rating >= {min_rating}")
        print("-" * 80)
        
        results = recommendations.collect()
        
        for i, movie in enumerate(results, 1):
            print(f"{i}. {movie.title} ({movie.year}) - {movie.era}")
            print(f"   Rating: {movie.average_rating:.1f} | Votes: {movie.vote_count:,} | "
                  f"Score: {movie.quality_score:.2f}")
            print(f"   Genres: {movie.genres} | Duration: {movie.duration}min")
            print()
        
        return recommendations

    def show_top_movies_by_category(self):
        """
        Print three rankings taken from the clustered movies: the top three
        by quality score for each listed decade and each listed genre, and
        up to ten highly rated, heavily voted "cult" movies.
        """
        def top_three(condition):
            # Best three movies by quality score among those matching `condition`.
            matching = self.clusters_df.filter(condition)
            return matching.orderBy(desc("quality_score")).limit(3).collect()

        def title_line(movie):
            return f"{movie.title} ({movie.year}) - {movie.average_rating:.1f}★"

        print("\n" + "=" * 60)
        print("🏆 TOP MOVIES BY CATEGORY")
        print("=" * 60)

        # --- Ranking per decade -------------------------------------------
        print("\n📅 BEST MOVIES BY DECADE:")
        print("-" * 40)

        decade_ranges = (
            ("2020s", 2020, 2025),
            ("2010s", 2010, 2019),
            ("2000s", 2000, 2009),
            ("1990s", 1990, 1999),
            ("1980s", 1980, 1989),
            ("1970s", 1970, 1979),
        )
        for label, first_year, last_year in decade_ranges:
            # between() is inclusive on both bounds.
            best_of_decade = top_three(col("year").between(first_year, last_year))
            if not best_of_decade:
                continue
            print(f"\n🎬 {label}:")
            for rank, movie in enumerate(best_of_decade, 1):
                print(f"   {rank}. {title_line(movie)}")
                print(f"      {movie.genres} | {movie.vote_count:,} votes")

        # --- Ranking per genre --------------------------------------------
        print("\n\n🎭 BEST MOVIES BY GENRE:")
        print("-" * 40)

        for genre_name in ("Action", "Drama", "Comedy", "Thriller", "Horror", "Romance", "Sci-Fi"):
            best_of_genre = top_three(col("genres").contains(genre_name))
            if not best_of_genre:
                continue
            print(f"\n🎯 {genre_name}:")
            for rank, movie in enumerate(best_of_genre, 1):
                print(f"   {rank}. {title_line(movie)}")
                print(f"      {movie.vote_count:,} votes | Score: {movie.quality_score:.2f}")

        # --- Cult movies (high rating + many votes) -------------------------
        print("\n\n🌟 CULT MOVIES (Rating ≥ 8.5 & Votes ≥ 500K):")
        print("-" * 50)

        is_cult = (col("average_rating") >= 8.5) & (col("vote_count") >= 500000)
        cult_ranking = (
            self.clusters_df
            .filter(is_cult)
            .orderBy(desc("average_rating"), desc("vote_count"))
            .limit(10)
            .collect()
        )
        for rank, movie in enumerate(cult_ranking, 1):
            print(f"{rank:2d}. {title_line(movie)}")
            print(f"     {movie.genres} | {movie.vote_count:,} votes")
    
    def custom_search(self):
        """
        Custom search with multiple filters
        """
        print("\n" + "="*60)
        print("🔧 CUSTOM SEARCH")
        print("="*60)
        
        # Collect criteria
        criteria = {}
        
        genre_filter = input("🎭 Genre (optional): ").strip()
        if genre_filter:
            criteria['genre'] = genre_filter
            
        try:
            min_year = input("📅 Minimum year (optional): ").strip()
            if min_year:
                criteria['min_year'] = int(min_year)
                
            max_year = input("📅 Maximum year (optional): ").strip()
            if max_year:
                criteria['max_year'] = int(max_year)
                
            min_rating = input("⭐ Minimum rating (optional): ").strip()
            if min_rating:
                criteria['min_rating'] = float(min_rating)
                
            min_votes = input("👥 Minimum votes (optional): ").strip()
            if min_votes:
                criteria['min_votes'] = int(min_votes)
                
            max_duration = input("⏱️ Maximum duration in min (optional): ").strip()
            if max_duration:
                criteria['max_duration'] = int(max_duration)
                
        except ValueError:
            print("❌ Invalid value. Using valid criteria only.")
        
        # Apply filters
        filtered_df = self.clusters_df
        
        if 'genre' in criteria:
            filtered_df = filtered_df.filter(col("genres").contains(criteria['genre']))
            
        if 'min_year' in criteria:
            filtered_df = filtered_df.filter(col("year") >= criteria['min_year'])
            
        if 'max_year' in criteria:
            filtered_df = filtered_df.filter(col("year") <= criteria['max_year'])
            
        if 'min_rating' in criteria:
            filtered_df = filtered_df.filter(col("average_rating") >= criteria['min_rating'])
            
        if 'min_votes' in criteria:
            filtered_df = filtered_df.filter(col("vote_count") >= criteria['min_votes'])
            
        if 'max_duration' in criteria:
            filtered_df = filtered_df.filter(col("duration") <= criteria['max_duration'])
        
        # Results
        count = filtered_df.count()
        print(f"\n📊 {count} movies found with these criteria")
        
        if count == 0:
            print("❌ No movie matches your criteria. Try less restrictive filters.")
            return
        
        # Sort results
        print("\n🔄 How to sort results?")
        print("1. By rating (descending)")
        print("2. By popularity (votes)")
        print("3. By quality score")
        print("4. By year (recent first)")
        print("5. By alphabetical order")
        
        sort_choice = input("➤ Sort choice (1-5, default=3): ").strip() or "3"
        
        if sort_choice == "1":
            filtered_df = filtered_df.orderBy(desc("average_rating"))
        elif sort_choice == "2":
            filtered_df = filtered_df.orderBy(desc("vote_count"))
        elif sort_choice == "3":
            filtered_df = filtered_df.orderBy(desc("quality_score"))
        elif sort_choice == "4":
            filtered_df = filtered_df.orderBy(desc("year"))
        elif sort_choice == "5":
            filtered_df = filtered_df.orderBy(asc("title"))
        
        # Display results
        limit = min(20, count)
        results = filtered_df.limit(limit).collect()
        
        print(f"\n🎬 TOP {limit} RESULTS:")
        print("-" * 80)
        
        for i, movie in enumerate(results, 1):
            print(f"{i:2d}. {movie.title} ({movie.year}) - {movie.average_rating:.1f}★")
            print(f"     {movie.genres} | {movie.duration}min | {movie.vote_count:,} votes")
            print(f"     Quality score: {movie.quality_score:.2f} | Cluster: {movie.cluster}")
            print()
    
    def cleanup(self):
        """
        Clean up resources
        """
        if self.spark:
            self.spark.stop()
            print("✅ Spark session closed")


# Main entry point
def main():
    """Run the full pipeline, then the interactive menu.

    Loads and preprocesses the data, trains K-means with 10 clusters,
    prepares the KNN index and starts the interactive interface. The Spark
    session is stopped on the way out even if one of the steps fails.
    """
    # System initialization
    print("🚀 Initializing recommendation system...")
    recommender = MovieRecommendationSystem()

    try:
        # Data loading and preprocessing
        print("📊 Loading data...")
        recommender.load_data()

        print("🔧 Preprocessing data...")
        recommender.preprocess_data()

        # Model training
        print("🤖 Training K-means model...")
        recommender.train_kmeans(k=10)  # More clusters for better precision

        print("🔍 Preparing KNN model...")
        recommender.prepare_knn()

        print("✅ System ready!")

        # Launch interactive interface
        recommender.interactive_advanced_system()
    finally:
        # Cleanup runs on success and on failure so the session never leaks.
        recommender.cleanup()


if __name__ == "__main__":
    main()

🚀 Initializing recommendation system...
📊 Loading data...
Data loaded and cleaned: 44438 movies
Data sample:
+---------+---------------------------------+--------+--------------+----------+----+-------------------------+
|tconst   |title                            |duration|average_rating|vote_count|year|genres                   |
+---------+---------------------------------+--------+--------------+----------+----+-------------------------+
|tt0010323|The Cabinet of Dr. Caligari      |67      |8.0           |72794     |1920|Horror,Mystery,Thriller  |
|tt0011000|Leaves From Satan's Book         |167     |6.6           |1402      |1920|Drama                    |
|tt0011130|Dr. Jekyll and Mr. Hyde          |69      |6.9           |6308      |1920|Drama,Horror,Sci-Fi      |
|tt0011157|Erotikon                         |106     |6.4           |1093      |1920|Comedy,Romance           |
|tt0011221|Genuine: The Tragedy of a Vampire|88      |5.9           |1199      |1920|Fantasy,Horror,Romance


➤ Choose an option (1-5):  2
🎬 Reference movie name:  Spider-man 3
📝 Number of recommendations (ex: 5):  5


❌ Movie 'Spider-man 3' not found in the database
🔍 Similar available movies: ['Spider-Man 3']

🎬 ADVANCED MOVIE RECOMMENDATION SYSTEM
1. 🎯 Smart recommendations by genre
2. 🔍 Recommendations based on a movie
3. 🏆 Top movies by genre and era
4. 🔧 Custom search
5. ❌ Exit



➤ Choose an option (1-5):  2
🎬 Reference movie name:  Spider-Man 3
📝 Number of recommendations (ex: 5):  5



🎬 Reference movie: Spider-Man 3 (2007)
🎭 Genres: Action, Sci-Fi, Adventure
⭐ Rating: 6.3 | 👥 Votes: 660,521

🎯 TOP 5 RECOMMENDATIONS:
1. 🎬 Maze Runner: The Scorch Trials (2015)
   ⭐ Rating: 6.3 | 👥 288,660 votes
   🎭 Genres: Action, Adventure, Sci-Fi
   📊 Scores: Global: 0.941 | Genres: 1.000 | Similarity: 0.992
   🏆 Quality: 4.62

2. 🎬 Terminator Genisys (2015)
   ⭐ Rating: 6.3 | 👥 298,457 votes
   🎭 Genres: Action, Adventure, Sci-Fi
   📊 Scores: Global: 0.940 | Genres: 1.000 | Similarity: 0.989
   🏆 Quality: 4.62

3. 🎬 Godzilla (2014)
   ⭐ Rating: 6.4 | 👥 451,420 votes
   🎭 Genres: Action, Adventure, Sci-Fi
   📊 Scores: Global: 0.939 | Genres: 1.000 | Similarity: 0.986
   🏆 Quality: 4.70

4. 🎬 Waterworld (1995)
   ⭐ Rating: 6.3 | 👥 216,231 votes
   🎭 Genres: Action, Adventure, Sci-Fi
   📊 Scores: Global: 0.939 | Genres: 1.000 | Similarity: 0.988
   🏆 Quality: 4.60

5. 🎬 Batman v Superman: Dawn of Justice (2016)
   ⭐ Rating: 6.5 | 👥 766,987 votes
   🎭 Genres: Action, Adventure, Sci-F


➤ Choose an option (1-5):  1
🎭 Preferred genre:  Action
⭐ Minimum rating (ex: 7.5):  8
📝 Number of recommendations (ex: 5):  5



Recommendations for genre 'Action':
Criteria: Rating >= 8.0
--------------------------------------------------------------------------------
1. The Dark Knight (2008) - Contemporary
   Rating: 9.0 | Votes: 2,980,743 | Score: 6.57
   Genres: Action,Crime,Drama | Duration: 152min

2. Inception (2010) - Recent
   Rating: 8.8 | Votes: 2,648,271 | Score: 6.42
   Genres: Action,Adventure,Sci-Fi | Duration: 148min

3. The Matrix (1999) - Modern
   Rating: 8.7 | Votes: 2,129,369 | Score: 6.34
   Genres: Action,Sci-Fi | Duration: 136min

4. Star Wars: Episode V - The Empire Strikes Back (1980) - Modern
   Rating: 8.7 | Votes: 1,424,881 | Score: 6.31
   Genres: Action,Adventure,Fantasy | Duration: 124min

5. Star Wars: Episode IV - A New Hope (1977) - Vintage
   Rating: 8.6 | Votes: 1,492,821 | Score: 6.24
   Genres: Action,Adventure,Fantasy | Duration: 121min


🎬 ADVANCED MOVIE RECOMMENDATION SYSTEM
1. 🎯 Smart recommendations by genre
2. 🔍 Recommendations based on a movie
3. 🏆 Top movies by ge


➤ Choose an option (1-5):  2
🎬 Reference movie name:  Spider-Man
📝 Number of recommendations (ex: 5):  5



🎬 Reference movie: Spider-Man (2002)
🎭 Genres: Action, Sci-Fi, Adventure
⭐ Rating: 7.4 | 👥 Votes: 911,741

🎯 TOP 5 RECOMMENDATIONS:
1. 🎬 Spider-Man 2 (2004)
   ⭐ Rating: 7.5 | 👥 736,369 votes
   🎭 Genres: Action, Adventure, Sci-Fi
   📊 Scores: Global: 0.954 | Genres: 1.000 | Similarity: 0.998
   🏆 Quality: 5.44

2. 🎬 The Fifth Element (1997)
   ⭐ Rating: 7.6 | 👥 521,774 votes
   🎭 Genres: Action, Adventure, Sci-Fi
   📊 Scores: Global: 0.952 | Genres: 1.000 | Similarity: 0.996
   🏆 Quality: 5.47

3. 🎬 Captain America: The Winter Soldier (2014)
   ⭐ Rating: 7.7 | 👥 923,930 votes
   🎭 Genres: Action, Adventure, Sci-Fi
   📊 Scores: Global: 0.950 | Genres: 1.000 | Similarity: 0.990
   🏆 Quality: 5.60

4. 🎬 Spider-Man: Homecoming (2017)
   ⭐ Rating: 7.4 | 👥 747,222 votes
   🎭 Genres: Action, Adventure, Sci-Fi
   📊 Scores: Global: 0.950 | Genres: 1.000 | Similarity: 0.993
   🏆 Quality: 5.38

5. 🎬 Total Recall (1990)
   ⭐ Rating: 7.5 | 👥 364,492 votes
   🎭 Genres: Action, Adventure, Sci-Fi
  


➤ Choose an option (1-5):  2
🎬 Reference movie name:  Spider-Ma
📝 Number of recommendations (ex: 5):  5


❌ Movie 'Spider-Ma' not found in the database
🔍 Similar available movies: ['Spider-Man 3', 'Spider-Man: Lotus', 'The Amazing Spider-Man 2', 'Spider-Man: Across the Spider-Verse', 'Vjeran Tomic: The Spider-Man of Paris']

🎬 ADVANCED MOVIE RECOMMENDATION SYSTEM
1. 🎯 Smart recommendations by genre
2. 🔍 Recommendations based on a movie
3. 🏆 Top movies by genre and era
4. 🔧 Custom search
5. ❌ Exit



➤ Choose an option (1-5):  3



🏆 TOP MOVIES BY CATEGORY

📅 BEST MOVIES BY DECADE:
----------------------------------------

🎬 2020s:
   1. Dune: Part Two (2024) - 8.5★
      Action,Adventure,Drama | 593,840 votes
   2. The Kashmir Files (2022) - 8.5★
      Drama | 575,866 votes
   3. Spider-Man: Across the Spider-Verse (2023) - 8.5★
      Action,Adventure,Animation | 435,092 votes

🎬 2010s:
   1. Inception (2010) - 8.8★
      Action,Adventure,Sci-Fi | 2,648,271 votes
   2. Interstellar (2014) - 8.7★
      Adventure,Drama,Sci-Fi | 2,274,060 votes
   3. Django Unchained (2012) - 8.5★
      Comedy,Drama,Western | 1,764,198 votes

🎬 2000s:
   1. The Dark Knight (2008) - 9.0★
      Action,Crime,Drama | 2,980,743 votes
   2. The Lord of the Rings: The Return of the King (2003) - 9.0★
      Adventure,Drama,Fantasy | 2,054,070 votes
   3. The Lord of the Rings: The Fellowship of the Ring (2001) - 8.9★
      Adventure,Drama,Fantasy | 2,083,263 votes

🎬 1990s:
   1. The Shawshank Redemption (1994) - 9.3★
      Drama | 3,001,


➤ Choose an option (1-5):  4



🔧 CUSTOM SEARCH


🎭 Genre (optional):  Action
📅 Minimum year (optional):  2022
📅 Maximum year (optional):  
⭐ Minimum rating (optional):  8
👥 Minimum votes (optional):  
⏱️ Maximum duration in min (optional):  



📊 62 movies found with these criteria

🔄 How to sort results?
1. By rating (descending)
2. By popularity (votes)
3. By quality score
4. By year (recent first)
5. By alphabetical order


➤ Sort choice (1-5, default=3):  1



🎬 TOP 20 RESULTS:
--------------------------------------------------------------------------------
 1. Jithender Reddy (2024) - 9.5★
     Action | 136min | 2,308 votes
     Quality score: 4.50 | Cluster: 4

 2. Devaki Nandana Vasudeva (2024) - 9.4★
     Action | 128min | 2,532 votes
     Quality score: 4.51 | Cluster: 4

 3. Attack on Titan the Movie: The Last Attack (2024) - 9.3★
     Action,Animation,Drama | 145min | 11,082 votes
     Quality score: 4.86 | Cluster: 0

 4. Kaveri (2024) - 9.1★
     Action | 101min | 1,462 votes
     Quality score: 4.44 | Cluster: 1

 5. Usha Parinayam (2024) - 9.0★
     Action,Romance | 144min | 2,048 votes
     Quality score: 4.47 | Cluster: 7

 6. Gally Gang Stars (2024) - 9.0★
     Action,Comedy,Crime | 157min | 2,006 votes
     Quality score: 4.47 | Cluster: 7

 7. Nenu Keerthana (2024) - 9.0★
     Action,Comedy,Drama | 155min | 2,047 votes
     Quality score: 4.47 | Cluster: 7

 8. Sarsenapati Hambirrao (2022) - 8.9★
     Action,Drama,History | 


➤ Choose an option (1-5):  1
🎭 Preferred genre:  Action
⭐ Minimum rating (ex: 7.5):  huit


❌ Input error. Please enter valid values.

🎬 ADVANCED MOVIE RECOMMENDATION SYSTEM
1. 🎯 Smart recommendations by genre
2. 🔍 Recommendations based on a movie
3. 🏆 Top movies by genre and era
4. 🔧 Custom search
5. ❌ Exit

👋 Goodbye!


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
