In [1]:
!pip install gradio pandas matplotlib seaborn wordcloud numpy scikit-learn joblib



In [9]:
# Simple Netflix Data Analysis Dashboard with Gradio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
import io
import base64
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ML imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# Set matplotlib backend and style.
# 'Agg' is matplotlib's non-interactive raster backend: figures are rendered
# off-screen (no GUI window), which suits plots that are only saved to image files.
plt.switch_backend('Agg')
plt.style.use('default')

class NetflixAnalyzer:
    def __init__(self):
        """Create an analyzer holding the bundled demo data and no trained models."""
        # Dataset state: nothing is loaded until load_data() runs.
        self.data = None
        self.sample_data = self.create_sample_data()

        # Success-prediction state, filled in by train_success_prediction_model().
        self.ml_model = None
        self.scaler = StandardScaler()
        self.label_encoders = {}

        # Recommendation state: the vectorizer is created by
        # build_recommendation_system(); content_features presumably holds its
        # output (not visible in this part of the file).
        self.content_features = None
        self.tfidf_vectorizer = None

    def create_sample_data(self):
        """Create sample data for demonstration"""
        sample_data = {
            'show_id': ['s1', 's3', 's6', 's14', 's8', 's20', 's25', 's30', 's35', 's40',
                       's45', 's50', 's55', 's60', 's65', 's70', 's75', 's80', 's85', 's90'],
            'type': ['Movie', 'TV Show', 'TV Show', 'Movie', 'Movie', 'Movie', 'TV Show',
                    'Movie', 'TV Show', 'Movie', 'Movie', 'TV Show', 'Movie', 'TV Show',
                    'Movie', 'TV Show', 'Movie', 'Movie', 'TV Show', 'Movie'],
            'title': ['Dick Johnson Is Dead', 'Ganglands', 'Midnight Mass', 'Confessions of an Invisible Girl',
                     'Sankofa', 'The Social Dilemma', 'Stranger Things', 'Extraction', 'The Crown',
                     'Bird Box', 'Roma', 'Money Heist', 'The Irishman', 'Dark', 'Marriage Story',
                     'Black Mirror', 'Klaus', 'The Platform', 'Ozark', 'Okja'],
            'director': ['Kirsten Johnson', 'Julien Leclercq', 'Mike Flanagan', 'Bruno Garotti',
                        'Haile Gerima', 'Jeff Orlowski', 'Matt Duffer', 'Sam Hargrave',
                        'Peter Morgan', 'Susanne Bier', 'Alfonso Cuarón', 'Álex Pina', 'Martin Scorsese',
                        'Baran bo Odar', 'Noah Baumbach', 'Charlie Brooker', 'Sergio Pablos',
                        'Galder Gaztelu-Urrutia', 'Jason Bateman', 'Bong Joon-ho'],
            'country': ['United States', 'France', 'United States', 'Brazil', 'United States',
                       'United States', 'United States', 'United States', 'United Kingdom',
                       'United States', 'Mexico', 'Spain', 'United States', 'Germany',
                       'United States', 'United Kingdom', 'Spain', 'Spain', 'United States', 'South Korea'],
            'date_added': ['9/25/2021', '9/24/2021', '9/24/2021', '9/22/2021', '9/24/2021',
                          '9/9/2020', '7/15/2019', '4/24/2020', '11/9/2019', '12/21/2018',
                          '12/14/2018', '12/20/2017', '11/27/2019', '6/27/2020', '12/6/2019',
                          '6/5/2019', '11/15/2019', '3/20/2020', '7/21/2017', '6/28/2017'],
            'release_year': [2020, 2021, 2021, 2021, 1993, 2020, 2016, 2020, 2016, 2018,
                            2018, 2017, 2019, 2017, 2019, 2011, 2019, 2019, 2017, 2017],
            'rating': ['PG-13', 'TV-MA', 'TV-MA', 'TV-PG', 'TV-MA', 'PG-13', 'TV-14', 'R',
                      'TV-MA', 'R', 'R', 'TV-MA', 'R', 'TV-MA', 'R', 'TV-MA', 'PG', 'TV-MA',
                      'TV-MA', 'TV-PG'],
            'duration': ['90 min', '1 Season', '1 Season', '91 min', '125 min', '94 min',
                        '4 Seasons', '116 min', '4 Seasons', '124 min', '135 min', '5 Seasons',
                        '209 min', '3 Seasons', '137 min', '5 Seasons', '96 min', '104 min',
                        '4 Seasons', '118 min'],
            'listed_in': ['Documentaries', 'Crime TV Shows, International TV Shows', 'TV Dramas, Horror TV Shows',
                         'Children & Family Movies, Comedies', 'Dramas, Independent Movies', 'Documentaries',
                         'TV Dramas, TV Sci-Fi & Fantasy', 'Action & Adventure, Thrillers', 'British TV Shows, TV Dramas',
                         'Horror Movies, Thrillers', 'Dramas, International Movies', 'Crime TV Shows, International TV Shows',
                         'Crime Movies, Dramas', 'Crime TV Shows, International TV Shows', 'Dramas, Independent Movies',
                         'British TV Shows, TV Sci-Fi & Fantasy', 'Children & Family Movies, Comedies',
                         'Horror Movies, International Movies', 'Crime TV Shows, TV Dramas', 'Children & Family Movies, Comedies']
        }
        return pd.DataFrame(sample_data)

    def get_sample_data_table(self):
        """Return sample data as HTML table for display"""
        try:
            if self.data is not None:
                display_data = self.data.head(10)
            else:
                display_data = self.sample_data.head(10)

            # Convert to HTML table with styling
            html_table = display_data.to_html(index=False, classes='table table-striped', escape=False)
            return html_table
        except Exception as e:
            return f"❌ Error displaying sample data: {str(e)}"

    def load_data(self, file):
        """Load data from uploaded file or use sample data"""
        try:
            if file is not None:
                self.data = pd.read_csv(file.name)
                return f"✅ Data loaded successfully! Shape: {self.data.shape}"
            else:
                self.data = self.sample_data.copy()
                return "📊 Using sample Netflix data for demonstration."
        except Exception as e:
            self.data = self.sample_data.copy()
            return f"❌ Error loading file: {str(e)}\n🔄 Using sample data instead."

    def clean_data(self):
        """Enhanced data cleaning"""
        try:
            if self.data is None:
                return "❌ No data loaded!"

            original_shape = self.data.shape

            # Remove duplicates
            self.data = self.data.drop_duplicates()

            # Clean text columns - remove HTML entities and extra spaces
            text_columns = ['title', 'director', 'country', 'listed_in']
            for col in text_columns:
                if col in self.data.columns:
                    self.data[col] = self.data[col].astype(str)
                    self.data[col] = self.data[col].str.replace('&amp;', '&', regex=False)
                    self.data[col] = self.data[col].str.replace('&amp;amp;', '&', regex=False)
                    self.data[col] = self.data[col].str.strip()
                    # Replace 'nan' strings with actual NaN
                    self.data[col] = self.data[col].replace('nan', np.nan)

            # Convert date_added to datetime with better handling
            if 'date_added' in self.data.columns:
                self.data['date_added'] = pd.to_datetime(self.data['date_added'], errors='coerce')
                # Extract date components
                self.data['year_added'] = self.data['date_added'].dt.year
                self.data['month_added'] = self.data['date_added'].dt.month

            # Clean rating column
            if 'rating' in self.data.columns:
                self.data['rating'] = self.data['rating'].str.strip()

            # Clean release_year column
            if 'release_year' in self.data.columns:
                self.data['release_year'] = pd.to_numeric(self.data['release_year'], errors='coerce')

            return f"✅ Enhanced data cleaning completed!\nOriginal: {original_shape} → Cleaned: {self.data.shape}\n📅 Date columns added\n🧹 Text cleaned & HTML entities removed"
        except Exception as e:
            return f"❌ Error cleaning data: {str(e)}"

    def create_plot_image(self, fig):
        """Save a matplotlib figure to a temporary PNG file and close the figure.

        Args:
            fig: The matplotlib figure to render.

        Returns:
            str | None: Path of the PNG file, or ``None`` if the file could not
            be written. The figure is closed in both cases. The caller owns the
            returned file; it is not deleted automatically.
        """
        import os
        import tempfile

        path = None
        try:
            # mkstemp returns an open descriptor; close it right away so no
            # handle is leaked and savefig can reopen the path itself (an open
            # handle blocks that on some platforms).
            descriptor, path = tempfile.mkstemp(suffix='.png')
            os.close(descriptor)
            fig.savefig(path, format='png', bbox_inches='tight', dpi=100, facecolor='white')
            return path
        except Exception:
            # Do not leave an empty or partial image file behind.
            if path is not None:
                try:
                    os.remove(path)
                except OSError:
                    pass
            return None
        finally:
            plt.close(fig)

    def content_distribution_analysis(self):
        """Analyze content distribution only"""
        try:
            if self.data is None:
                return "❌ No data loaded!", None

            # Create single plot for content distribution
            fig, ax = plt.subplots(1, 1, figsize=(10, 6))

            # Content Distribution Analysis
            content_counts = self.data['type'].value_counts()
            content_counts.plot(kind='bar', ax=ax, color=['#ff7f7f', '#7fbfff'])
            ax.set_title('Content Distribution on Netflix', fontsize=16, pad=20)
            ax.set_xlabel('Content Type', fontsize=12)
            ax.set_ylabel('Count', fontsize=12)
            ax.tick_params(axis='x', rotation=0)

            # Add value labels on bars
            for i, v in enumerate(content_counts.values):
                ax.text(i, v + 0.1, str(v), ha='center', va='bottom', fontsize=11, fontweight='bold')

            plt.tight_layout()

            # Generate content insights
            total = len(self.data)
            insights = "🎬 CONTENT DISTRIBUTION ANALYSIS:\n\n"
            for content_type, count in content_counts.items():
                percentage = (count / total) * 100
                insights += f"• {content_type}: {count} titles ({percentage:.1f}%)\n"

            insights += f"\n📊 SUMMARY:\n"
            insights += f"• Total Content: {total} titles\n"
            insights += f"• Content Types: {len(content_counts)} categories\n"
            if content_counts.iloc[0] > content_counts.iloc[1]:
                insights += f"• {content_counts.index[0]} content dominates the platform\n"
            else:
                insights += f"• Balanced mix of content types\n"

            plot_img = self.create_plot_image(fig)
            return insights, plot_img

        except Exception as e:
            return f"❌ Error in content distribution analysis: {str(e)}", None

    def ratings_distribution_analysis(self):
        """Analyze ratings distribution only"""
        try:
            if self.data is None:
                return "❌ No data loaded!", None

            # Create single plot for ratings distribution
            fig, ax = plt.subplots(1, 1, figsize=(12, 6))

            # Ratings Analysis
            rating_counts = self.data['rating'].value_counts().head(10)
            rating_counts.plot(kind='bar', ax=ax, color='steelblue')
            ax.set_title('Top Content Ratings Distribution', fontsize=16, pad=20)
            ax.set_xlabel('Rating', fontsize=12)
            ax.set_ylabel('Count', fontsize=12)
            ax.tick_params(axis='x', rotation=45)

            # Add value labels
            for i, v in enumerate(rating_counts.values):
                ax.text(i, v + 0.1, str(v), ha='center', va='bottom', fontsize=10, fontweight='bold')

            plt.tight_layout()

            # Generate ratings insights
            insights = "⭐ RATINGS DISTRIBUTION ANALYSIS:\n\n"
            insights += f"Total unique ratings: {len(self.data['rating'].unique())}\n\n"
            insights += "Top 5 Ratings:\n"
            for i, (rating, count) in enumerate(rating_counts.head(5).items(), 1):
                percentage = (count / len(self.data)) * 100
                insights += f"{i}. {rating}: {count} titles ({percentage:.1f}%)\n"

            # Add rating category analysis
            insights += f"\n🔞 RATING CATEGORIES:\n"
            mature_ratings = ['R', 'TV-MA', 'NC-17']
            family_ratings = ['G', 'PG', 'TV-G', 'TV-Y', 'TV-Y7']
            teen_ratings = ['PG-13', 'TV-14']

            mature_count = sum([self.data['rating'].value_counts().get(rating, 0) for rating in mature_ratings])
            family_count = sum([self.data['rating'].value_counts().get(rating, 0) for rating in family_ratings])
            teen_count = sum([self.data['rating'].value_counts().get(rating, 0) for rating in teen_ratings])

            insights += f"• Mature Content: {mature_count} titles ({(mature_count/len(self.data)*100):.1f}%)\n"
            insights += f"• Family Content: {family_count} titles ({(family_count/len(self.data)*100):.1f}%)\n"
            insights += f"• Teen Content: {teen_count} titles ({(teen_count/len(self.data)*100):.1f}%)\n"

            plot_img = self.create_plot_image(fig)
            return insights, plot_img

        except Exception as e:
            return f"❌ Error in ratings distribution analysis: {str(e)}", None

    def countries_analysis(self):
        """Analyze content by countries"""
        try:
            if self.data is None:
                return "❌ No data loaded!", None

            country_counts = self.data['country'].value_counts().head(10)

            # Create horizontal bar plot
            fig, ax = plt.subplots(1, 1, figsize=(10, 8))
            country_counts.plot(kind='barh', ax=ax, color='green', alpha=0.7)
            ax.set_title('Top 10 Countries with Most Netflix Content', fontsize=14, pad=20)
            ax.set_xlabel('Number of Titles')
            ax.set_ylabel('Country')

            # Add value labels
            for i, v in enumerate(country_counts.values):
                ax.text(v + 0.1, i, str(v), ha='left', va='center')

            plt.tight_layout()

            # Generate insights
            insights = "🌍 COUNTRIES ANALYSIS:\n\n"
            insights += f"Content from {len(self.data['country'].unique())} different countries\n\n"
            insights += "Top 5 Countries:\n"
            for i, (country, count) in enumerate(country_counts.head(5).items(), 1):
                percentage = (count / len(self.data)) * 100
                insights += f"{i}. {country}: {count} titles ({percentage:.1f}%)\n"

            plot_img = self.create_plot_image(fig)
            return insights, plot_img

        except Exception as e:
            return f"❌ Error in countries analysis: {str(e)}", None

    def directors_analysis(self):
        """Analyze top 10 directors"""
        try:
            if self.data is None:
                return "❌ No data loaded!", None

            directors = self.data['director'].value_counts().head(10)  # CHANGED: from 8 to 10

            # Create horizontal bar plot
            fig, ax = plt.subplots(1, 1, figsize=(10, 8))  # CHANGED: increased height for 10 directors
            directors.plot(kind='barh', ax=ax, color='orange', alpha=0.7)
            ax.set_title('Top 10 Directors with Most Netflix Content', fontsize=14, pad=20)  # CHANGED: title
            ax.set_xlabel('Number of Titles')
            ax.set_ylabel('Director')

            # Add value labels
            for i, v in enumerate(directors.values):
                ax.text(v + 0.1, i, str(v), ha='left', va='center')

            plt.tight_layout()

            # Generate insights
            insights = "🎭 DIRECTORS ANALYSIS:\n\n"
            insights += f"Top 10 Directors by Content Volume:\n"  # CHANGED: from "Top Directors" to "Top 10 Directors"
            for i, (director, count) in enumerate(directors.items(), 1):
                insights += f"{i}. {director}: {count} titles\n"

            plot_img = self.create_plot_image(fig)
            return insights, plot_img

        except Exception as e:
            return f"❌ Error in directors analysis: {str(e)}", None

    def genres_analysis(self):
        """Analyze content by genres with enhanced features"""
        try:
            if self.data is None:
                return "❌ No data loaded!", None

            # Extract individual genres from listed_in column
            all_genres = []
            genre_counts_per_content = []  # NEW: Count genres per content

            for genres_list in self.data['listed_in'].dropna():
                # Split by comma and clean up
                genres = [genre.strip() for genre in genres_list.split(',')]
                all_genres.extend(genres)
                genre_counts_per_content.append(len(genres))  # NEW: Count genres per content

            # Count genre frequencies
            genre_counts = pd.Series(all_genres).value_counts().head(12)

            # NEW: Calculate average genres per content
            avg_genres_per_content = np.mean(genre_counts_per_content)

            # NEW: Find top genre
            top_genre = genre_counts.index[0] if len(genre_counts) > 0 else "Unknown"

            # Create horizontal bar plot
            fig, ax = plt.subplots(1, 1, figsize=(12, 8))
            genre_counts.plot(kind='barh', ax=ax, color='purple', alpha=0.7)
            ax.set_title('Top 12 Most Popular Genres on Netflix', fontsize=14, pad=20)
            ax.set_xlabel('Number of Titles')
            ax.set_ylabel('Genre')

            # Add value labels
            for i, v in enumerate(genre_counts.values):
                ax.text(v + 0.1, i, str(v), ha='left', va='center')

            plt.tight_layout()

            # Generate insights - ENHANCED
            insights = "🎭 GENRES ANALYSIS:\n\n"
            insights += f"📊 TOP GENRE: {top_genre} ({genre_counts.iloc[0]} titles)\n"  # NEW
            insights += f"📈 Average genres per content: {avg_genres_per_content:.1f}\n"  # NEW
            insights += f"Total unique genres: {len(pd.Series(all_genres).unique())}\n"
            insights += f"Total genre mentions: {len(all_genres)}\n\n"

            # NEW: Genre distribution stats
            insights += f"🔢 GENRE COUNT DISTRIBUTION:\n"
            genre_count_dist = pd.Series(genre_counts_per_content).value_counts().sort_index()
            for count, freq in genre_count_dist.head(5).items():
                insights += f"• {count} genre(s): {freq} titles\n"
            insights += "\n"

            insights += "Top 10 Most Popular Genres:\n"
            for i, (genre, count) in enumerate(genre_counts.head(10).items(), 1):
                percentage = (count / len(self.data)) * 100
                insights += f"{i}. {genre}: {count} titles ({percentage:.1f}%)\n"

            plot_img = self.create_plot_image(fig)
            return insights, plot_img

        except Exception as e:
            return f"❌ Error in genres analysis: {str(e)}", None

    def get_movie_details(self, movie_title):
        """NEW FEATURE: Get detailed information about a specific movie including genre count and duration extraction"""
        try:
            if self.data is None:
                return "❌ No data loaded!"

            # Search for the movie (case insensitive, partial match)
            movie_matches = self.data[self.data['title'].str.contains(movie_title, case=False, na=False)]

            if movie_matches.empty:
                # Show available movie titles for suggestion
                available_movies = self.data['title'].head(10).tolist()
                return f"❌ Movie '{movie_title}' not found!\n\n🎬 Available movies to try:\n" + "\n".join([f"• {movie}" for movie in available_movies])

            # Get the first match
            movie = movie_matches.iloc[0]

            # Extract duration in minutes
            duration_text = str(movie['duration']) if pd.notna(movie['duration']) else "Unknown"
            duration_minutes = 0
            duration_info = "Duration info not available"

            if 'min' in duration_text:
                try:
                    duration_minutes = int(duration_text.split(' ')[0])
                    duration_info = f"{duration_minutes} minutes"
                except:
                    duration_info = duration_text
            elif 'Season' in duration_text:
                try:
                    seasons = int(duration_text.split(' ')[0])
                    duration_info = f"{seasons} Season(s) - Estimated {seasons * 10} episodes"
                except:
                    duration_info = duration_text
            else:
                duration_info = duration_text

            # Count genres for this movie
            genres_list = str(movie['listed_in']) if pd.notna(movie['listed_in']) else ""
            if genres_list and genres_list != 'nan':
                genres = [genre.strip() for genre in genres_list.split(',')]
                genre_count = len(genres)
            else:
                genres = []
                genre_count = 0

            # Calculate content age
            current_year = datetime.now().year
            content_age = current_year - movie['release_year'] if pd.notna(movie['release_year']) else "Unknown"

            # Generate detailed report
            result = f"🎬 DETAILED MOVIE INFORMATION\n"
            result += "=" * 50 + "\n\n"

            result += f"📽️ TITLE: {movie['title']}\n"
            result += f"🎭 TYPE: {movie['type']}\n"
            result += f"🎬 DIRECTOR: {movie['director'] if pd.notna(movie['director']) else 'Unknown'}\n"
            result += f"🌍 COUNTRY: {movie['country'] if pd.notna(movie['country']) else 'Unknown'}\n"
            result += f"📅 RELEASE YEAR: {movie['release_year'] if pd.notna(movie['release_year']) else 'Unknown'}\n"
            result += f"🔞 RATING: {movie['rating'] if pd.notna(movie['rating']) else 'Unknown'}\n\n"

            # NEW FEATURES - Duration and Genre Analysis
            result += f"⏱️ DURATION ANALYSIS:\n"
            result += f"• Original Duration: {duration_text}\n"
            result += f"• Extracted Duration: {duration_info}\n"
            if duration_minutes > 0:
                result += f"• Duration in Minutes: {duration_minutes} min\n"
                result += f"• Duration Category: "
                if duration_minutes < 90:
                    result += "Short Film/Episode\n"
                elif duration_minutes < 120:
                    result += "Standard Length\n"
                elif duration_minutes < 180:
                    result += "Long Feature\n"
                else:
                    result += "Epic Length\n"
            result += "\n"

            result += f"🎭 GENRE ANALYSIS:\n"
            result += f"• Total Genre Count: {genre_count}\n"
            if genres:
                result += f"• All Genres: {', '.join(genres)}\n"
                result += f"• Primary Genre: {genres[0]}\n"
                if len(genres) > 1:
                    result += f"• Secondary Genres: {', '.join(genres[1:])}\n"

                # Genre category analysis
                result += f"\n🏷️ GENRE CATEGORIES:\n"
                action_genres = ['Action', 'Adventure', 'Thrillers']
                drama_genres = ['Dramas', 'Romantic Movies']
                comedy_genres = ['Comedies', 'Romantic Comedies']
                family_genres = ['Children & Family Movies', 'Family']

                for category, keywords in [('Action/Adventure', action_genres),
                                         ('Drama', drama_genres),
                                         ('Comedy', comedy_genres),
                                         ('Family', family_genres)]:
                    if any(keyword in genre for keyword in keywords for genre in genres):
                        result += f"• {category}: ✅\n"
            else:
                result += f"• No genre information available\n"

            result += f"\n📊 ADDITIONAL METRICS:\n"
            result += f"• Content Age: {content_age} years old\n"
            result += f"• Date Added: {movie['date_added'].strftime('%B %d, %Y') if pd.notna(movie['date_added']) else 'Unknown'}\n"

            # Similar content analysis
            if genre_count > 0:
                similar_count = 0
                for _, other_movie in self.data.iterrows():
                    if other_movie['title'] != movie['title']:
                        other_genres = str(other_movie['listed_in']) if pd.notna(other_movie['listed_in']) else ""
                        if other_genres:
                            other_genre_list = [g.strip() for g in other_genres.split(',')]
                            # Check for genre overlap
                            if any(genre in other_genre_list for genre in genres):
                                similar_count += 1

                result += f"• Similar Genre Content: ~{similar_count} titles with overlapping genres\n"

            result += f"\n🎯 CONTENT SUMMARY:\n"
            if duration_minutes > 0 and genre_count > 0:
                result += f"• This {duration_info} {movie['type'].lower()} spans {genre_count} genre(s)\n"
            result += f"• Primary focus: {genres[0] if genres else 'Unknown genre'}\n"
            if movie['type'] == 'Movie' and duration_minutes > 0:
                result += f"• Watch time commitment: {duration_minutes // 60}h {duration_minutes % 60}m\n"

            return result

        except Exception as e:
            return f"❌ Error getting movie details: {str(e)}"

    def prepare_ml_features(self):
        """Prepare features for ML models"""
        try:
            if self.data is None:
                return "❌ No data loaded!"

            # Create a copy for ML processing
            ml_data = self.data.copy()

            # Extract duration in minutes for movies, seasons for TV shows
            ml_data['duration_numeric'] = 0
            for idx, row in ml_data.iterrows():
                if pd.notna(row['duration']):
                    if 'min' in str(row['duration']):
                        ml_data.loc[idx, 'duration_numeric'] = int(str(row['duration']).split(' ')[0])
                    elif 'Season' in str(row['duration']):
                        ml_data.loc[idx, 'duration_numeric'] = int(str(row['duration']).split(' ')[0]) * 60  # Convert seasons to minutes equivalent

            # Create age of content feature
            current_year = datetime.now().year
            ml_data['content_age'] = current_year - ml_data['release_year']

            # Extract primary genre
            ml_data['primary_genre'] = ml_data['listed_in'].str.split(',').str[0]

            # Create success score based on multiple factors (synthetic for demo)
            np.random.seed(42)
            ml_data['success_score'] = (
                np.random.normal(0.5, 0.2, len(ml_data)) +
                (ml_data['duration_numeric'] / 200) * 0.3 +
                (2024 - ml_data['release_year']) * 0.01
            )
            ml_data['success_score'] = np.clip(ml_data['success_score'], 0, 1)

            # Create success categories
            ml_data['success_category'] = pd.cut(ml_data['success_score'],
                                               bins=[0, 0.3, 0.7, 1.0],
                                               labels=['Low', 'Medium', 'High'])

            return ml_data

        except Exception as e:
            return f"❌ Error preparing ML features: {str(e)}"

    def train_success_prediction_model(self):
        """Train content success prediction model"""
        try:
            ml_data = self.prepare_ml_features()
            if isinstance(ml_data, str):
                return ml_data

            # Select features for training
            feature_columns = ['release_year', 'duration_numeric', 'content_age']
            categorical_features = ['type', 'country', 'rating', 'primary_genre']

            # Prepare features
            X = ml_data[feature_columns].copy()

            # Encode categorical variables
            for col in categorical_features:
                if col in ml_data.columns:
                    le = LabelEncoder()
                    ml_data[col] = ml_data[col].fillna('Unknown')
                    X[col + '_encoded'] = le.fit_transform(ml_data[col])
                    self.label_encoders[col] = le

            # Target variable
            y = ml_data['success_category'].dropna()
            X = X.loc[y.index]

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Scale features
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)

            # Train model
            self.ml_model = RandomForestClassifier(n_estimators=100, random_state=42)
            self.ml_model.fit(X_train_scaled, y_train)

            # Make predictions
            y_pred = self.ml_model.predict(X_test_scaled)
            accuracy = accuracy_score(y_test, y_pred)

            # Feature importance
            feature_names = X.columns
            importances = self.ml_model.feature_importances_

            # Create feature importance plot
            fig, ax = plt.subplots(1, 1, figsize=(10, 6))
            indices = np.argsort(importances)[::-1][:10]
            ax.bar(range(len(indices)), importances[indices])
            ax.set_title('Top 10 Feature Importances for Content Success Prediction')
            ax.set_xlabel('Features')
            ax.set_ylabel('Importance')
            ax.set_xticks(range(len(indices)))
            ax.set_xticklabels([feature_names[i] for i in indices], rotation=45)
            plt.tight_layout()

            results = f"🤖 CONTENT SUCCESS PREDICTION MODEL\n\n"
            results += f"✅ Model trained successfully!\n"
            results += f"📊 Model Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)\n"
            results += f"📈 Training Samples: {len(X_train)}\n"
            results += f"🧪 Test Samples: {len(X_test)}\n\n"
            results += f"🎯 FEATURE IMPORTANCE:\n"

            for i, idx in enumerate(indices[:5]):
                results += f"{i+1}. {feature_names[idx]}: {importances[idx]:.3f}\n"

            results += f"\n📝 Model can predict if content will have Low/Medium/High success based on:\n"
            results += f"• Content metadata (year, duration, genre)\n"
            results += f"• Production details (country, rating)\n"
            results += f"• Content age and type\n"

            plot_img = self.create_plot_image(fig)
            return results, plot_img

        except Exception as e:
            return f"❌ Error training model: {str(e)}", None

    def predict_content_success(self, content_type, country, rating, genre, release_year, duration):
        """Predict success for new content"""
        try:
            if self.ml_model is None:
                return "❌ Please train the model first!"

            # Prepare input data
            input_data = pd.DataFrame({
                'release_year': [release_year],
                'duration_numeric': [duration],
                'content_age': [datetime.now().year - release_year]
            })

            # Encode categorical features
            for col, value in [('type', content_type), ('country', country),
                              ('rating', rating), ('primary_genre', genre)]:
                if col in self.label_encoders:
                    try:
                        encoded_value = self.label_encoders[col].transform([value])[0]
                    except ValueError:
                        # Handle unseen categories
                        encoded_value = 0
                    input_data[col + '_encoded'] = encoded_value

            # Scale features
            input_scaled = self.scaler.transform(input_data)

            # Make prediction
            prediction = self.ml_model.predict(input_scaled)[0]
            probability = self.ml_model.predict_proba(input_scaled)[0]

            # Get class probabilities
            classes = self.ml_model.classes_
            prob_dict = dict(zip(classes, probability))

            result = f"🎯 CONTENT SUCCESS PREDICTION\n\n"
            result += f"📹 Content: {content_type} from {country}\n"
            result += f"🎬 Genre: {genre} | Rating: {rating}\n"
            result += f"📅 Release Year: {release_year} | Duration: {duration} min\n\n"
            result += f"🔮 PREDICTED SUCCESS: {prediction}\n\n"
            result += f"📊 SUCCESS PROBABILITIES:\n"
            for class_name, prob in prob_dict.items():
                result += f"• {class_name}: {prob:.1%}\n"

            # Add recommendations
            if prediction == 'High':
                result += f"\n✅ RECOMMENDATION: This content has strong success potential!"
            elif prediction == 'Medium':
                result += f"\n⚠️ RECOMMENDATION: Moderate success expected. Consider optimization."
            else:
                result += f"\n❌ RECOMMENDATION: High risk. Review content strategy."

            return result

        except Exception as e:
            return f"❌ Error making prediction: {str(e)}"

    def build_recommendation_system(self):
        """Build content-based recommendation system focused on genres"""
        try:
            if self.data is None:
                return "❌ No data loaded!"

            # Prepare content features for similarity
            content_data = self.data.copy()

            # Create feature text with HEAVY emphasis on genres
            content_data['features'] = (
                # Give genres 5x weight by repeating them
                (content_data['listed_in'].fillna('') + ' ') * 5 +
                # Add type 2x weight (Movie/TV Show similarity)
                (content_data['type'].fillna('') + ' ') * 2 +
                # Give rating some weight for age-appropriate content
                (content_data['rating'].fillna('') + ' ') * 2 +
                # Minimal weight to country and director
                content_data['country'].fillna('') + ' ' +
                content_data['director'].fillna('')
            )

            # Create TF-IDF vectors with better parameters for genre focus
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=2000,
                stop_words='english',
                ngram_range=(1, 2),  # Include bigrams for better genre matching
                min_df=1,  # Don't ignore rare genre combinations
                max_df=0.95  # Ignore overly common terms
            )
            self.content_features = self.tfidf_vectorizer.fit_transform(content_data['features'])

            results = f"🎯 GENRE-FOCUSED RECOMMENDATION SYSTEM BUILT\n\n"
            results += f"✅ Enhanced content-based filtering model created!\n"
            results += f"📊 Total content items: {len(content_data)}\n"
            results += f"🔢 Feature dimensions: {self.content_features.shape[1]}\n\n"
            results += f"🎬 RECOMMENDATION PRIORITY (BY WEIGHT):\n"
            results += f"1. 🎭 GENRES & CATEGORIES (5x weight) ⭐⭐⭐⭐⭐\n"
            results += f"2. 📺 Content Type (2x weight) ⭐⭐\n"
            results += f"3. 🔞 Content Rating (2x weight) ⭐⭐\n"
            results += f"4. 🌍 Country (1x weight) ⭐\n"
            results += f"5. 🎬 Director (1x weight) ⭐\n\n"
            results += f"🔍 IMPROVED MATCHING:\n"
            results += f"• Horror movies → Other horror content\n"
            results += f"• Comedy → Similar comedy styles\n"
            results += f"• Drama → Character-driven dramas\n"
            results += f"• Action → High-energy action content\n\n"
            results += f"🚀 System ready! Genre-based recommendations prioritized."

            return results

        except Exception as e:
            return f"❌ Error building recommendation system: {str(e)}"

    def get_recommendations(self, title, num_recommendations=5):
        """Get genre-focused recommendations for a specific title"""
        try:
            if self.content_features is None:
                return "❌ Please build the recommendation system first!"

            # Find the title in dataset
            title_matches = self.data[self.data['title'].str.contains(title, case=False, na=False)]

            if title_matches.empty:
                return f"❌ Title '{title}' not found! Try: {', '.join(self.data['title'].head(5))}"

            # Get the index of the first match
            title_idx = title_matches.index[0]
            title_info = self.data.iloc[title_idx]

            # Calculate similarity scores
            similarity_scores = cosine_similarity(
                self.content_features[title_idx:title_idx+1],
                self.content_features
            ).flatten()

            # Get top similar content (excluding the same title)
            similar_indices = similarity_scores.argsort()[::-1][1:num_recommendations+1]

            # Extract primary genre for better context
            primary_genre = title_info['listed_in'].split(',')[0].strip() if pd.notna(title_info['listed_in']) else 'Unknown'

            recommendations = f"🎯 GENRE-FOCUSED RECOMMENDATIONS FOR: {title_info['title']}\n\n"
            recommendations += f"📹 Original: {title_info['type']} | 🎭 Primary Genre: {primary_genre}\n"
            recommendations += f"🌍 Country: {title_info['country']} | 🔞 Rating: {title_info['rating']}\n\n"
            recommendations += f"🔍 TOP {num_recommendations} SIMILAR CONTENT (Genre Priority):\n\n"

            for i, idx in enumerate(similar_indices, 1):
                similar_content = self.data.iloc[idx]
                similarity_score = similarity_scores[idx]
                similar_primary_genre = similar_content['listed_in'].split(',')[0].strip() if pd.notna(similar_content['listed_in']) else 'Unknown'

                # Add genre match indicator
                genre_match = "🎯" if primary_genre.lower() in similar_content['listed_in'].lower() else "📍"

                recommendations += f"{i}. {genre_match} {similar_content['title']}\n"
                recommendations += f"   📊 Similarity: {similarity_score:.3f} ({similarity_score*100:.1f}%)\n"
                recommendations += f"   🎭 Primary Genre: {similar_primary_genre}\n"
                recommendations += f"   📋 All Genres: {similar_content['listed_in']}\n"
                recommendations += f"   🌍 {similar_content['country']} | 📺 {similar_content['type']} | 🔞 {similar_content['rating']}\n\n"

            # Add genre analysis
            recommendations += f"🎭 GENRE MATCHING ANALYSIS:\n"
            genre_matches = 0
            for idx in similar_indices:
                similar_content = self.data.iloc[idx]
                if primary_genre.lower() in similar_content['listed_in'].lower():
                    genre_matches += 1

            match_percentage = (genre_matches / len(similar_indices)) * 100
            recommendations += f"• Genre Match Rate: {genre_matches}/{len(similar_indices)} ({match_percentage:.0f}%)\n"
            recommendations += f"• Primary Genre: {primary_genre}\n"
            recommendations += f"🎯 = Exact genre match | 📍 = Related content\n"

            return recommendations

        except Exception as e:
            return f"❌ Error getting recommendations: {str(e)}"

    def create_dashboard_summary(self):
        """Create a comprehensive dashboard summary"""
        try:
            if self.data is None:
                return "❌ No data loaded!"

            summary = "📊 NETFLIX DATA ANALYSIS SUMMARY\n"
            summary += "=" * 50 + "\n\n"

            # Basic stats
            summary += f"📈 DATASET OVERVIEW:\n"
            summary += f"• Total Records: {len(self.data):,}\n"
            summary += f"• Total Columns: {len(self.data.columns)}\n"
            if 'date_added' in self.data.columns and not self.data['date_added'].isna().all():
                summary += f"• Date Range: {self.data['date_added'].min().strftime('%Y-%m-%d')} to {self.data['date_added'].max().strftime('%Y-%m-%d')}\n"
            summary += "\n"

            # Content breakdown
            if 'type' in self.data.columns:
                content_counts = self.data['type'].value_counts()
                summary += f"🎬 CONTENT BREAKDOWN:\n"
                for content_type, count in content_counts.items():
                    percentage = (count / len(self.data)) * 100
                    summary += f"• {content_type}: {count:,} ({percentage:.1f}%)\n"
                summary += "\n"

            # Top categories
            summary += f"🌟 TOP CATEGORIES:\n"
            if 'rating' in self.data.columns:
                summary += f"• Most Popular Rating: {self.data['rating'].mode().iloc[0]}\n"
            if 'country' in self.data.columns:
                summary += f"• Top Country: {self.data['country'].mode().iloc[0]}\n"
            if 'year_added' in self.data.columns and not self.data['year_added'].isna().all():
                summary += f"• Most Active Year: {int(self.data['year_added'].mode().iloc[0])}\n"
            summary += "\n"

            # Diversity metrics
            summary += f"🌍 DIVERSITY METRICS:\n"
            if 'country' in self.data.columns:
                summary += f"• Unique Countries: {self.data['country'].nunique()}\n"
            if 'director' in self.data.columns:
                summary += f"• Unique Directors: {self.data['director'].nunique()}\n"
            if 'rating' in self.data.columns:
                summary += f"• Unique Ratings: {self.data['rating'].nunique()}\n"
            if 'listed_in' in self.data.columns:
                summary += f"• Genre Categories: {self.data['listed_in'].nunique()}\n"

            return summary

        except Exception as e:
            return f"❌ Error creating summary: {str(e)}"

# Module-level analyzer instance; the Gradio event handlers wired up in
# create_interface() are bound to this object's methods.
analyzer = NetflixAnalyzer()

# Create improved Gradio interface with separate buttons
def create_interface():
    """Build the Gradio Blocks dashboard and wire it to the module-level ``analyzer``.

    The layout consists of one tab per feature (data overview, content and
    rating analysis, countries, directors, genres, title lookup, success
    prediction, recommendations). Every button is connected to the matching
    ``analyzer`` method at the end of the function.

    Returns:
        The configured ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(title="Netflix Data Analysis Dashboard", theme=gr.themes.Soft()) as demo:

        gr.Markdown("# 🎬 Netflix Data Analysis Dashboard")
        gr.Markdown("Interactive analysis of Netflix content data with enhanced genre and director analysis")

        with gr.Tab("📊 Data Overview"):
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(label="Upload Netflix CSV Dataset (Optional)", file_types=[".csv"])
                    with gr.Row():
                        load_btn = gr.Button("Load Data", variant="primary")
                        clean_btn = gr.Button("Clean Data", variant="secondary")
                    load_output = gr.Textbox(label="Status", lines=3, max_lines=10, show_label=True, container=True)

                with gr.Column():
                    get_summary_btn = gr.Button("Generate Summary Report", variant="primary")
                    summary_output = gr.Textbox(label="Dataset Summary", lines=25, max_lines=50, show_label=True, container=True)

            # Sample Data Table Section
            gr.Markdown("## 📋 Sample Data Overview")
            sample_table_btn = gr.Button("🔍 View Sample Data Table", variant="secondary")
            sample_table_output = gr.HTML(label="Sample Data Table")

        with gr.Tab("📈 Content & Rating Analysis"):
            gr.Markdown("## 📊 Content Distribution & Rating Analysis")
            gr.Markdown("**Analyze content types and ratings separately with dedicated buttons**")

            # Two separate buttons in one row
            with gr.Row():
                content_dist_btn = gr.Button("🎬 Analyze Content Distribution", variant="primary", size="lg")
                ratings_dist_btn = gr.Button("⭐ Analyze Ratings Distribution", variant="secondary", size="lg")

            # Two columns for separate outputs
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 🎬 Content Distribution Results")
                    content_insights = gr.Textbox(label="Content Distribution Analysis", lines=15, max_lines=25, show_label=True, container=True)
                    content_plot = gr.Image(label="Content Distribution Chart", show_label=True)

                with gr.Column():
                    gr.Markdown("### ⭐ Ratings Distribution Results")
                    ratings_insights = gr.Textbox(label="Ratings Distribution Analysis", lines=15, max_lines=25, show_label=True, container=True)
                    ratings_plot = gr.Image(label="Ratings Distribution Chart", show_label=True)

        with gr.Tab("🌍 Countries Analysis"):
            gr.Markdown("## 🌍 Analyze Content by Countries")
            countries_btn = gr.Button("🚀 Analyze Countries Distribution", variant="primary", size="lg")

            with gr.Row():
                with gr.Column():
                    countries_insights = gr.Textbox(label="Countries Analysis Insights", lines=15, max_lines=25, show_label=True, container=True)
                with gr.Column():
                    countries_plot = gr.Image(label="Countries Distribution Chart", show_label=True)

        with gr.Tab("🎭 Directors Analysis (Enhanced)"):
            gr.Markdown("## 🎭 Analyze Top 10 Directors")
            gr.Markdown("**Enhanced analysis showing top 10 directors with most Netflix content**")
            directors_btn = gr.Button("🚀 Analyze Top 10 Directors Distribution", variant="primary", size="lg")

            with gr.Row():
                with gr.Column():
                    directors_insights = gr.Textbox(label="Top 10 Directors Analysis Insights", lines=15, max_lines=25, show_label=True, container=True)
                with gr.Column():
                    directors_plot = gr.Image(label="Top 10 Directors Distribution Chart", show_label=True)

        with gr.Tab("🎬 Genres Analysis (Enhanced)"):
            gr.Markdown("## 🎬 Enhanced Genres Analysis")
            gr.Markdown("**Advanced genre analysis with genre count per content, top genre identification, and distribution statistics**")
            genres_btn = gr.Button("🚀 Analyze Enhanced Genres Distribution", variant="primary", size="lg")

            with gr.Row():
                with gr.Column():
                    genres_insights = gr.Textbox(label="Enhanced Genres Analysis Insights", lines=20, max_lines=30, show_label=True, container=True)
                with gr.Column():
                    genres_plot = gr.Image(label="Enhanced Genres Distribution Chart", show_label=True)

        with gr.Tab("🔍 Movie Details Lookup (NEW)"):
            gr.Markdown("## 🔍 Movie Details Lookup with Genre Count & Duration Analysis")
            gr.Markdown("**NEW FEATURE: Get detailed information about any movie including genre count and duration extraction**")

            with gr.Row():
                with gr.Column():
                    movie_title_input = gr.Textbox(
                        label="Enter Movie/Show Title",
                        placeholder="e.g., Bird Box, Stranger Things, Roma, The Irishman",
                        lines=1
                    )
                    get_movie_details_btn = gr.Button("🎬 Get Movie Details", variant="primary", size="lg")

                with gr.Column():
                    gr.Markdown("### 📝 What This Feature Does:")
                    gr.Markdown("• **Genre Count**: Counts total genres per movie")
                    gr.Markdown("• **Duration Extraction**: Extracts duration in minutes")
                    gr.Markdown("• **Content Analysis**: Detailed movie information")
                    gr.Markdown("• **Similar Content**: Finds related movies")

            movie_details_output = gr.Textbox(
                label="Movie Details & Analysis",
                lines=25,
                max_lines=40,
                show_label=True,
                container=True
            )

        with gr.Tab("🤖 ML: Success Prediction"):
            gr.Markdown("## 🤖 Machine Learning: Content Success Prediction")

            with gr.Row():
                with gr.Column():
                    train_model_btn = gr.Button("🚀 Train Success Prediction Model", variant="primary", size="lg")
                    ml_results = gr.Textbox(label="Model Training Results", lines=20, max_lines=30, show_label=True, container=True)
                with gr.Column():
                    ml_plot = gr.Image(label="Feature Importance Chart", show_label=True)

            gr.Markdown("### 🔮 Predict Success for New Content")
            with gr.Row():
                with gr.Column():
                    pred_type = gr.Dropdown(["Movie", "TV Show"], label="Content Type", value="Movie")
                    pred_country = gr.Dropdown(["United States", "United Kingdom", "Canada", "India", "Spain", "France"],
                                             label="Country", value="United States")
                    pred_rating = gr.Dropdown(["PG", "PG-13", "R", "TV-MA", "TV-14", "TV-PG"],
                                            label="Rating", value="PG-13")
                with gr.Column():
                    pred_genre = gr.Dropdown(["Action & Adventure", "Dramas", "Comedies", "Horror Movies", "Documentaries"],
                                           label="Primary Genre", value="Action & Adventure")
                    # step=1: without an explicit step the slider derives a
                    # fractional one from its range, so handlers would receive
                    # non-integer years/minutes.
                    pred_year = gr.Slider(1990, 2024, value=2023, step=1, label="Release Year")
                    pred_duration = gr.Slider(60, 300, value=120, step=1, label="Duration (minutes)")

            predict_btn = gr.Button("🎯 Predict Content Success", variant="secondary")
            prediction_result = gr.Textbox(label="Prediction Results", lines=15, show_label=True, container=True)

        with gr.Tab("🎯 ML: Recommendations"):
            gr.Markdown("## 🎯 Machine Learning: Content Recommendation System")

            build_rec_btn = gr.Button("🚀 Build Recommendation System", variant="primary", size="lg")
            rec_results = gr.Textbox(label="Recommendation System Results", lines=15, max_lines=25, show_label=True, container=True)

            gr.Markdown("### 🔍 Get Content Recommendations")
            with gr.Row():
                with gr.Column():
                    title_input = gr.Textbox(label="Enter Content Title", placeholder="e.g., Stranger Things, Bird Box, Roma")
                    # step=1: the value is used as a count (and slice bound),
                    # so it must be a whole number.
                    num_recs = gr.Slider(3, 10, value=5, step=1, label="Number of Recommendations")
                with gr.Column():
                    get_rec_btn = gr.Button("🎬 Get Recommendations", variant="secondary")
                    rec_output = gr.Textbox(label="Recommendations", lines=20, show_label=True, container=True)

        # Event handlers: data loading / overview
        load_btn.click(analyzer.load_data, inputs=[file_input], outputs=[load_output])
        clean_btn.click(analyzer.clean_data, outputs=[load_output])
        get_summary_btn.click(analyzer.create_dashboard_summary, outputs=[summary_output])
        sample_table_btn.click(analyzer.get_sample_data_table, outputs=[sample_table_output])

        # Content and ratings each have their own button and their own outputs
        content_dist_btn.click(analyzer.content_distribution_analysis, outputs=[content_insights, content_plot])
        ratings_dist_btn.click(analyzer.ratings_distribution_analysis, outputs=[ratings_insights, ratings_plot])

        countries_btn.click(analyzer.countries_analysis, outputs=[countries_insights, countries_plot])
        directors_btn.click(analyzer.directors_analysis, outputs=[directors_insights, directors_plot])
        genres_btn.click(analyzer.genres_analysis, outputs=[genres_insights, genres_plot])

        # Movie details lookup
        get_movie_details_btn.click(analyzer.get_movie_details, inputs=[movie_title_input], outputs=[movie_details_output])

        # ML event handlers
        train_model_btn.click(analyzer.train_success_prediction_model, outputs=[ml_results, ml_plot])
        predict_btn.click(analyzer.predict_content_success,
                         inputs=[pred_type, pred_country, pred_rating, pred_genre, pred_year, pred_duration],
                         outputs=[prediction_result])
        build_rec_btn.click(analyzer.build_recommendation_system, outputs=[rec_results])
        get_rec_btn.click(analyzer.get_recommendations, inputs=[title_input, num_recs], outputs=[rec_output])

    return demo

# Entry point: print the startup banner, then build and launch the dashboard.
if __name__ == "__main__":
    print(
        "🚀 Starting Enhanced Netflix Data Analysis Dashboard...",
        "📊 Sample data included - you can test immediately!",
        "🎭 NEW: Enhanced Genres Analysis with genre count per content and top genre",
        "🎬 NEW: Top 10 Directors Analysis",
        "🔍 NEW: Movie Details Lookup with Genre Count & Duration Analysis",
        sep="\n",
    )

    # NOTE(review): share=True requests a public share link and
    # server_name="0.0.0.0" presumably binds all network interfaces —
    # confirm this exposure is intended outside a notebook/demo setting.
    demo = create_interface()
    demo.launch(share=True, server_name="0.0.0.0")


🚀 Starting Enhanced Netflix Data Analysis Dashboard...
📊 Sample data included - you can test immediately!
🎭 NEW: Enhanced Genres Analysis with genre count per content and top genre
🎬 NEW: Top 10 Directors Analysis
🔍 NEW: Movie Details Lookup with Genre Count & Duration Analysis
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://394142ec5921e5d605.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
