In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

#  Data Loading and Exploration
def load_and_explore_data(file_path):
    """
    Load the dataset and perform initial exploration
    """
    try:
        df = pd.read_csv('Downloads/movies.csv')
        print(f"Dataset loaded successfully from {file_path}")
    except Exception as e:
        print(f"Error loading data: {e}")
        print("Creating a sample dataset for testing...")
        
        # Create a small sample dataset for testing
        data = {
            'Name': ['Movie 1', 'Movie 2', 'Movie 3', 'Movie 4', 'Movie 5'],
            'Year': [2020, 2019, 2021, 2018, 2022],
            'Duration': [120, 105, 135, 95, 110],
            'Genre': ['Action', 'Comedy', 'Drama', 'Action, Adventure', 'Comedy, Romance'],
            'Rating': [8.5, 7.2, 8.9, 6.8, 7.5],
            'Votes': [10000, 5000, 12000, 3000, 6000],
            'Director': ['Director 1', 'Director 2', 'Director 3', 'Director 1', 'Director 4'],
            'Actor 1': ['Actor A', 'Actor B', 'Actor C', 'Actor D', 'Actor B'],
            'Actor 2': ['Actor E', 'Actor F', 'Actor G', 'Actor H', 'Actor I'],
            'Actor 3': ['Actor J', 'Actor K', 'Actor L', 'Actor M', 'Actor N']
        }
        df = pd.DataFrame(data)
    
    print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns")
    print("\nFirst 5 records:")
    print(df.head())
    
    print("\nData Information:")
    print(df.info())
    
    print("\nStatistical Summary:")
    print(df.describe(include='all'))
    
    print("\nMissing Values:")
    print(df.isnull().sum())
    
    return df

#  Data Preprocessing
def preprocess_data(df):
    """
    Preprocess the data including handling missing values and encoding categorical variables
    """
    processed_df = df.copy()
    
    # Print data types before conversion
    print("\nData types before conversion:")
    print(processed_df.dtypes)
    
    #  Convert Year to numeric, forcing non-numeric values to NaN
    if 'Year' in processed_df.columns:
        processed_df['Year'] = pd.to_numeric(processed_df['Year'], errors='coerce')
    
    #  Convert Duration to numeric
    if 'Duration' in processed_df.columns:
        if processed_df['Duration'].dtype == 'object':
            # Extract numeric values from duration strings (e.g., "109 min" -> 109)
            processed_df['Duration'] = processed_df['Duration'].str.extract(r'(\d+)').astype(float)
        else:
            processed_df['Duration'] = pd.to_numeric(processed_df['Duration'], errors='coerce')
    
    #  Convert Rating to numeric
    if 'Rating' in processed_df.columns:
        processed_df['Rating'] = pd.to_numeric(processed_df['Rating'], errors='coerce')
    
    #  Convert Votes to numeric
    if 'Votes' in processed_df.columns:
        processed_df['Votes'] = pd.to_numeric(processed_df['Votes'], errors='coerce')
    
    # Print data types after conversion
    print("\nData types after conversion:")
    print(processed_df.dtypes)
    
    # Print value ranges to check for outliers
    print("\nValue ranges for numeric columns:")
    for col in ['Year', 'Duration', 'Rating', 'Votes']:
        if col in processed_df.columns:
            print(f"{col}: min={processed_df[col].min()}, max={processed_df[col].max()}, mean={processed_df[col].mean()}")
    
    #  Handle missing values
    # For numerical columns
    num_cols = ['Year', 'Duration', 'Rating', 'Votes']
    for col in num_cols:
        if col in processed_df.columns:
            # Fill with median
            median_val = processed_df[col].median()
            processed_df[col] = processed_df[col].fillna(median_val)
    
    # For categorical columns
    cat_cols = ['Name', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
    for col in cat_cols:
        if col in processed_df.columns:
            # Fill with "Unknown"
            processed_df[col] = processed_df[col].fillna("Unknown")
    
    #  Process Genre column (might contain multiple genres)
    if 'Genre' in processed_df.columns:
        # Extract primary genre if multiple genres are present
        processed_df['Primary_Genre'] = processed_df['Genre'].apply(
            lambda x: x.split(',')[0].strip() if isinstance(x, str) else "Unknown"
        )
    
    return processed_df

#  Feature Engineering
def engineer_features(df):
    """
    Create new features that might help prediction performance
    """
    featured_df = df.copy()
    
    #  Extract decade from year
    if 'Year' in featured_df.columns:
        featured_df['Decade'] = (featured_df['Year'] // 10) * 10
    
    #  Calculate director success metrics
    if 'Director' in featured_df.columns and 'Rating' in featured_df.columns:
        # Average rating by director
        director_avg = featured_df.groupby('Director')['Rating'].mean().reset_index()
        director_avg.columns = ['Director', 'Director_Avg_Rating']
        
        # Number of movies by director
        director_count = featured_df.groupby('Director').size().reset_index()
        director_count.columns = ['Director', 'Director_Movie_Count']
        
        # Merge these features back
        featured_df = pd.merge(featured_df, director_avg, on='Director', how='left')
        featured_df = pd.merge(featured_df, director_count, on='Director', how='left')
    
    #  Calculate actor success metrics
    for actor_col in ['Actor 1', 'Actor 2', 'Actor 3']:
        if actor_col in featured_df.columns and 'Rating' in featured_df.columns:
            # Average rating by actor
            actor_avg = featured_df.groupby(actor_col)['Rating'].mean().reset_index()
            actor_avg.columns = [actor_col, f'{actor_col.replace(" ", "_")}_Avg_Rating']
            
            # Merge back
            featured_df = pd.merge(featured_df, actor_avg, on=actor_col, how='left')
    
    #  Create a "Star Power" feature - average of the three actors' average ratings
    actor_rating_cols = []
    for actor in ['Actor 1', 'Actor 2', 'Actor 3']:
        rating_col = f'{actor.replace(" ", "_")}_Avg_Rating'
        if rating_col in featured_df.columns:
            actor_rating_cols.append(rating_col)
    
    if actor_rating_cols:
        featured_df['Star_Power'] = featured_df[actor_rating_cols].mean(axis=1)
    
    #  Calculate average rating by genre
    if 'Primary_Genre' in featured_df.columns and 'Rating' in featured_df.columns:
        genre_avg = featured_df.groupby('Primary_Genre')['Rating'].mean().reset_index()
        genre_avg.columns = ['Primary_Genre', 'Genre_Avg_Rating']
        featured_df = pd.merge(featured_df, genre_avg, on='Primary_Genre', how='left')
    
    #  Calculate average rating by decade
    if 'Decade' in featured_df.columns and 'Rating' in featured_df.columns:
        decade_avg = featured_df.groupby('Decade')['Rating'].mean().reset_index()
        decade_avg.columns = ['Decade', 'Decade_Avg_Rating']
        featured_df = pd.merge(featured_df, decade_avg, on='Decade', how='left')
    
    #  Calculate log transform of votes to handle skewness
    if 'Votes' in featured_df.columns:
        featured_df['Log_Votes'] = np.log1p(featured_df['Votes'])
    
    #  Duration categories
    if 'Duration' in featured_df.columns:
        bins = [0, 90, 120, 150, 1000]
        labels = ['Short', 'Medium', 'Long', 'Very Long']
        featured_df['Duration_Category'] = pd.cut(featured_df['Duration'], bins=bins, labels=labels, right=False)
    
    return featured_df

# Prepare data for modeling
def prepare_for_modeling(df, target_col='Rating'):
    """
    Prepare the data for modeling by splitting into features and target
    and handling categorical variables
    """
    # Identify features to keep, excluding the original target
    feature_df = df.copy()
    
    # Check if target exists and has valid data
    if target_col not in feature_df.columns:
        raise ValueError(f"Target column '{target_col}' not found in dataframe")
    
    # Check target variable range and stats
    print(f"\nTarget variable '{target_col}' stats:")
    print(f"Count: {feature_df[target_col].count()}")
    print(f"Min: {feature_df[target_col].min()}")
    print(f"Max: {feature_df[target_col].max()}")
    print(f"Mean: {feature_df[target_col].mean()}")
    
    # Exclude the target and any redundant columns
    columns_to_drop = [target_col, 'Name']  # Add other columns that shouldn't be used for prediction
    
    # Only drop columns that exist
    cols_to_drop = [col for col in columns_to_drop if col in feature_df.columns]
    
    # Features and target
    X = feature_df.drop(cols_to_drop, axis=1)
    y = feature_df[target_col]
    
    # Identify numerical and categorical columns
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    print(f"Numerical features: {numerical_cols}")
    print(f"Categorical features: {categorical_cols}")
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test, numerical_cols, categorical_cols

#  Build and evaluate the model
def build_and_evaluate_model(X_train, X_test, y_train, y_test, numerical_cols, categorical_cols):
    """
    Build, train and evaluate the prediction model
    """
    # Create preprocessor
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    
    # Combine transformers in a column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )
    
    # Create and evaluate different models
    models = {
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'Linear Regression': LinearRegression()
    }
    
    results = {}
    
    for name, model in models.items():
        # Create a pipeline with preprocessing and model
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        
        # Fit the model
        print(f"\nTraining {name}...")
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_pred = pipeline.predict(X_test)
        
        # Evaluate
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        results[name] = {
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
            'R2': r2,
            'Pipeline': pipeline
        }
        
        print(f"{name} Results:")
        print(f"  MSE: {mse:.4f}")
        print(f"  RMSE: {rmse:.4f}")
        print(f"  MAE: {mae:.4f}")
        print(f"  R-squared: {r2:.4f}")
    
    # Find the best model
    best_model_name = max(results, key=lambda k: results[k]['R2'])
    best_model = results[best_model_name]['Pipeline']
    
    print(f"\nBest model: {best_model_name} with R2: {results[best_model_name]['R2']:.4f}")
    
    return best_model, results

#  Feature Importance Analysis
def analyze_feature_importance(model, numerical_cols, categorical_cols):
    """
    Analyze and visualize feature importance from the best model
    """
    # Check if the model has feature_importances_ attribute (tree-based models)
    if hasattr(model[-1], 'feature_importances_'):
        try:
            # Get feature names after preprocessing
            preprocessor = model[0]
            
            # Get transformed feature names
            transformed_numerical = numerical_cols
            
            # For categorical features, get all the one-hot encoded column names
            ohe = preprocessor.transformers_[1][1].named_steps['onehot']
            transformed_categorical = ohe.get_feature_names_out(categorical_cols).tolist()
            
            all_features = transformed_numerical + transformed_categorical
            
            # Get feature importances
            importances = model[-1].feature_importances_
            
            # Ensure lengths match
            if len(all_features) == len(importances):
                # Create a DataFrame for better visualization
                feature_importance = pd.DataFrame({
                    'Feature': all_features,
                    'Importance': importances
                }).sort_values('Importance', ascending=False)
                
                print("\nFeature Importance:")
                print(feature_importance.head(20))  # Show top 20 features
                
                # Plot
                plt.figure(figsize=(10, 8))
                sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
                plt.title('Top 15 Feature Importances')
                plt.tight_layout()
                plt.savefig('feature_importance.png')
                plt.close()
                
                return feature_importance
            else:
                print(f"Feature length mismatch: {len(all_features)} features vs {len(importances)} importances")
                return None
        except Exception as e:
            print(f"Error analyzing feature importance: {str(e)}")
            return None
    else:
        print("Model doesn't have feature_importances_ attribute")
        return None

#  Hyperparameter Tuning
def tune_best_model(X_train, y_train, best_model_name, numerical_cols, categorical_cols):
    """
    Perform hyperparameter tuning on the best model
    """
    # Create the preprocessor
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )
    
    # Define the parameter grid based on the best model
    if best_model_name == 'Random Forest':
        model = RandomForestRegressor(random_state=42)
        param_grid = {
            'model__n_estimators': [50, 100],
            'model__max_depth': [None, 20],
            'model__min_samples_split': [2, 5]
        }
    elif best_model_name == 'Gradient Boosting':
        model = GradientBoostingRegressor(random_state=42)
        param_grid = {
            'model__n_estimators': [50, 100],
            'model__learning_rate': [0.01, 0.1],
            'model__max_depth': [3, 5]
        }
    else:  # Linear Regression - not much to tune
        model = LinearRegression()
        param_grid = {}
    
    # Create the pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Skip grid search tuning if Linear Regression but still fit the model
    if best_model_name == 'Linear Regression':
        print("Linear Regression doesn't require hyperparameter tuning")
        pipeline.fit(X_train, y_train)  # Make sure to fit the model
        return pipeline
    
    # Perform grid search
    print(f"\nPerforming hyperparameter tuning for {best_model_name}...")
    grid_search = GridSearchCV(
        pipeline, 
        param_grid, 
        cv=3,  # Reduced from 5 for speed
        scoring='neg_mean_squared_error',
        n_jobs=-1  # Use all available cores
    )
    
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {-grid_search.best_score_:.4f} (MSE)")
    
    return grid_search.best_estimator_

#  Estimate movie rating
def estimate_movie_rating(model, featured_df, new_movie_info):
    """
    Estimate a movie rating based on movie information
    """
    try:
        # Create a DataFrame with a single row for the new movie
        new_movie_df = pd.DataFrame([new_movie_info])
        
        # Process the new movie data
        processed_df = new_movie_df.copy()
        
        # Add all columns from featured_df that are missing in processed_df
        for col in featured_df.columns:
            if col not in processed_df.columns:
                # Add with appropriate default value based on data type
                if col in ['Year', 'Duration', 'Votes']:
                    processed_df[col] = featured_df[col].median()
                else:
                    if featured_df[col].dtype.name == 'category':
                        processed_df[col] = featured_df[col].mode().iloc[0] if not featured_df[col].mode().empty else "Unknown"
                    elif pd.api.types.is_numeric_dtype(featured_df[col]):
                        processed_df[col] = featured_df[col].median()
                    else:
                        processed_df[col] = featured_df[col].mode().iloc[0] if not featured_df[col].mode().empty else "Unknown"
        
        # Feature engineering for the new movie
        # Extract primary genre
        if 'Genre' in processed_df.columns and 'Primary_Genre' not in processed_df.columns:
            processed_df['Primary_Genre'] = processed_df['Genre'].apply(
                lambda x: x.split(',')[0].strip() if isinstance(x, str) else "Unknown"
            )
        
        # Add decade from year
        if 'Year' in processed_df.columns and 'Decade' not in processed_df.columns:
            processed_df['Decade'] = (processed_df['Year'] // 10) * 10
        
        # Add Log_Votes
        if 'Votes' in processed_df.columns and 'Log_Votes' not in processed_df.columns:
            processed_df['Log_Votes'] = np.log1p(processed_df['Votes'])
        
        # Calculate duration category
        if 'Duration' in processed_df.columns and 'Duration_Category' not in processed_df.columns:
            bins = [0, 90, 120, 150, 1000]
            labels = ['Short', 'Medium', 'Long', 'Very Long']
            processed_df['Duration_Category'] = pd.cut(processed_df['Duration'], bins=bins, labels=labels, right=False)
        
        # If there are features that the model expects but we don't have, we need to drop them
        if 'Rating' in processed_df.columns:
            processed_df = processed_df.drop('Rating', axis=1)
        
        if 'Name' in processed_df.columns:
            processed_df = processed_df.drop('Name', axis=1)
        
        # Make the prediction
        prediction = model.predict(processed_df)
        
        return round(prediction[0], 1)  # Round to 1 decimal place
    
    except Exception as e:
        print(f"Error estimating rating: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

#  Create interactive GUI interface
def create_interactive_rating_estimator(model, featured_df):
    """
    Creates an interactive interface for estimating movie ratings
    """
    try:
        import tkinter as tk
        from tkinter import ttk, messagebox
    except ImportError:
        print("Tkinter not available. Using console interface instead.")
        return demonstrate_rating_estimator(model, featured_df)
    
    # Create the main window
    root = tk.Tk()
    root.title("Movie Rating Estimator")
    root.geometry("600x500")
    root.configure(padx=20, pady=20)
    
    # Create style
    style = ttk.Style()
    style.configure("TLabel", font=("Arial", 11))
    style.configure("TButton", font=("Arial", 11))
    style.configure("TEntry", font=("Arial", 11))
    
    # Create a frame
    frame = ttk.Frame(root, padding=10)
    frame.pack(fill="both", expand=True)
    
    # Create variables to store input
    movie_name = tk.StringVar()
    movie_year = tk.StringVar()
    movie_duration = tk.StringVar()
    movie_genre = tk.StringVar()
    movie_director = tk.StringVar()
    movie_actor1 = tk.StringVar()
    movie_actor2 = tk.StringVar()
    movie_actor3 = tk.StringVar()
    
    # Create input fields
    ttk.Label(frame, text="Movie Title:").grid(column=0, row=0, sticky="w", pady=5)
    ttk.Entry(frame, width=40, textvariable=movie_name).grid(column=1, row=0, pady=5)
    
    ttk.Label(frame, text="Release Year:").grid(column=0, row=1, sticky="w", pady=5)
    ttk.Entry(frame, width=40, textvariable=movie_year).grid(column=1, row=1, pady=5)
    
    ttk.Label(frame, text="Duration (minutes):").grid(column=0, row=2, sticky="w", pady=5)
    ttk.Entry(frame, width=40, textvariable=movie_duration).grid(column=1, row=2, pady=5)
    
    ttk.Label(frame, text="Genre(s) (comma separated):").grid(column=0, row=3, sticky="w", pady=5)
    ttk.Entry(frame, width=40, textvariable=movie_genre).grid(column=1, row=3, pady=5)
    
    ttk.Label(frame, text="Director:").grid(column=0, row=4, sticky="w", pady=5)
    ttk.Entry(frame, width=40, textvariable=movie_director).grid(column=1, row=4, pady=5)
    
    ttk.Label(frame, text="Lead Actor/Actress:").grid(column=0, row=5, sticky="w", pady=5)
    ttk.Entry(frame, width=40, textvariable=movie_actor1).grid(column=1, row=5, pady=5)
    
    ttk.Label(frame, text="Supporting Actor/Actress 1:").grid(column=0, row=6, sticky="w", pady=5)
    ttk.Entry(frame, width=40, textvariable=movie_actor2).grid(column=1, row=6, pady=5)
    
    ttk.Label(frame, text="Supporting Actor/Actress 2:").grid(column=0, row=7, sticky="w", pady=5)
    ttk.Entry(frame, width=40, textvariable=movie_actor3).grid(column=1, row=7, pady=5)
    
    # Create result labels
    result_frame = ttk.Frame(frame, padding=(0, 10, 0, 0))
    result_frame.grid(column=0, row=9, columnspan=2, sticky="ew")
    
    rating_label = ttk.Label(result_frame, text="", font=("Arial", 12, "bold"))
    rating_label.pack(pady=5)
    
    interpretation_label = ttk.Label(result_frame, text="")
    interpretation_label.pack(pady=5)
    
    # Define the predict function
    def predict_rating():
        try:
            # Validate inputs
            name = movie_name.get().strip()
            if not name:
                messagebox.showerror("Error", "Please enter a movie title")
                return
            
            try:
                year = int(movie_year.get().strip())
                if not (1900 <= year <= 2030):
                    messagebox.showerror("Error", "Year must be between 1900 and 2030")
                    return
            except ValueError:
                messagebox.showerror("Error", "Year must be a valid number")
                return
            
            try:
                duration = int(movie_duration.get().strip())
                if not (30 <= duration <= 300):
                    messagebox.showerror("Error", "Duration must be between 30 and 300 minutes")
                    return
            except ValueError:
                messagebox.showerror("Error", "Duration must be a valid number")
                return
            
            genre = movie_genre.get().strip()
            if not genre:
                messagebox.showerror("Error", "Please enter at least one genre")
                return
            
            director = movie_director.get().strip() or "Unknown"
            actor1 = movie_actor1.get().strip() or "Unknown"
            actor2 = movie_actor2.get().strip() or "Unknown"
            actor3 = movie_actor3.get().strip() or "Unknown"
            
            # Create movie info dictionary
            movie_info = {
                'Name': name,
                'Year': year,
                'Duration': duration,
                'Genre': genre,
                'Director': director,
                'Actor 1': actor1,
                'Actor 2': actor2,
                'Actor 3': actor3,
                'Votes': 5000  # Default value
            }
            
            # Make prediction
            predicted_rating = estimate_movie_rating(model, featured_df, movie_info)
            
            if predicted_rating is not None:
                # Update the rating label
                rating_label.config(text=f"Predicted Rating: {predicted_rating}/10")
                
                # Update interpretation
                if predicted_rating >= 8.0:
                    interpretation = "This movie is predicted to be excellent!"
                elif predicted_rating >= 7.0:
                    interpretation = "This movie is predicted to be very good."
                elif predicted_rating >= 6.0:
                    interpretation = "This movie is predicted to be above average."
                elif predicted_rating >= 5.0:
                    interpretation = "This movie is predicted to be average."
                else:
                    interpretation = "This movie is predicted to be below average."
                
                interpretation_label.config(text=interpretation)
            else:
                rating_label.config(text="Unable to predict rating")
                interpretation_label.config(text="Please check console for debug information")
        
        except Exception as e:
            messagebox.showerror("Error", f"An error occurred: {str(e)}")
            print(f"Error: {str(e)}")
            import traceback
            traceback.print_exc()
    
    # Create predict button
    predict_button = ttk.Button(frame, text="Predict Rating", command=predict_rating)
    predict_button.grid(column=0, row=8, columnspan=2, pady=15)
    
    # Create reset button
    def reset_form():
        movie_name.set("")
        movie_year.set("")
        movie_duration.set("")
        movie_genre.set("")
        movie_director.set("")
        movie_actor1.set("")
        movie_actor2.set("")
        movie_actor3.set("")
        rating_label.config(text="")
        interpretation_label.config(text="")
    
    reset_button = ttk.Button(frame, text="Reset", command=reset_form)
    reset_button.grid(column=0, row=10, columnspan=2, pady=5)
    
    # Create a sample movie button
    def load_sample():
        movie_name.set("The Awesome Adventure")
        movie_year.set("2024")
        movie_duration.set("142")
        movie_genre.set("Action, Adventure")
        movie_director.set("Steven Spielberg")
        movie_actor1.set("Tom Hanks")
        movie_actor2.set("Emma Stone")
        movie_actor3.set("Denzel Washington")
    
    sample_button = ttk.Button(frame, text="Load Sample Movie", command=load_sample)
    sample_button.grid(column=0, row=11, columnspan=2, pady=5)
    
    # Start the GUI
    root.mainloop()

#  Console demo
def demonstrate_rating_estimator(model, featured_df):
    """
    Console-based demonstration of the rating estimator
    """
    print("\n=== Movie Rating Estimator ===")
    print("Enter information about a movie to predict its rating")
    
    try:
        # Input movie information
        name = input("Movie Title: ")
        
        year = None
        while year is None:
            try:
                year = int(input("Release Year: "))
                if not (1900 <= year <= 2030):
                    print("Year must be between 1900 and 2030")
                    year = None
            except ValueError:
                print("Please enter a valid year as a number")
        
        duration = None
        while duration is None:
            try:
                duration = int(input("Duration (minutes): "))
                if not (30 <= duration <= 300):
                    print("Duration must be between 30 and 300 minutes")
                    duration = None
            except ValueError:
                print("Please enter a valid duration as a number")
        
        genre = input("Genre(s) (comma separated): ")
        director = input("Director: ") or "Unknown"
        actor1 = input("Lead Actor/Actress: ") or "Unknown"
        actor2 = input("Supporting Actor/Actress 1: ") or "Unknown"
        actor3 = input("Supporting Actor/Actress 2: ") or "Unknown"
        
        # Create movie info dictionary
        movie_info = {
            'Name': name,
            'Year': year,
            'Duration': duration,
            'Genre': genre,
            'Director': director,
            'Actor 1': actor1,
            'Actor 2': actor2,
            'Actor 3': actor3,
            'Votes': 5000  # Default value
        }
        
        # Make prediction
        predicted_rating = estimate_movie_rating(model, featured_df, movie_info)
        
        if predicted_rating is not None:
            print(f"\nPredicted Rating: {predicted_rating}/10")
            
            # Add interpretation
            if predicted_rating >= 8.0:
                interpretation = "This movie is predicted to be excellent!"
            elif predicted_rating >= 7.0:
                interpretation = "This movie is predicted to be very good."
            elif predicted_rating >= 6.0:
                interpretation = "This movie is predicted to be above average."
            elif predicted_rating >= 5.0:
                interpretation = "This movie is predicted to be average."
            else:
                interpretation = "This movie is predicted to be below average."
            
            print(interpretation)
        else:
            print("Unable to predict rating. Please check console for debug information.")
    
    except Exception as e:
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()
    
    print("\nThanks for using the Movie Rating Estimator!")

#  Visualize the predictions
def visualize_predictions(model, X_test, y_test):
    """
    Create visualizations of the model's predictions
    """
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Create a dataframe with actual and predicted values
    results_df = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred,
        'Error': y_test - y_pred
    })
    
    # 1. Actual vs Predicted scatterplot
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.6)
    
    # Add perfect prediction line
    min_val = min(min(y_test), min(y_pred))
    max_val = max(max(y_test), max(y_pred))
    plt.plot([min_val, max_val], [min_val, max_val], 'r--')
    
    plt.xlabel('Actual Rating')
    plt.ylabel('Predicted Rating')
    plt.title('Actual vs Predicted Ratings')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('actual_vs_predicted.png')
    plt.close()
    
    # 2. Error distribution histogram
    plt.figure(figsize=(10, 6))
    plt.hist(results_df['Error'], bins=20, alpha=0.7, color='blue', edgecolor='black')
    plt.axvline(x=0, color='r', linestyle='--')
    plt.xlabel('Prediction Error (Actual - Predicted)')
    plt.ylabel('Frequency')
    plt.title('Distribution of Prediction Errors')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('error_distribution.png')
    plt.close()
    
    # 3. Error vs Predicted plot
    plt.figure(figsize=(10, 6))
    plt.scatter(y_pred, results_df['Error'], alpha=0.6)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Rating')
    plt.ylabel('Prediction Error')
    plt.title('Prediction Error vs Predicted Rating')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('error_vs_predicted.png')
    plt.close()
    
    # Return results dataframe for further analysis
    return results_df

#  Save and load the model
def save_model(model, filename='movie_rating_model.pkl'):
    """
    Save the trained model to a file
    """
    import pickle
    
    try:
        with open(filename, 'wb') as f:
            pickle.dump(model, f)
        print(f"Model saved successfully to {filename}")
        return True
    except Exception as e:
        print(f"Error saving model: {str(e)}")
        return False

def load_model(filename='movie_rating_model.pkl'):
    """
    Load a trained model from a file
    """
    import pickle
    
    try:
        with open(filename, 'rb') as f:
            model = pickle.load(f)
        print(f"Model loaded successfully from {filename}")
        return model
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None

# 13. Main function
def main():
    """
    Main function to run the entire pipeline
    """
    print("=" * 50)
    print("Movie Rating Prediction System")
    print("=" * 50)
    
    # Ask for file path
    file_path = input("Enter the path to your movie dataset CSV file (or press Enter to use sample data): ")
    if not file_path.strip():
        file_path = "sample_movies.csv"  # This will trigger the sample data creation
    
    # 1. Load and explore data
    df = load_and_explore_data(file_path)
    
    # 2. Preprocess data
    processed_df = preprocess_data(df)
    
    # 3. Engineer features
    featured_df = engineer_features(processed_df)
    
    # 4. Prepare for modeling
    try:
        X_train, X_test, y_train, y_test, numerical_cols, categorical_cols = prepare_for_modeling(featured_df)
        
        # 5. Build and evaluate model
        best_model, results = build_and_evaluate_model(X_train, X_test, y_train, y_test, numerical_cols, categorical_cols)
        
        # 6. Analyze feature importance
        feature_importance = analyze_feature_importance(best_model, numerical_cols, categorical_cols)
        
        # 7. Tune the best model
        best_model_name = max(results, key=lambda k: results[k]['R2'])
        tuned_model = tune_best_model(X_train, y_train, best_model_name, numerical_cols, categorical_cols)
        
        # 8. Evaluate the tuned model
        tuned_preds = tuned_model.predict(X_test)
        tuned_r2 = r2_score(y_test, tuned_preds)
        tuned_rmse = np.sqrt(mean_squared_error(y_test, tuned_preds))
        
        print(f"\nTuned Model Performance:")
        print(f"  RMSE: {tuned_rmse:.4f}")
        print(f"  R-squared: {tuned_r2:.4f}")
        
        # 9. Visualize predictions
        results_df = visualize_predictions(tuned_model, X_test, y_test)
        
        # 10. Save the model
        save_model(tuned_model)
        
        # 11. Run the interactive rating estimator
        print("\nStarting interactive rating estimator...")
        try:
            create_interactive_rating_estimator(tuned_model, featured_df)
        except Exception as e:
            print(f"Error creating GUI: {str(e)}. Using console interface instead.")
            demonstrate_rating_estimator(tuned_model, featured_df)
    
    except Exception as e:
        print(f"An error occurred in the modeling pipeline: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Movie Rating Prediction System


Enter the path to your movie dataset CSV file (or press Enter to use sample data):  movies


Error loading data: 'utf-8' codec can't decode byte 0xe1 in position 76763: invalid continuation byte
Creating a sample dataset for testing...
Dataset loaded with 5 rows and 10 columns

First 5 records:
      Name  Year  Duration              Genre  Rating  Votes    Director  \
0  Movie 1  2020       120             Action     8.5  10000  Director 1   
1  Movie 2  2019       105             Comedy     7.2   5000  Director 2   
2  Movie 3  2021       135              Drama     8.9  12000  Director 3   
3  Movie 4  2018        95  Action, Adventure     6.8   3000  Director 1   
4  Movie 5  2022       110    Comedy, Romance     7.5   6000  Director 4   

   Actor 1  Actor 2  Actor 3  
0  Actor A  Actor E  Actor J  
1  Actor B  Actor F  Actor K  
2  Actor C  Actor G  Actor L  
3  Actor D  Actor H  Actor M  
4  Actor B  Actor I  Actor N  

Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dty