In [2]:
!pip install gradio pandas numpy matplotlib seaborn plotly





In [None]:
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

# Enhanced data preprocessing functions
def convert_into_bytes(column_name):
    """Convert a human-readable size value (e.g. ``"19M"``) into bytes.

    Args:
        column_name: A single cell value taken from the size column (despite
            the parameter name, this is a value, not a column label).

    Returns:
        For strings: the size in bytes as a float when the text carries a
        ``k``/``m``/``g`` unit or is a plain number, and ``np.nan`` when it
        cannot be parsed (this covers ``"Varies with device"``).
        Non-string inputs are returned unchanged.
    """
    if not isinstance(column_name, str):
        return column_name

    text = column_name.lower()
    # Units are probed in k -> m -> g order; the first unit letter found
    # anywhere in the text decides the multiplier.
    for unit, multiplier in (("k", 1024), ("m", 1024 ** 2), ("g", 1024 ** 3)):
        if unit in text:
            try:
                return float(text.replace(unit, "")) * multiplier
            except ValueError:
                # Text such as "unknown" or "3.5 MB" contains a unit letter
                # but is not a number; report it as missing instead of
                # letting the ValueError escape.
                return np.nan

    try:
        return float(text)
    except ValueError:
        return np.nan

def installs_cleaner(install):
    """Normalise one raw install-count value to a number.

    Args:
        install: A cell value from the installs column, e.g. ``"1,000+"``.

    Returns:
        For strings: the integer obtained after removing ``+`` and ``,``, or
        ``0`` when the remaining text is not an integer. For other inputs:
        the value itself, or ``0`` when it is missing (``None``/NaN).
    """
    if isinstance(install, str):
        digits = install.replace("+", "").replace(",", "")
        try:
            return int(digits)
        except ValueError:
            # Only a parse failure means "no usable count"; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            return 0
    return install if pd.notna(install) else 0

def adjust_price(price):
    """Normalise one raw price value to a float amount.

    Args:
        price: A cell value from the price column, e.g. ``"$4.99"`` or ``"0"``.

    Returns:
        For strings: the numeric amount after removing ``$`` and thousands
        separators, or ``0.0`` when the text is not a number. For other
        inputs: the value itself, or ``0.0`` when it is missing.
    """
    if isinstance(price, str):
        cleaned = price.replace("$", "").replace(",", "").strip()
        try:
            amount = float(cleaned)
        except ValueError:
            # Non-numeric text (e.g. "Everyone") is treated as "no price".
            return 0.0
        # float("nan") parses successfully but is still a missing price.
        return amount if pd.notna(amount) else 0.0
    return price if pd.notna(price) else 0.0

def create_advanced_features(df):
    """Return a copy of ``df`` extended with derived modelling columns.

    Each derived column is only added when the columns it depends on exist:
    ``Review_Install_Ratio``, ``Log_Reviews``, ``Log_Installs``, ``Log_Size``,
    ``Price_Category`` and ``Category_Popularity``. The input frame itself is
    left untouched.
    """
    enriched = df.copy()
    has_reviews = 'Reviews' in df.columns
    has_installs = 'Installs' in df.columns

    # Engagement metric: reviews per install, forced to 0 for apps that
    # report no installs.
    if has_reviews and has_installs:
        installs = enriched['Installs']
        reviews_per_install = enriched['Reviews'] / (installs + 1)
        enriched['Review_Install_Ratio'] = np.where(installs > 0, reviews_per_install, 0)

    # log1p compresses the heavy right tail of the count-like columns.
    if has_reviews:
        enriched['Log_Reviews'] = np.log1p(enriched['Reviews'])
    if has_installs:
        enriched['Log_Installs'] = np.log1p(enriched['Installs'])
    if 'Size_MB' in df.columns:
        size_without_gaps = enriched['Size_MB'].fillna(0)
        enriched['Log_Size'] = np.log1p(size_without_gaps)

    # Bucket prices into named tiers.
    if 'Price' in df.columns:
        tier_edges = [-0.1, 0, 1, 5, 20, float('inf')]
        tier_names = ['Free', 'Cheap', 'Moderate', 'Expensive', 'Premium']
        enriched['Price_Category'] = pd.cut(enriched['Price'], bins=tier_edges, labels=tier_names)

    # Popularity of a category = number of rows sharing that category.
    if 'Category' in df.columns:
        apps_per_category = enriched['Category'].value_counts()
        enriched['Category_Popularity'] = enriched['Category'].map(apps_per_category)

    return enriched

def _fill_missing_ratings(df):
    """Fill missing ``Rating`` values in place, most specific average first."""
    # Strategy 1: mean rating of apps sharing both category and install tier.
    # Rows whose group has no rated app (or whose key is missing) stay NaN.
    if 'Category' in df.columns and 'Installs_category' in df.columns:
        tier_means = df.groupby(
            ['Category', 'Installs_category'], observed=True
        )['Rating'].transform('mean')
        df['Rating'] = df['Rating'].fillna(tier_means)

    # Strategy 2: fall back to the per-category mean (computed after the
    # first pass, so it includes the values filled above).
    if df['Rating'].isnull().any() and 'Category' in df.columns:
        category_means = df.groupby('Category')['Rating'].transform('mean')
        df['Rating'] = df['Rating'].fillna(category_means)

    # Strategy 3: whatever is still missing gets the global median.
    df['Rating'] = df['Rating'].fillna(df['Rating'].median())


def preprocess_data(df):
    """Clean the apps dataset and derive modelling features.

    The steps run only for the columns that are present: drop rows whose
    ``Category`` looks like a decimal number, coerce ``Reviews`` to int,
    convert ``Size`` to bytes and ``Size_MB``, clean ``Installs`` and bucket
    it into ``Installs_category``, clean ``Price``, impute ``Rating``, drop
    duplicate rows and finally add the advanced features.

    Args:
        df: Raw apps DataFrame. It is not modified; all work happens on a copy.

    Returns:
        A ``(dataframe, cleaning_log)`` tuple. ``cleaning_log`` is a list of
        human-readable messages. If any step raises, the error is appended to
        the log and the frame is returned in whatever state it had reached.
    """
    cleaning_log = []

    try:
        original_rows = len(df)
        cleaning_log.append(f"📊 Original dataset: {original_rows:,} rows")

        # Work on a private copy so column assignments below never leak into
        # the caller's DataFrame.
        df = df.copy()

        # Remove problematic rows
        if 'Category' in df.columns:
            problematic_rows = df[df['Category'].astype(str).str.contains(r'^\d+\.\d+$', na=False)].index
            if len(problematic_rows) > 0:
                df = df.drop(problematic_rows, errors='ignore')
                cleaning_log.append(f"🗑️ Removed {len(problematic_rows)} problematic category rows")

        # Convert Reviews to integer
        if 'Reviews' in df.columns:
            original_nulls = df['Reviews'].isnull().sum()
            df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce').fillna(0).astype(int)
            cleaning_log.append(f"🔢 Reviews column: Fixed {original_nulls} null values")

        # Process Size column with enhanced conversion
        if 'Size' in df.columns:
            original_nulls = df['Size'].isnull().sum()
            df['Size'] = df['Size'].apply(convert_into_bytes)
            df['Size_MB'] = df['Size'] / (1024 * 1024)
            # Fill missing sizes with the median of their category. Filling
            # from a transformed series (rather than replacing the column with
            # the transform output) keeps known sizes on rows whose Category
            # is missing.
            if 'Category' in df.columns:
                category_medians = df.groupby('Category')['Size_MB'].transform('median')
                df['Size_MB'] = df['Size_MB'].fillna(category_medians)
            df['Size_MB'] = df['Size_MB'].fillna(df['Size_MB'].median())
            cleaning_log.append(f"📱 Size column: Enhanced conversion, {original_nulls} null values handled")

        # Enhanced Installs processing
        if 'Installs' in df.columns:
            original_nulls = df['Installs'].isnull().sum()
            df['Installs'] = df['Installs'].apply(installs_cleaner)
            df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce').fillna(0).astype(int)

            # Create more granular Install categories
            bins = [-1, 0, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 10000000000]
            labels = ['No installs', 'Very low', 'Low', 'Moderate', 'Good', 'High', 'Very High', 'Excellent', 'Top Tier', 'Viral']
            df['Installs_category'] = pd.cut(df['Installs'], bins=bins, labels=labels)
            cleaning_log.append(f"📈 Installs column: Enhanced categories, {original_nulls} null values handled")

        # Enhanced Price processing
        if 'Price' in df.columns:
            original_nulls = df['Price'].isnull().sum()
            df['Price'] = df['Price'].apply(adjust_price)
            df['Price'] = pd.to_numeric(df['Price'], errors='coerce').fillna(0)
            cleaning_log.append(f"💰 Price column: Cleaned format, {original_nulls} null values handled")

        # Smart Rating handling with multiple strategies
        if 'Rating' in df.columns:
            original_rating_nulls = df['Rating'].isnull().sum()
            _fill_missing_ratings(df)
            final_rating_nulls = df['Rating'].isnull().sum()
            fixed_ratings = original_rating_nulls - final_rating_nulls
            cleaning_log.append(f"⭐ Rating column: Smart filling fixed {fixed_ratings} missing ratings")

        # Remove duplicates
        original_rows_before_dedup = len(df)
        df = df.drop_duplicates()
        duplicates_removed = original_rows_before_dedup - len(df)
        if duplicates_removed > 0:
            cleaning_log.append(f"🔄 Removed {duplicates_removed} duplicate rows")

        # Create advanced features
        df = create_advanced_features(df)
        cleaning_log.append(f"🚀 Created advanced features for better predictions")

        final_rows = len(df)
        cleaning_log.append(f"✅ Final dataset: {final_rows:,} rows ({final_rows - original_rows:+,} change)")

        return df, cleaning_log
    except Exception as e:
        cleaning_log.append(f"❌ Error during preprocessing: {str(e)}")
        return df, cleaning_log

def preprocess_reviews_data(df_reviews):
    """Clean the user-reviews frame: drop rows without an app name, then duplicates.

    ``None`` and empty frames are returned as-is. If a cleaning step raises,
    the error is printed and the frame is returned in the state reached so far.
    """
    if df_reviews is None or df_reviews.empty:
        return df_reviews

    cleaned = df_reviews
    try:
        # Each step rebinds ``cleaned`` so a failure in the second step still
        # returns the result of the first.
        cleaned = cleaned.dropna(subset=['App'])
        cleaned = cleaned.drop_duplicates()
    except Exception as exc:
        print(f"Reviews preprocessing error: {exc}")
    return cleaned

# Enhanced visualization functions
def create_category_distribution(df):
    """Build a bar chart of the 15 most frequent values in ``Category``.

    Returns the plotly figure, or ``None`` when the frame is missing/empty,
    has no ``Category`` column, or plotting raises (the error is printed).
    """
    if df is None or df.empty:
        return None
    if 'Category' not in df.columns:
        return None

    try:
        top_categories = df['Category'].value_counts().head(15)
        names = top_categories.index
        counts = top_categories.values

        chart = px.bar(
            x=names,
            y=counts,
            title="Top 15 App Categories by Count",
            labels={'x': 'Category', 'y': 'Number of Apps'},
            color=counts,
            color_continuous_scale='viridis',
            text=counts
        )
        # Print the count above each bar and tilt the category labels.
        chart.update_traces(texttemplate='%{text}', textposition='outside')
        chart.update_layout(xaxis_tickangle=-45, height=600)
    except Exception as exc:
        print(f"Category distribution error: {exc}")
        return None
    return chart

def create_rating_distribution(df):
    """Build an overlaid histogram of ``Rating``.

    Bars are coloured by ``Installs_category`` when that column exists.
    Returns ``None`` when the frame is missing/empty, has no ``Rating``
    column, or plotting raises (the error is printed).
    """
    if df is None or df.empty:
        return None
    if 'Rating' not in df.columns:
        return None

    try:
        color_column = 'Installs_category' if 'Installs_category' in df.columns else None
        histogram_options = dict(
            x='Rating',
            color=color_column,
            title="Rating Distribution by Install Category",
            nbins=20,
            barmode='overlay',
            opacity=0.7,
        )
        chart = px.histogram(df, **histogram_options)
        chart.update_layout(height=500)
    except Exception as exc:
        print(f"Rating distribution error: {exc}")
        return None
    return chart

def create_installs_vs_reviews(df):
    """Build a log-log scatter plot of reviews against installs.

    At most 5000 randomly sampled rows are plotted. Points are coloured by
    ``Rating`` and sized by ``Size_MB`` when those columns exist. Returns
    ``None`` when the frame is missing/empty, lacks ``Reviews`` or
    ``Installs``, or plotting raises (the error is printed).
    """
    if df is None or df.empty:
        return None
    if 'Reviews' not in df.columns or 'Installs' not in df.columns:
        return None

    try:
        # Sample data for better performance
        subset = df.sample(min(5000, len(df)))
        log_reviews = np.log10(subset['Reviews'] + 1)
        log_installs = np.log10(subset['Installs'] + 1)

        # Hover details are only shown when all three columns are available.
        hover_fields = ['App', 'Category', 'Rating']
        if not all(name in df.columns for name in hover_fields):
            hover_fields = None

        chart = px.scatter(
            subset,
            x=log_reviews,
            y=log_installs,
            color='Rating' if 'Rating' in df.columns else None,
            size='Size_MB' if 'Size_MB' in df.columns else None,
            title="Relationship between Reviews and Installs (Log Scale)",
            labels={'x': 'Log10(Reviews + 1)', 'y': 'Log10(Installs + 1)'},
            hover_data=hover_fields
        )
        chart.update_layout(height=600)
    except Exception as exc:
        print(f"Scatter plot error: {exc}")
        return None
    return chart

def create_category_metrics(df):
    """Summarise the dataset per category.

    Returns a DataFrame indexed by category with whichever of the rating,
    review, install and price aggregates can be computed plus ``App_Count``,
    rounded to two decimals and limited to the top 15 rows (ranked by
    ``Total_Installs`` when present, otherwise by ``App_Count``). An empty
    DataFrame is returned for missing/empty input, a missing ``Category``
    column, or when aggregation raises (the error is printed).
    """
    if df is None or df.empty or 'Category' not in df.columns:
        return pd.DataFrame()

    try:
        by_category = df.groupby('Category')
        aggregates = {}

        if 'Rating' in df.columns:
            aggregates['Avg_Rating'] = by_category['Rating'].mean()
        if 'Reviews' in df.columns:
            reviews = by_category['Reviews']
            aggregates['Total_Reviews'] = reviews.sum()
            aggregates['Avg_Reviews'] = reviews.mean()
        if 'Installs' in df.columns:
            installs = by_category['Installs']
            aggregates['Total_Installs'] = installs.sum()
            aggregates['Avg_Installs'] = installs.mean()
        if 'Price' in df.columns:
            aggregates['Avg_Price'] = by_category['Price'].mean()

        aggregates['App_Count'] = by_category.size()

        summary = pd.DataFrame(aggregates).round(2)

        # Rank by reach when install data exists, otherwise by catalogue size.
        ranking_column = 'Total_Installs' if 'Total_Installs' in summary.columns else 'App_Count'
        return summary.sort_values(ranking_column, ascending=False).head(15)
    except Exception as exc:
        print(f"Category metrics error: {exc}")
        return pd.DataFrame()

def create_price_analysis(df):
    """Build a two-panel figure: free-vs-paid counts and the paid price histogram.

    The histogram panel is only filled when at least one app has a price
    above zero. Returns ``None`` when the frame is missing/empty, has no
    ``Price`` column, or plotting raises (the error is printed).
    """
    if df is None or df.empty:
        return None
    if 'Price' not in df.columns:
        return None

    def _app_type(price):
        # Missing and zero prices both count as free.
        if pd.isna(price) or price == 0:
            return 'Free'
        return 'Paid'

    try:
        priced = df.copy()
        priced['App_Type'] = priced['Price'].apply(_app_type)

        # Create subplots
        figure = make_subplots(
            rows=1, cols=2,
            subplot_titles=('Free vs Paid Apps', 'Price Distribution (Paid Apps)'),
            specs=[[{"type": "bar"}, {"type": "histogram"}]]
        )

        # Left panel: free vs paid counts.
        counts_by_type = priced['App_Type'].value_counts()
        type_bars = go.Bar(x=counts_by_type.index, y=counts_by_type.values, name="App Type")
        figure.add_trace(type_bars, row=1, col=1)

        # Right panel: price distribution of paid apps only.
        paid_prices = priced[priced['Price'] > 0]['Price']
        if len(paid_prices) > 0:
            price_histogram = go.Histogram(x=paid_prices, nbinsx=20, name="Price Distribution")
            figure.add_trace(price_histogram, row=1, col=2)

        figure.update_layout(height=500, showlegend=False, title_text="Enhanced Price Analysis")
        return figure

    except Exception as exc:
        print(f"Price analysis error: {exc}")
        return None

def create_correlation_heatmap(df):
    """Build a correlation heatmap over the known numeric feature columns.

    Only columns from a fixed candidate list that exist in ``df`` are used;
    fewer than two such columns yields ``None``. ``None`` is also returned for
    missing/empty input or when plotting raises (the error is printed).
    """
    if df is None or df.empty:
        return None

    candidate_columns = (
        'Rating', 'Reviews', 'Installs', 'Price', 'Size_MB',
        'Log_Reviews', 'Log_Installs', 'Review_Install_Ratio',
    )

    try:
        present = [name for name in candidate_columns if name in df.columns]
        if len(present) < 2:
            return None

        correlations = df[present].corr()

        heatmap = px.imshow(
            correlations,
            text_auto=True,
            aspect="auto",
            title="Enhanced Correlation Matrix of Features",
            color_continuous_scale='RdBu',
            zmin=-1, zmax=1
        )
        heatmap.update_layout(height=600)
    except Exception as exc:
        print(f"Correlation heatmap error: {exc}")
        return None
    return heatmap

def create_sentiment_analysis(df_reviews):
    """Build a pie chart of the value counts of the ``Sentiment`` column.

    Returns ``None`` when the frame is missing/empty, has no ``Sentiment``
    column, or plotting raises (the error is printed).
    """
    if df_reviews is None or df_reviews.empty:
        return None
    if 'Sentiment' not in df_reviews.columns:
        return None

    try:
        per_sentiment = df_reviews['Sentiment'].value_counts()

        pie = px.pie(
            values=per_sentiment.values,
            names=per_sentiment.index,
            title="Sentiment Distribution of User Reviews",
            color_discrete_sequence=px.colors.qualitative.Set3
        )
        # Show the share and the label inside each slice.
        pie.update_traces(textposition='inside', textinfo='percent+label')
        pie.update_layout(height=500)
    except Exception as exc:
        print(f"Sentiment analysis error: {exc}")
        return None
    return pie

def create_size_distribution(df):
    """Build a histogram (with a marginal box plot) of ``Size_MB``.

    Missing sizes are dropped and values above the 95th percentile are
    excluded before plotting. Returns ``None`` when the frame is
    missing/empty, has no ``Size_MB`` column, or plotting raises (the error
    is printed).
    """
    if df is None or df.empty:
        return None
    if 'Size_MB' not in df.columns:
        return None

    try:
        known_sizes = df['Size_MB'].dropna()
        # Remove extreme outliers
        cutoff = known_sizes.quantile(0.95)
        trimmed_sizes = known_sizes[known_sizes <= cutoff]

        chart = px.histogram(
            x=trimmed_sizes,
            nbins=40,
            title="App Size Distribution (MB) - 95th Percentile",
            labels={'x': 'Size (MB)', 'y': 'Count'},
            marginal="box"  # Add box plot
        )
        chart.update_layout(height=500)
    except Exception as exc:
        print(f"Size distribution error: {exc}")
        return None
    return chart

def create_installs_pie(df):
    """Build a pie chart of the value counts of ``Installs_category``.

    Returns ``None`` when the frame is missing/empty, has no
    ``Installs_category`` column, or plotting raises (the error is printed).
    """
    if df is None or df.empty:
        return None
    if 'Installs_category' not in df.columns:
        return None

    try:
        per_tier = df['Installs_category'].value_counts()

        pie = px.pie(
            values=per_tier.values,
            names=per_tier.index,
            title="Installs Category Distribution",
            color_discrete_sequence=px.colors.qualitative.Pastel
        )
        # Show the share and the label inside each slice.
        pie.update_traces(textposition='inside', textinfo='percent+label')
        pie.update_layout(height=500)
    except Exception as exc:
        print(f"Installs pie error: {exc}")
        return None
    return pie

def create_missing_values_plot(df):
    """Build a bar chart of the percentage of missing values per column.

    Only columns with at least one missing value are shown, highest share
    first. When nothing is missing, a figure carrying a green confirmation
    annotation is returned instead. Returns ``None`` for missing/empty input
    or when plotting raises (the error is printed).
    """
    if df is None or df.empty:
        return None

    try:
        share_missing = df.isnull().mean() * 100
        share_missing = share_missing.sort_values(ascending=False)
        share_missing = share_missing[share_missing > 0]

        if share_missing.empty:
            # Nothing to plot: show a confirmation message in an empty figure.
            all_clear = go.Figure()
            all_clear.add_annotation(
                text="No missing values found! ✅",
                xref="paper", yref="paper",
                x=0.5, y=0.5, xanchor='center', yanchor='middle',
                showarrow=False,
                font=dict(size=20, color="green")
            )
            all_clear.update_layout(title="Missing Values Analysis", height=400)
            return all_clear

        chart = px.bar(
            x=share_missing.index,
            y=share_missing.values,
            title="Missing Values Percentage by Column",
            labels={'x': 'Column', 'y': 'Percentage Missing'},
            color=share_missing.values,
            color_continuous_scale='Reds'
        )
        chart.update_layout(height=400, xaxis_tickangle=-45)
        return chart
    except Exception as exc:
        print(f"Missing values plot error: {exc}")
        return None

def get_duplicates_info(df):
    """Return a markdown summary of how many fully duplicated rows ``df`` has.

    Returns ``"No data available"`` for missing/empty input and an error
    string when the analysis raises.
    """
    nothing_to_analyse = df is None or df.empty
    if nothing_to_analyse:
        return "No data available"

    try:
        total_rows = len(df)
        duplicates = df.duplicated().sum()
        duplicate_pct = (duplicates / total_rows) * 100
    except Exception as e:
        return f"Error analyzing duplicates: {e}"

    try:
        summary = f"""
🔁 **Duplicate Analysis:**
• Total duplicates: **{duplicates:,}**
• Total rows: **{total_rows:,}**
• Duplicate percentage: **{duplicate_pct:.2f}%**
        """
    except Exception as e:
        return f"Error analyzing duplicates: {e}"
    return summary

def get_safe_info_string(df_processed):
    """Return a markdown overview of the processed apps dataset.

    Every statistic whose source column is absent is rendered as ``N/A``
    (category and unique-app counts fall back to ``0``). A fixed message is
    returned for missing/empty input and an error string when building the
    overview raises.
    """
    if df_processed is None or df_processed.empty:
        return "❌ No data processed. Please check your CSV file format."

    try:
        columns = df_processed.columns
        total_apps = len(df_processed)
        categories = df_processed['Category'].nunique() if 'Category' in columns else 0
        unique_apps = df_processed['App'].nunique() if 'App' in columns else 0

        # Rating statistics need at least one non-null rating.
        avg_rating = rating_std = "N/A"
        if 'Rating' in columns and not df_processed['Rating'].isnull().all():
            ratings = df_processed['Rating']
            avg_rating = f"{ratings.mean():.2f}"
            rating_std = f"{ratings.std():.2f}"

        # Review statistics.
        total_reviews = avg_reviews = "N/A"
        if 'Reviews' in columns:
            reviews = df_processed['Reviews']
            total_reviews = f"{reviews.sum():,}"
            avg_reviews = f"{reviews.mean():.0f}"

        # Price statistics; the average only covers apps that cost money.
        free_apps = paid_apps = avg_price = "N/A"
        if 'Price' in columns:
            free_apps = f"{(df_processed['Price'] == 0).sum():,}"
            is_paid = df_processed['Price'] > 0
            paid_apps = f"{is_paid.sum():,}"
            if any(is_paid):
                avg_price = f"${df_processed[is_paid]['Price'].mean():.2f}"

        info_text = f"""📊 **Enhanced Apps Dataset Overview:**
• Total Apps: {total_apps:,}
• Categories: {categories}
• Unique Apps: {unique_apps:,}
• Average Rating: {avg_rating} (±{rating_std})
• Total Reviews: {total_reviews}
• Avg Reviews per App: {avg_reviews}
• Free Apps: {free_apps}
• Paid Apps: {paid_apps}
• Avg Price (Paid): {avg_price}

✅ **Data successfully enhanced and ready for advanced analysis!**"""

        return info_text
    except Exception as e:
        return f"❌ Error generating info: {str(e)}"

def get_safe_preview(df_processed):
    """Return the first 10 rows of the key columns for display.

    The preferred columns are used when any of them exist; otherwise the
    first eight columns of the frame are shown. An empty DataFrame is
    returned for missing/empty input or when slicing raises (the error is
    printed).
    """
    if df_processed is None or df_processed.empty:
        return pd.DataFrame()

    preferred = ('App', 'Category', 'Rating', 'Reviews', 'Installs', 'Price', 'Size_MB')
    try:
        shown = [name for name in preferred if name in df_processed.columns]
        if len(shown) == 0:
            # None of the preferred columns exist: fall back to the leading ones.
            shown = list(df_processed.columns)[:8]
        return df_processed[shown].head(10)
    except Exception as exc:
        print(f"Preview error: {exc}")
        return pd.DataFrame()

# FIXED: Category choices function
def get_category_choices(df_processed):
    """Return the dropdown choices: ``"All"`` followed by the sorted categories.

    Category values are stringified and stripped; missing values and the
    placeholder texts ``''``, ``'nan'``, ``'None'`` and ``'NaN'`` are left
    out. ``["All"]`` alone is returned for missing/empty input, a missing
    ``Category`` column, or when collecting the values raises (the error is
    printed).
    """
    try:
        if df_processed is None or df_processed.empty:
            return ["All"]
        if 'Category' not in df_processed.columns:
            return ["All"]

        placeholders = ('', 'nan', 'None', 'NaN')
        distinct_labels = set()
        for raw_value in df_processed['Category'].dropna().unique():
            label = str(raw_value).strip()
            if label not in placeholders:
                distinct_labels.add(label)

        # "All" always comes first, followed by the alphabetical list.
        return ["All", *sorted(distinct_labels)]

    except Exception as exc:
        print(f"Category choices error: {exc}")
        return ["All"]

# FIXED: Enhanced Predictive Modeling Functions
def prepare_enhanced_features_for_modeling(df, selected_features):
    """Assemble the feature matrix and target vector for rating prediction.

    Rows without a ``Rating`` are dropped. Each entry of
    ``selected_features`` that is a supported feature name and exists as a
    column contributes one matrix column, in selection order; anything else
    is skipped. Rows containing non-finite values are removed at the end.

    Returns:
        ``(X, y, feature_names, label_encoders, error)``. On success
        ``error`` is ``None``; on failure the first four items are ``None``
        and ``error`` holds a message.
    """
    if df is None or df.empty or 'Rating' not in df.columns:
        return None, None, None, None, "❌ No data or Rating column not found"

    # Numeric features whose gaps are simply replaced by zero.
    zero_filled = (
        'Reviews', 'Log_Reviews', 'Installs', 'Log_Installs',
        'Price', 'Review_Install_Ratio', 'Category_Popularity',
    )

    try:
        # Remove rows with missing ratings
        model_df = df.dropna(subset=['Rating']).copy()
        if len(model_df) < 100:
            return None, None, None, None, "❌ Not enough data for modeling (need at least 100 samples)"

        matrix_columns = []
        feature_names = []
        label_encoders = {}

        for feature in selected_features:
            if feature in zero_filled:
                if feature not in model_df.columns:
                    continue
                values = model_df[feature].fillna(0).values
            elif feature == 'Size_MB':
                if 'Size_MB' not in model_df.columns:
                    continue
                # Unknown sizes take the median size rather than zero.
                sizes = model_df['Size_MB']
                values = sizes.fillna(sizes.median()).values
            elif feature == 'Category':
                if 'Category' not in model_df.columns:
                    continue
                # Categories become integer codes; the fitted encoder is
                # handed back so the same mapping can be reused later.
                encoder = LabelEncoder()
                values = encoder.fit_transform(model_df['Category'].fillna('Unknown'))
                label_encoders['Category'] = encoder
            else:
                continue

            matrix_columns.append(values)
            feature_names.append(feature)

        if not matrix_columns:
            return None, None, None, None, "❌ No valid features selected or available in data"

        X = np.column_stack(matrix_columns)
        y = model_df['Rating'].values

        # Remove any remaining NaN or infinite values
        finite_rows = np.isfinite(X).all(axis=1) & np.isfinite(y)
        X, y = X[finite_rows], y[finite_rows]

        if len(X) < 100:
            return None, None, None, None, "❌ Not enough valid data after cleaning"

        return X, y, feature_names, label_encoders, None

    except Exception as e:
        return None, None, None, None, f"❌ Error preparing features: {str(e)}"

# FIXED: Model training function
def train_enhanced_rating_prediction_model(df, selected_features):
    """Train several regressors on the selected features and report the best.

    The data is split 80/20, scaled with ``RobustScaler`` and fitted with a
    random forest, gradient boosting and linear regression. The model with
    the highest test-set R² is selected.

    Args:
        df: Processed apps DataFrame containing a ``Rating`` column.
        selected_features: Feature names to train on; must be non-empty.

    Returns:
        ``(results_text, best_model, scaler, importance_fig, label_encoders)``.
        ``importance_fig`` is ``None`` when the best model exposes no
        ``feature_importances_``. On any failure the first item is an error
        message and the remaining four are ``None``.
    """
    if not selected_features:
        return "❌ Please select at least one feature for training", None, None, None, None

    X, y, feature_names, label_encoders, error = prepare_enhanced_features_for_modeling(df, selected_features)

    if error:
        return error, None, None, None, None

    try:
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Scale features using RobustScaler (better for outliers)
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train multiple models
        models = {
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10),
            'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=6),
            'Linear Regression': LinearRegression()
        }

        best_model = None
        best_score = -np.inf
        best_model_name = ""

        model_results = {}

        for name, model in models.items():
            # Train model
            model.fit(X_train_scaled, y_train)

            # Make predictions
            y_pred = model.predict(X_test_scaled)

            # Calculate metrics
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            # Cross-validation score; a failed CV run must not abort training,
            # but only ordinary errors are tolerated (not KeyboardInterrupt).
            try:
                cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=3, scoring='r2')
                cv_mean = cv_scores.mean()
            except Exception:
                cv_mean = r2  # Fallback to test R2

            model_results[name] = {
                'model': model,
                'r2': r2,
                'rmse': rmse,
                'mae': mae,
                'cv_score': cv_mean
            }

            # Track best model
            if r2 > best_score:
                best_score = r2
                best_model = model
                best_model_name = name

        # Prepare results
        results_text = f"""🤖 **Enhanced Model Training Results:**

📊 **Dataset Info:**
• Training samples: {len(X_train):,}
• Testing samples: {len(X_test):,}
• Features used: {', '.join(feature_names)}

🏆 **Best Model: {best_model_name}**

🎯 **Model Comparison:**"""

        # The R² ranking is the same for every row, so compute it once.
        ranked_r2 = sorted((entry['r2'] for entry in model_results.values()), reverse=True)
        runner_up_r2 = ranked_r2[1]

        for name, results in model_results.items():
            if name == best_model_name:
                emoji = "🥇"
            elif results['r2'] == runner_up_r2:
                emoji = "🥈"
            else:
                emoji = "🥉"
            results_text += f"""
{emoji} **{name}:**
   • R² Score: {results['r2']:.4f} ({results['r2']*100:.2f}% variance explained)
   • Cross-Val R²: {results['cv_score']:.4f}
   • RMSE: {results['rmse']:.4f}
   • MAE: {results['mae']:.4f}"""

        if best_score > 0.6:
            verdict = "• 🎉 Excellent model performance!"
        elif best_score > 0.4:
            verdict = "• 🎊 Good model performance!"
        else:
            verdict = "• ⚠️ Model needs improvement - try more features"

        results_text += f"""

📈 **Best Model Interpretation:**
• Explains {best_score*100:.2f}% of rating variance
• Average prediction error: ±{model_results[best_model_name]['mae']:.2f} stars
• Cross-validation stability: {model_results[best_model_name]['cv_score']:.3f}
{verdict}
"""

        # Feature importance (for tree-based models)
        importance_fig = None
        if hasattr(best_model, 'feature_importances_'):
            importance_fig = create_enhanced_feature_importance_plot(best_model, feature_names)

        # Return the fitted model object itself, not the metrics dictionary.
        return results_text, best_model, scaler, importance_fig, label_encoders

    except Exception as e:
        return f"❌ Error training model: {str(e)}", None, None, None, None

def create_enhanced_feature_importance_plot(model, feature_names):
    """Build a horizontal bar chart of ``model.feature_importances_``.

    ``feature_names`` supplies the label for each importance value, by
    position. Returns ``None`` when anything raises, including a model
    without ``feature_importances_`` (the error is printed).
    """
    try:
        scores = model.feature_importances_

        # Rank features from most to least important.
        ranking = np.argsort(scores)[::-1]
        ranked_names = [feature_names[position] for position in ranking]
        ranked_scores = scores[ranking]

        chart = px.bar(
            x=ranked_scores,
            y=ranked_names,
            orientation='h',
            title="Feature Importance for Rating Prediction",
            labels={'x': 'Importance', 'y': 'Features'},
            color=ranked_scores,
            color_continuous_scale='plasma'
        )
        # Grow the figure with the number of features, never below 400px.
        chart_height = max(400, len(feature_names) * 40)
        chart.update_layout(height=chart_height, yaxis={'categoryorder':'total ascending'})
    except Exception as exc:
        print(f"Feature importance plot error: {exc}")
        return None
    return chart

# FIXED: Prediction function
def predict_enhanced_rating(model, scaler, label_encoders, feature_names, features_dict):
    """Predict an app rating from raw user inputs and format a text report.

    Args:
        model: Fitted regressor exposing ``predict``. When it also exposes
            ``estimators_`` a rough spread across the first 10 estimators is
            appended to the report (best effort).
        scaler: Fitted transformer exposing ``transform``.
        label_encoders: Mapping of column name to fitted encoder, or None.
        feature_names: Feature names in the order used at training time.
        features_dict: Raw inputs keyed by 'reviews', 'installs', 'size',
            'price' and 'category'. A missing key or a None value (e.g. a
            cleared numeric field in the UI) falls back to the default for
            that input.

    Returns:
        str: The formatted prediction report, or a message starting with
        "❌" when no prediction could be made.
    """
    if model is None:
        return "❌ Please train a model first"

    if scaler is None:
        return "❌ Model scaler not available. Please retrain the model."

    def _input(key, default):
        # Treat an explicit None the same as a missing key.
        value = features_dict.get(key, default)
        return default if value is None else value

    try:
        reviews = _input('reviews', 0)
        installs = _input('installs', 0)
        size = _input('size', 25)
        price = _input('price', 0)
        category = _input('category', 'GAME')

        # Prepare input features in the same order as training
        input_features = []

        for feature in feature_names:
            if feature == 'Reviews':
                input_features.append(reviews)
            elif feature == 'Log_Reviews':
                input_features.append(np.log1p(reviews))
            elif feature == 'Installs':
                input_features.append(installs)
            elif feature == 'Log_Installs':
                input_features.append(np.log1p(installs))
            elif feature == 'Size_MB':
                input_features.append(size)
            elif feature == 'Price':
                input_features.append(price)
            elif feature == 'Review_Install_Ratio':
                ratio = reviews / (installs + 1) if installs > 0 else 0
                input_features.append(ratio)
            elif feature == 'Category' and label_encoders and 'Category' in label_encoders:
                try:
                    category_encoded = label_encoders['Category'].transform([category])[0]
                except (ValueError, KeyError):
                    category_encoded = 0  # Default encoding for unknown categories
                input_features.append(category_encoded)
            elif feature == 'Category_Popularity':
                # Approximate category popularity with a fixed moderate value
                input_features.append(1000)
            # Any other feature is skipped here and reported by the
            # feature-count check below.

        if not input_features:
            return "❌ No valid features provided"

        # Scale and predict
        input_array = np.array(input_features).reshape(1, -1)

        # Check if input array matches expected features
        if input_array.shape[1] != len(feature_names):
            return f"❌ Feature mismatch. Expected {len(feature_names)} features, got {input_array.shape[1]}"

        input_scaled = scaler.transform(input_array)
        prediction = model.predict(input_scaled)[0]

        # Ensure prediction is within valid range
        prediction = max(1.0, min(5.0, prediction))

        # Best-effort spread estimate across the first 10 sub-estimators.
        confidence_info = ""
        if hasattr(model, 'estimators_'):
            try:
                tree_predictions = [
                    tree.predict(input_scaled)[0] for tree in model.estimators_[:10]
                ]
                confidence_info = f" (±{np.std(tree_predictions):.2f} std)"
            except Exception:
                # Ensembles whose estimators_ entries lack predict() simply
                # get no spread estimate.
                confidence_info = ""

        # Provide interpretation
        if prediction >= 4.5:
            rating_interpretation = "🌟 Excellent app potential!"
        elif prediction >= 4.0:
            rating_interpretation = "👍 Good app potential!"
        elif prediction >= 3.5:
            rating_interpretation = "😐 Average app potential"
        else:
            rating_interpretation = "⚠️ Below average potential"

        return f"""🎯 **Predicted Rating: {prediction:.2f}⭐{confidence_info}**

{rating_interpretation}

📊 **Input Summary:**
• Reviews: {reviews:,}
• Installs: {installs:,}
• Size: {size:.1f} MB
• Price: ${price:.2f}
• Category: {category}"""

    except Exception as e:
        return f"❌ Error making prediction: {str(e)}"

# FIXED: Enhanced filtering and insights
def filter_apps_enhanced(df, categories, min_rating, min_installs=0, max_price=None):
    """Filter apps by several criteria and return the top matches with insights.

    Args:
        df: Apps DataFrame; None or empty yields an explanatory message.
        categories: A category name, a list of names, or a falsy value.
            The filter is skipped when "All" is among them or when the
            frame has no 'Category' column.
        min_rating: Minimum 'Rating' to keep; None skips the rating filter.
        min_installs: Minimum 'Installs' to keep; None or 0 skips the filter.
        max_price: Maximum 'Price' to keep; None skips the filter.

    Returns:
        tuple: (DataFrame with up to 25 top-ranked apps, insights text).
        Any error is reported in the text with an empty DataFrame.
    """
    if df is None or df.empty:
        return pd.DataFrame(), "❌ No data available. Please upload and process the apps data first."

    try:
        filtered = df.copy()
        applied_filters = []

        # Category filter: accept a single string or a list of names
        if categories:
            if isinstance(categories, str):
                categories = [categories]

            # Remove empty/None values
            categories = [cat for cat in categories if cat and str(cat).strip() != ""]

            if categories and "All" not in categories and 'Category' in df.columns:
                filtered = filtered[filtered['Category'].isin(categories)]
                applied_filters.append(f"Categories: {', '.join(map(str, categories))}")

        # Rating filter
        if min_rating is not None and 'Rating' in filtered.columns:
            filtered = filtered[filtered['Rating'] >= min_rating]
            applied_filters.append(f"Min Rating: {min_rating}")

        # Installs filter (a cleared numeric field arrives as None)
        if min_installs is None:
            min_installs = 0
        if min_installs > 0 and 'Installs' in filtered.columns:
            filtered = filtered[filtered['Installs'] >= min_installs]
            applied_filters.append(f"Min Installs: {min_installs:,}")

        # Price filter
        if max_price is not None and 'Price' in filtered.columns:
            filtered = filtered[filtered['Price'] <= max_price]
            applied_filters.append(f"Max Price: ${max_price}")

        # Pick the top 25 apps
        top_apps = pd.DataFrame()
        if len(filtered) > 0:
            display_cols = ['App', 'Category', 'Rating', 'Reviews', 'Installs', 'Price', 'Size_MB']
            available_cols = [col for col in display_cols if col in filtered.columns]
            has_installs = 'Installs' in filtered.columns
            has_rating = 'Rating' in filtered.columns

            if has_installs and has_rating:
                # Blend quality and popularity: 40% rating, 40% log-installs,
                # plus 20% log-reviews when review counts are available.
                score = filtered['Rating'] * 0.4 + np.log1p(filtered['Installs']) * 0.4
                if 'Reviews' in filtered.columns:
                    score = score + np.log1p(filtered['Reviews']) * 0.2
                # assign() ranks on a temporary column without mutating
                # `filtered`; selecting available_cols drops it again.
                ranked = filtered.assign(Ranking_Score=score)
                top_apps = ranked.nlargest(25, 'Ranking_Score')[available_cols]
            elif has_installs:
                top_apps = filtered.nlargest(25, 'Installs')[available_cols]
            elif has_rating:
                top_apps = filtered.nlargest(25, 'Rating')[available_cols]
            else:
                top_apps = filtered[available_cols].head(25)

        # Generate insights
        if len(filtered) > 0:
            insights = generate_enhanced_insights(filtered, applied_filters)
        else:
            filter_lines = "\n".join(f"• {f}" for f in applied_filters) if applied_filters else "• None"
            insights = f"""❌ **No apps found matching the criteria:**

🔍 **Applied Filters:**
{filter_lines}

💡 **Try adjusting your filters:**
• Lower the minimum rating
• Reduce minimum installs requirement
• Increase maximum price
• Select different categories"""

        return top_apps, insights

    except Exception as e:
        return pd.DataFrame(), f"❌ Error filtering data: {str(e)}"

def generate_enhanced_insights(filtered_df, applied_filters):
    """Summarise a filtered apps DataFrame as a formatted insights text.

    Args:
        filtered_df: DataFrame of apps that passed the filters. Every
            section is computed only when the columns it needs ('Rating',
            'Category', 'Reviews', 'Installs', 'Price', 'Size_MB', 'App')
            are present; otherwise it shows "N/A" or 0.
        applied_filters: Human-readable descriptions of the active filters.

    Returns:
        str: The insights report, or a message starting with "❌" when an
        unexpected error occurs.
    """
    try:
        total_apps = len(filtered_df)
        has_rows = total_apps > 0
        columns = filtered_df.columns
        has_app = 'App' in columns

        # Basic stats
        avg_rating = filtered_df['Rating'].mean() if 'Rating' in columns else 0
        rating_std = filtered_df['Rating'].std() if 'Rating' in columns else 0
        if pd.isna(rating_std):
            # The spread is undefined for a single app; show 0 instead of "nan"
            rating_std = 0.0

        # Category analysis
        top_category = "N/A"
        category_diversity = 0
        if 'Category' in columns and has_rows:
            category_counts = filtered_df['Category'].value_counts()
            top_category = category_counts.index[0] if len(category_counts) > 0 else "N/A"
            category_diversity = filtered_df['Category'].nunique()

        # App analysis
        most_reviewed = "N/A"
        most_installed = "N/A"
        highest_rated = "N/A"

        if has_app and has_rows:
            if 'Reviews' in columns and filtered_df['Reviews'].max() > 0:
                most_reviewed = filtered_df.loc[filtered_df['Reviews'].idxmax(), 'App']

            if 'Installs' in columns and filtered_df['Installs'].max() > 0:
                most_installed = filtered_df.loc[filtered_df['Installs'].idxmax(), 'App']

            if 'Rating' in columns:
                max_rating = filtered_df['Rating'].max()
                highest_rated_apps = filtered_df[filtered_df['Rating'] == max_rating]['App'].head(3).tolist()
                highest_rated = ', '.join(map(str, highest_rated_apps)) if highest_rated_apps else "N/A"

        # Price analysis
        free_apps = paid_apps = avg_price = "N/A"
        if 'Price' in columns:
            free_apps = f"{(filtered_df['Price'] == 0).sum():,}"
            paid_apps = f"{(filtered_df['Price'] > 0).sum():,}"
            paid_df = filtered_df[filtered_df['Price'] > 0]
            if len(paid_df) > 0:
                avg_price = f"${paid_df['Price'].mean():.2f}"

        # Size analysis
        avg_size = "N/A"
        if 'Size_MB' in columns:
            avg_size = f"{filtered_df['Size_MB'].mean():.1f} MB"

        # Market-insight lines
        if total_apps > 1000:
            competition_insight = "• 🔥 Highly competitive segment!"
        elif total_apps > 100:
            competition_insight = "• 📈 Moderate competition"
        else:
            competition_insight = "• 🎯 Low competition - great opportunity!"

        if avg_rating > 4.2:
            quality_insight = "• ⭐ Premium quality apps!"
        elif avg_rating > 3.5:
            quality_insight = "• 👍 Good quality apps"
        else:
            quality_insight = "• ⚠️ Quality improvement needed"

        # Market maturity is a heuristic based on the median install count
        # of the filtered apps; the thresholds are rough order-of-magnitude cuts.
        market_maturity_insight = "• ❓ Market maturity unknown (no install data)"
        if 'Installs' in columns and has_rows:
            median_installs = filtered_df['Installs'].median()
            if pd.notna(median_installs):
                if median_installs >= 1_000_000:
                    market_maturity_insight = "• 🏢 Mature market - typical app has 1M+ installs"
                elif median_installs >= 100_000:
                    market_maturity_insight = "• 📊 Growing market - typical app has 100K+ installs"
                else:
                    market_maturity_insight = "• 🌱 Emerging market - typical app has under 100K installs"

        if applied_filters:
            filters_text = "\n".join(f"• {f}" for f in applied_filters)
        else:
            filters_text = "• None (showing all data)"

        insights = f"""🎯 **Enhanced Filtered Results:**

📊 **Overview:**
• Apps found: **{total_apps:,}**
• Category diversity: **{category_diversity}** categories
• Average rating: **{avg_rating:.2f}** (±{rating_std:.2f})
• Average size: **{avg_size}**

💰 **Pricing:**
• Free apps: **{free_apps}**
• Paid apps: **{paid_apps}**
• Avg price (paid): **{avg_price}**

🏆 **Top Performers:**
• Most reviewed: **{most_reviewed}**
• Most installed: **{most_installed}**
• Highest rated: **{highest_rated}**
• Dominant category: **{top_category}**

🔍 **Applied Filters:**
{filters_text}

💡 **Market Insights:**
{competition_insight}
{quality_insight}
{market_maturity_insight}
"""

        return insights
    except Exception as e:
        return f"❌ Error generating insights: {str(e)}"

# FIXED: Main Gradio Interface
def main_interface():
    """Build the Gradio Blocks dashboard and wire up all event handlers.

    The layout has one tab per analysis area (upload/clean, category,
    rating, installs vs reviews, price, size, data quality, sentiment,
    ML prediction, app insights). Data and the trained model are carried
    between callbacks in ``gr.State`` holders.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    # Shown in the cleaning log until a cleaning run produces a real log.
    log_placeholder = "Upload and clean data to see detailed processing log..."

    def reset_category_dropdown():
        """Return a dropdown update offering only the catch-all "All" choice."""
        return gr.Dropdown(choices=["All"], value=["All"])

    with gr.Blocks(title="Enhanced Google Play Store Apps Analytics Dashboard", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 📱 Enhanced Google Play Store Apps Analytics Dashboard

        **🚀 Advanced insights with machine learning predictions and comprehensive analytics**

        Upload your dataset and discover deep patterns in app categories, ratings, installations, and more!
        """)

        # Global state variables
        processed_df = gr.State()
        processed_reviews_df = gr.State()
        trained_model = gr.State()
        model_scaler = gr.State()
        model_label_encoders = gr.State()
        model_features_state = gr.State([])

        with gr.Tab("📂 Upload & Clean"):
            with gr.Row():
                with gr.Column():
                    apps_file = gr.File(
                        label="Upload Apps CSV (googleplaystore.csv)",
                        file_types=[".csv"],
                        type="filepath"
                    )
                    upload_btn = gr.Button("📤 Upload Data", variant="primary")
                    clean_btn = gr.Button("🧹 Clean & Enhance Data", variant="secondary")

                with gr.Column():
                    reviews_file = gr.File(
                        label="Upload Reviews CSV (optional)",
                        file_types=[".csv"],
                        type="filepath"
                    )
                    process_reviews_btn = gr.Button("🔄 Process Reviews Data", variant="secondary")

            with gr.Row():
                with gr.Column():
                    data_info = gr.Textbox(
                        label="Dataset Information",
                        lines=15,
                        interactive=False,
                        value="Upload your CSV file to get started..."
                    )
                with gr.Column():
                    cleaning_log = gr.Textbox(
                        label="Data Cleaning & Enhancement Log",
                        lines=15,
                        interactive=False,
                        value=log_placeholder
                    )

            data_preview = gr.Dataframe(
                label="Data Preview (Enhanced)",
                interactive=False,
            )

        with gr.Tab("📈 Category Analysis"):
            analyze_category_btn = gr.Button("🔍 Analyze Categories", variant="primary", size="lg")
            category_plot = gr.Plot(label="Category Distribution")
            category_metrics_table = gr.Dataframe(label="Comprehensive Category Metrics")

        with gr.Tab("⭐ Rating Analysis"):
            analyze_rating_btn = gr.Button("🔍 Analyze Ratings", variant="primary", size="lg")
            rating_plot = gr.Plot(label="Rating Distribution by Install Categories")
            correlation_plot = gr.Plot(label="Enhanced Feature Correlations")

        with gr.Tab("📱 Install vs Reviews"):
            analyze_scatter_btn = gr.Button("🔍 Analyze Relationships", variant="primary", size="lg")
            scatter_plot = gr.Plot(label="Advanced Installs vs Reviews Analysis")

        with gr.Tab("💰 Price Analysis"):
            analyze_price_btn = gr.Button("🔍 Analyze Pricing", variant="primary", size="lg")
            price_plot = gr.Plot(label="Comprehensive Price Analysis")

        with gr.Tab("📦 Size & Installs"):
            analyze_size_btn = gr.Button("🔍 Analyze Size & Installs", variant="primary", size="lg")
            size_plot = gr.Plot(label="App Size Distribution with Statistics")
            installs_pie_plot = gr.Plot(label="Enhanced Installs Category Distribution")

        with gr.Tab("🩺 Data Quality"):
            analyze_quality_btn = gr.Button("🔍 Analyze Data Quality", variant="primary", size="lg")
            missing_plot = gr.Plot(label="Missing Values Analysis")
            duplicates_info = gr.Markdown()

        with gr.Tab("😊 Sentiment Analysis"):
            analyze_sentiment_btn = gr.Button("🔍 Analyze Sentiments", variant="primary", size="lg")
            sentiment_plot = gr.Plot(label="User Review Sentiments")
            sentiment_info = gr.Textbox(
                label="Sentiment Analysis Info",
                lines=5,
                interactive=False
            )

        with gr.Tab("🤖 Enhanced ML Prediction"):
            gr.Markdown("### 🚀 Advanced Rating Prediction with Multiple Models")

            with gr.Row():
                with gr.Column():
                    model_features = gr.CheckboxGroup(
                        label="Select Features for Enhanced Prediction",
                        choices=["Reviews", "Log_Reviews", "Installs", "Log_Installs", "Size_MB", "Price", "Category", "Review_Install_Ratio", "Category_Popularity"],
                        value=["Log_Reviews", "Log_Installs", "Review_Install_Ratio"],
                        interactive=True
                    )
                    train_model_btn = gr.Button("🔮 Train Enhanced Models", variant="primary", size="lg")

                with gr.Column():
                    model_results = gr.Textbox(
                        label="Enhanced Model Performance Results",
                        lines=12,
                        interactive=False,
                        placeholder="Train models to see comprehensive performance comparison..."
                    )

            with gr.Row():
                with gr.Column():
                    predict_reviews = gr.Number(label="Reviews Count", value=5000, precision=0)
                    predict_installs = gr.Number(label="Installs Count", value=100000, precision=0)
                    predict_size = gr.Number(label="Size (MB)", value=25.0)
                    predict_price = gr.Number(label="Price ($)", value=0.0)

                with gr.Column():
                    predict_category = gr.Dropdown(
                        label="App Category",
                        choices=["GAME", "COMMUNICATION", "TOOLS", "PRODUCTIVITY", "ENTERTAINMENT", "SOCIAL", "EDUCATION", "PHOTOGRAPHY", "SHOPPING", "TRAVEL_AND_LOCAL"],
                        value="GAME"
                    )
                    predict_btn = gr.Button("🎯 Predict Rating", variant="secondary", size="lg")
                    predicted_rating = gr.Textbox(
                        label="Enhanced Prediction Result",
                        interactive=False,
                        lines=8,
                        placeholder="Train model and enter values to get detailed prediction..."
                    )

            feature_importance_plot = gr.Plot(label="Enhanced Feature Importance Analysis")

        with gr.Tab("🎯 Enhanced App Insights"):
            gr.Markdown("### 🔍 Advanced Multi-Criteria App Filtering")

            with gr.Row():
                with gr.Column():
                    category_filter = gr.Dropdown(
                        label="Select Categories (Multi-select)",
                        choices=["All"],
                        value=["All"],
                        multiselect=True
                    )
                    rating_filter = gr.Slider(
                        minimum=1.0,
                        maximum=5.0,
                        value=4.0,
                        step=0.1,
                        label="Minimum Rating"
                    )

                with gr.Column():
                    installs_filter = gr.Number(
                        label="Minimum Installs",
                        value=0,
                        precision=0
                    )
                    price_filter = gr.Number(
                        label="Maximum Price ($)",
                        value=100,
                        precision=2
                    )

            filter_btn = gr.Button("🔍 Apply Enhanced Filters", variant="primary", size="lg")

            filtered_apps = gr.Dataframe(
                label="Top Filtered Apps (Enhanced Ranking)",
                interactive=False
            )

            insights_text = gr.Textbox(
                label="Comprehensive Insights & Analytics",
                lines=12,
                interactive=False
            )

        # ------------------------------------------------------------------
        # Event handler functions
        # ------------------------------------------------------------------
        def upload_data(file_path):
            """Load the apps CSV.

            Returns (info text, preview, state frame, category dropdown
            update, cleaning-log text). The log is reset to its placeholder
            because the new upload has not been cleaned yet.
            """
            if file_path is None:
                return "❌ No file uploaded", pd.DataFrame(), None, reset_category_dropdown(), log_placeholder

            try:
                df = pd.read_csv(file_path)
                if df.empty:
                    return "❌ CSV file is empty", pd.DataFrame(), None, reset_category_dropdown(), log_placeholder

                info = f"""📤 **Data Upload Successful!**
• File loaded: {len(df):,} rows, {len(df.columns)} columns
• Columns: {', '.join(df.columns.tolist())}
• Memory usage: ~{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB

🔄 **Next Step:** Click "Clean & Enhance Data" to apply advanced preprocessing and feature engineering"""

                preview_df = df.head(10)
                categories = get_category_choices(df)

                return info, preview_df, df, gr.Dropdown(choices=categories, value=["All"]), log_placeholder

            except Exception as e:
                return f"❌ Error loading file: {str(e)}", pd.DataFrame(), None, reset_category_dropdown(), log_placeholder

        def clean_data(df):
            """Run preprocessing on the uploaded frame.

            Returns (info text, preview, state frame, category dropdown
            update, cleaning-log text).
            """
            if df is None or df.empty:
                return "❌ No data to clean. Please upload a file first.", pd.DataFrame(), None, reset_category_dropdown(), "❌ No cleaning performed"

            try:
                df_processed, log = preprocess_data(df.copy())

                info = get_safe_info_string(df_processed)
                preview_df = get_safe_preview(df_processed)
                categories = get_category_choices(df_processed)
                log_text = "\n".join(log)

                return info, preview_df, df_processed, gr.Dropdown(choices=categories, value=["All"]), log_text

            except Exception as e:
                error_log = f"❌ Error during cleaning: {str(e)}"
                # Keep the uploaded frame in state so a failed cleaning run
                # does not force the user to upload the file again.
                return f"❌ Error cleaning data: {str(e)}", pd.DataFrame(), df, reset_category_dropdown(), error_log

        def process_reviews_data(file_path):
            """Load and preprocess the optional reviews CSV; returns (info text, state frame)."""
            if file_path is None:
                return "❌ No reviews file uploaded", None

            try:
                df_reviews = pd.read_csv(file_path)
                df_reviews_processed = preprocess_reviews_data(df_reviews.copy())

                unique_apps = df_reviews_processed['App'].nunique() if 'App' in df_reviews_processed.columns else 0

                info = f"""📝 **Enhanced Reviews Dataset Overview:**
• Total Reviews: {len(df_reviews_processed):,}
• Unique Apps: {unique_apps:,}
• Columns: {', '.join(df_reviews_processed.columns.tolist())}

✅ **Reviews data successfully processed!**"""

                return info, df_reviews_processed

            except Exception as e:
                return f"❌ Error processing reviews file: {str(e)}", None

        def filter_apps_handler(df, categories, min_rating, min_installs, max_price):
            """Apply the multi-criteria filter; returns (top apps frame, insights text)."""
            if df is None or df.empty:
                return pd.DataFrame(), "❌ No data available. Please upload and process the apps data first."

            return filter_apps_enhanced(df, categories, min_rating, min_installs, max_price)

        # Individual analysis functions: each returns empty outputs when no
        # data has been loaded yet.
        def analyze_categories(df):
            """Return (category distribution figure, category metrics table)."""
            if df is None or df.empty:
                return None, pd.DataFrame()
            category_fig = create_category_distribution(df)
            metrics_table = create_category_metrics(df)
            return category_fig, metrics_table

        def analyze_ratings(df):
            """Return (rating distribution figure, correlation heatmap figure)."""
            if df is None or df.empty:
                return None, None
            rating_fig = create_rating_distribution(df)
            corr_fig = create_correlation_heatmap(df)
            return rating_fig, corr_fig

        def analyze_scatter(df):
            """Return the installs-vs-reviews figure."""
            if df is None or df.empty:
                return None
            return create_installs_vs_reviews(df)

        def analyze_pricing(df):
            """Return the price analysis figure."""
            if df is None or df.empty:
                return None
            return create_price_analysis(df)

        def analyze_size_installs(df):
            """Return (size distribution figure, installs pie figure)."""
            if df is None or df.empty:
                return None, None
            size_fig = create_size_distribution(df)
            installs_fig = create_installs_pie(df)
            return size_fig, installs_fig

        def analyze_data_quality(df):
            """Return (missing-values figure, duplicates summary text)."""
            if df is None or df.empty:
                return None, "No data available"
            missing_fig = create_missing_values_plot(df)
            dup_info = get_duplicates_info(df)
            return missing_fig, dup_info

        def analyze_sentiments(df_reviews):
            """Return (sentiment figure, summary text) for the reviews data."""
            if df_reviews is None or df_reviews.empty:
                return None, "❌ No reviews data available. Please upload the reviews CSV file first."

            sentiment_fig = create_sentiment_analysis(df_reviews)

            if sentiment_fig is None:
                info = "❌ Sentiment column not found in reviews data. Please ensure your reviews CSV has a 'Sentiment' column."
            else:
                sentiment_stats = df_reviews['Sentiment'].value_counts()
                info = f"""📊 **Enhanced Sentiment Analysis Results:**
• Total Reviews Analyzed: {len(df_reviews):,}
• Positive Reviews: {sentiment_stats.get('Positive', 0):,} ({sentiment_stats.get('Positive', 0)/len(df_reviews)*100:.1f}%)
• Negative Reviews: {sentiment_stats.get('Negative', 0):,} ({sentiment_stats.get('Negative', 0)/len(df_reviews)*100:.1f}%)
• Neutral Reviews: {sentiment_stats.get('Neutral', 0):,} ({sentiment_stats.get('Neutral', 0)/len(df_reviews)*100:.1f}%)

✅ **Enhanced analysis complete!**"""

            return sentiment_fig, info

        def train_model_handler(df, selected_features):
            """Train the rating models.

            Returns (results text, model, scaler, importance figure, label
            encoders, feature list). The feature list is stored in state so
            prediction uses the same features in the same order.
            """
            if df is None or df.empty:
                return "❌ No data available. Please upload and clean data first.", None, None, None, None, []

            if not selected_features:
                return "❌ Please select at least one feature for training.", None, None, None, None, []

            results, model, scaler, importance_fig, label_encoders = train_enhanced_rating_prediction_model(df, selected_features)
            return results, model, scaler, importance_fig, label_encoders, selected_features

        def predict_rating_handler(model, scaler, label_encoders, model_features, reviews, installs, size, price, category):
            """Collect the prediction inputs and return the formatted prediction text."""
            if model is None:
                return "❌ Please train a model first by selecting features and clicking 'Train Enhanced Models'."

            if not model_features:
                return "❌ No model features available. Please train the model first."

            features_dict = {
                'reviews': reviews,
                'installs': installs,
                'size': size,
                'price': price,
                'category': category
            }

            return predict_enhanced_rating(model, scaler, label_encoders, model_features, features_dict)

        # ------------------------------------------------------------------
        # Event wiring
        # ------------------------------------------------------------------
        upload_btn.click(
            fn=upload_data,
            inputs=[apps_file],
            outputs=[data_info, data_preview, processed_df, category_filter, cleaning_log]
        )

        clean_btn.click(
            fn=clean_data,
            inputs=[processed_df],
            outputs=[data_info, data_preview, processed_df, category_filter, cleaning_log]
        )

        process_reviews_btn.click(
            fn=process_reviews_data,
            inputs=[reviews_file],
            outputs=[data_info, processed_reviews_df]
        )

        # Analysis Event Handlers
        analyze_category_btn.click(
            fn=analyze_categories,
            inputs=[processed_df],
            outputs=[category_plot, category_metrics_table]
        )

        analyze_rating_btn.click(
            fn=analyze_ratings,
            inputs=[processed_df],
            outputs=[rating_plot, correlation_plot]
        )

        analyze_scatter_btn.click(
            fn=analyze_scatter,
            inputs=[processed_df],
            outputs=[scatter_plot]
        )

        analyze_price_btn.click(
            fn=analyze_pricing,
            inputs=[processed_df],
            outputs=[price_plot]
        )

        analyze_size_btn.click(
            fn=analyze_size_installs,
            inputs=[processed_df],
            outputs=[size_plot, installs_pie_plot]
        )

        analyze_quality_btn.click(
            fn=analyze_data_quality,
            inputs=[processed_df],
            outputs=[missing_plot, duplicates_info]
        )

        analyze_sentiment_btn.click(
            fn=analyze_sentiments,
            inputs=[processed_reviews_df],
            outputs=[sentiment_plot, sentiment_info]
        )

        # Machine Learning Event Handlers
        train_model_btn.click(
            fn=train_model_handler,
            inputs=[processed_df, model_features],
            outputs=[model_results, trained_model, model_scaler, feature_importance_plot,
                    model_label_encoders, model_features_state]
        )

        predict_btn.click(
            fn=predict_rating_handler,
            inputs=[trained_model, model_scaler, model_label_encoders, model_features_state,
                   predict_reviews, predict_installs, predict_size, predict_price, predict_category],
            outputs=[predicted_rating]
        )

        # App Insights Event Handler
        filter_btn.click(
            fn=filter_apps_handler,
            inputs=[processed_df, category_filter, rating_filter, installs_filter, price_filter],
            outputs=[filtered_apps, insights_text]
        )

    return demo

def main():
    """Build the dashboard and serve it.

    The first attempt uses the preferred settings (public share link,
    all interfaces, port 7860, error display, debug mode). If that raises,
    the error is printed and a freshly built interface is launched with
    Gradio's default settings instead.
    """
    preferred_launch = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "debug": True,
    }
    try:
        dashboard = main_interface()
        dashboard.launch(**preferred_launch)
    except Exception as launch_error:
        print(f"Error launching interface: {launch_error}")
        # Fallback to basic launch
        fallback_dashboard = main_interface()
        fallback_dashboard.launch()

# Start the dashboard only when this code runs as the main module
# (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://9d8cd7cfde7f4f565f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
