In [2]:
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import io
import base64
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, roc_auc_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Fetch the NLTK resources the preprocessor relies on. 'punkt_tab' is the
# tokenizer data that word_tokenize loads on recent NLTK releases; older
# releases use 'punkt' and may report 'punkt_tab' as unavailable.
# Failures are reported instead of silently swallowed so that a later
# LookupError from stopwords/word_tokenize can be traced back to its cause.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    try:
        # nltk.download returns False (rather than raising) on most failures
        if not nltk.download(_resource, quiet=True):
            print(f"Warning: NLTK resource '{_resource}' could not be downloaded "
                  f"(it may be unavailable for this NLTK version or offline).")
    except Exception as exc:
        print(f"Warning: NLTK download of '{_resource}' failed: {exc}")

In [5]:
class TextPreprocessor:
    """Normalize raw sentences before they are vectorized.

    The pipeline lower-cases the text, strips every character that is not a
    word character, whitespace, ``?``, ``!`` or ``.``, removes English stop
    words (question words are kept), and lemmatizes the remaining tokens.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        # Preserve important question words
        self.question_words = {'what', 'how', 'why', 'when', 'where', 'who', 'which', 'whose', 'whom'}

    def clean_text(self, text):
        """Lower-case ``text`` and strip unwanted punctuation and extra whitespace.

        Args:
            text: Raw sentence. Anything that is not a non-empty ``str``
                (``None``, ``""``, a pandas NaN, ...) is treated as empty.

        Returns:
            The cleaned string, or ``""`` for empty / non-string input.
        """
        if not isinstance(text, str) or not text:
            return ""
        text = text.lower()
        text = re.sub(r'[^\w\s\?\!\.]', '', text)
        # Collapse whitespace only after punctuation is gone, otherwise
        # "a , b" would leave a double space behind.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def remove_stopwords(self, text):
        """Drop English stop words from ``text`` while keeping question words.

        Returns:
            The surviving tokens joined by single spaces.
        """
        tokens = word_tokenize(text)
        filtered_tokens = [word for word in tokens
                           if word not in self.stop_words or word in self.question_words]
        return ' '.join(filtered_tokens)

    def lemmatize_text(self, text):
        """Lemmatize every token of ``text`` and re-join them with spaces."""
        tokens = word_tokenize(text)
        lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        return ' '.join(lemmatized_tokens)

    def preprocess(self, text):
        """Run the full pipeline: clean, remove stop words, lemmatize."""
        text = self.clean_text(text)
        text = self.remove_stopwords(text)
        text = self.lemmatize_text(text)
        return text

## MODEL TRAINING FUNCTIONS

In [6]:
# Registry of fitted models, keyed by "<vectorizer name>_<classifier name>";
# filled by train_models() and read by the prediction/plot/comparison helpers.
trained_models = {}
# Shared text preprocessor used for both training and inference.
preprocessor = TextPreprocessor()
# Lazily created training DataFrame; stays None until the first train_models() call.
sample_data = None

def create_sample_dataset():
    """Build the built-in demo corpus of questions and statements.

    Returns:
        A DataFrame with a ``text`` column and a ``label`` column, where
        label 1 marks a question and label 0 marks a statement. Rows
        alternate question, statement, question, ...
    """
    # Each tuple is (question, statement); the pairs are flattened in order.
    qa_pairs = [
        ("What is the capital of France?",
         "The capital of France is Paris."),
        ("How do we solve this equation?",
         "Machine learning is a subset of artificial intelligence."),
        ("Can you explain the concept of neural networks?",
         "Neural networks are inspired by biological neurons."),
        ("What are the main components of a computer?",
         "A computer consists of hardware and software components."),
        ("Why is data preprocessing important?",
         "Data preprocessing helps improve model performance."),
        ("How does gradient descent work?",
         "Gradient descent is an optimization algorithm."),
        ("What is the difference between supervised and unsupervised learning?",
         "Supervised learning uses labeled data for training."),
        ("When should we use cross-validation?",
         "Cross-validation helps assess model generalization."),
        ("What is overfitting in machine learning?",
         "Overfitting occurs when a model learns training data too well."),
        ("How can we prevent overfitting?",
         "Regularization techniques can help prevent overfitting."),
        ("What is the purpose of feature scaling?",
         "Feature scaling normalizes the range of features."),
        ("Why do we split data into training and testing sets?",
         "Data splitting helps evaluate model performance on unseen data."),
        ("What is the bias-variance tradeoff?",
         "The bias-variance tradeoff balances model complexity and performance."),
        ("Could you clarify this concept?",
         "This concept is fundamental to understanding AI."),
        ("Is there a better approach to this problem?",
         "There are multiple approaches to solve this problem."),
        ("Which algorithm performs better?",
         "Algorithm performance depends on the specific use case."),
        ("How do you implement this in Python?",
         "Python implementation requires several libraries."),
        ("What happens if we increase the learning rate?",
         "Increasing the learning rate may cause convergence issues."),
    ]

    texts = [sentence for pair in qa_pairs for sentence in pair]
    labels = [1, 0] * len(qa_pairs)
    return pd.DataFrame({'text': texts, 'label': labels})

In [7]:
def train_models(vectorizer_type, classifier_type, progress=gr.Progress()):
    """Train one vectorizer/classifier combination and register it.

    The vectorizer is fitted on the training split only, and the
    cross-validation score is computed with a vectorizer+classifier pipeline
    on the raw training texts, so neither the hold-out metrics nor the CV
    folds see vocabulary/IDF statistics from data they are evaluated on.

    Args:
        vectorizer_type: One of 'TF-IDF Unigram', 'TF-IDF Bigram',
            'Bag of Words', 'Character N-grams'.
        classifier_type: One of 'Logistic Regression', 'Random Forest',
            'SVM Linear', 'Naive Bayes'.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        A ``(summary_text, model_key)`` tuple. ``model_key`` is
        ``"<vectorizer_type>_<classifier_type>"`` and indexes the entry
        stored in the module-level ``trained_models`` dict.

    Raises:
        KeyError: If either name is not one of the supported choices.
    """
    # Local import keeps this function self-contained; sklearn is already a
    # dependency of the notebook.
    from sklearn.pipeline import make_pipeline

    global trained_models, sample_data

    progress(0, desc="Loading data...")

    # Build the demo dataset once and reuse it for later training runs
    if sample_data is None:
        sample_data = create_sample_dataset()

    df = sample_data.copy()

    progress(0.2, desc="Preprocessing text...")

    df['text_processed'] = df['text'].apply(preprocessor.preprocess)

    progress(0.4, desc="Creating vectorizer...")

    # Factories, so that only the selected estimators are instantiated and a
    # fresh, unfitted copy is available for the cross-validation pipeline.
    vectorizer_factories = {
        'TF-IDF Unigram': lambda: TfidfVectorizer(max_features=1000, ngram_range=(1, 1)),
        'TF-IDF Bigram': lambda: TfidfVectorizer(max_features=1000, ngram_range=(1, 2)),
        'Bag of Words': lambda: CountVectorizer(max_features=1000, ngram_range=(1, 1)),
        'Character N-grams': lambda: TfidfVectorizer(analyzer='char', ngram_range=(2, 4), max_features=1000)
    }

    classifier_factories = {
        'Logistic Regression': lambda: LogisticRegression(random_state=42, max_iter=1000),
        'Random Forest': lambda: RandomForestClassifier(n_estimators=100, random_state=42),
        'SVM Linear': lambda: SVC(kernel='linear', probability=True, random_state=42),
        'Naive Bayes': lambda: MultinomialNB()
    }

    make_vectorizer = vectorizer_factories[vectorizer_type]
    make_classifier = classifier_factories[classifier_type]
    vectorizer = make_vectorizer()
    classifier = make_classifier()

    # Split the texts BEFORE vectorizing so the test rows cannot influence
    # the learned vocabulary. Same seed/stratification as before, so the
    # rows that end up in each split are unchanged.
    texts_train, texts_test, y_train, y_test = train_test_split(
        df['text_processed'], df['label'],
        test_size=0.3, random_state=42, stratify=df['label']
    )

    progress(0.6, desc="Extracting features...")

    X_train = vectorizer.fit_transform(texts_train)
    X_test = vectorizer.transform(texts_test)

    progress(0.8, desc="Training model...")

    classifier.fit(X_train, y_train)

    # Hold-out predictions; column 1 of predict_proba is the "question" class
    y_pred = classifier.predict(X_test)
    y_pred_proba = classifier.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Cross-validate the whole pipeline so each fold refits its own vectorizer
    cv_pipeline = make_pipeline(make_vectorizer(), make_classifier())
    cv_scores = cross_val_score(cv_pipeline, texts_train, y_train, cv=5, scoring='roc_auc')

    progress(1.0, desc="Saving model...")

    # Store everything the prediction and plotting helpers need
    model_key = f"{vectorizer_type}_{classifier_type}"
    trained_models[model_key] = {
        'vectorizer': vectorizer,
        'classifier': classifier,
        'roc_auc': roc_auc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'y_test': y_test,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    # With output_dict=True the per-class entries are keyed by str(label)
    report = classification_report(y_test, y_pred, output_dict=True)

    return (
        f"Model trained successfully!\n\n"
        f"Configuration: {classifier_type} with {vectorizer_type}\n"
        f"ROC AUC: {roc_auc:.4f}\n"
        f"Cross-validation: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})\n"
        f"Accuracy: {report['accuracy']:.4f}\n"
        f"Precision (Questions): {report['1']['precision']:.4f}\n"
        f"Recall (Questions): {report['1']['recall']:.4f}\n"
        f"F1-Score (Questions): {report['1']['f1-score']:.4f}",
        model_key
    )

In [8]:
def predict_single_text(text, model_key):
    """Predict whether a single text is a question.

    Args:
        text: Sentence to classify. ``None``, empty, or whitespace-only
            input yields a prompt message instead of a prediction.
        model_key: Key of a model previously stored in ``trained_models``.

    Returns:
        A newline-separated ``**label:** value`` report (original text,
        processed text, prediction, confidence and class probabilities),
        or a short message when the text or the model is missing.
    """
    # `not text` also covers None, which would otherwise crash on .strip()
    if not text or not text.strip():
        return "Please enter some text to analyze."

    if not model_key or model_key not in trained_models:
        return "Please train a model first."

    model_info = trained_models[model_key]

    # Same preprocessing as at training time
    processed_text = preprocessor.preprocess(text)

    text_vector = model_info['vectorizer'].transform([processed_text])

    prediction = model_info['classifier'].predict(text_vector)[0]
    # Index 0 is the statement class, index 1 the question class
    probabilities = model_info['classifier'].predict_proba(text_vector)[0]

    result = {
        'Original Text': text,
        'Processed Text': processed_text,
        'Prediction': 'Question' if prediction == 1 else 'Statement',
        'Confidence': f"{max(probabilities):.1%}",
        'Question Probability': f"{probabilities[1]:.1%}",
        'Statement Probability': f"{probabilities[0]:.1%}"
    }

    return "\n".join(f"**{key}:** {value}" for key, value in result.items())


In [9]:
def analyze_text_batch(text_batch, model_key):
    """Classify several sentences at once (one sentence per line).

    Args:
        text_batch: Newline-separated sentences; blank lines are ignored.
            ``None``, empty, or whitespace-only input yields a prompt message.
        model_key: Key of a model previously stored in ``trained_models``.

    Returns:
        A text report with a summary (counts and shares of questions and
        statements) followed by the prediction and confidence for each
        sentence, or a short message when the text or the model is missing.
    """
    # `not text_batch` also covers None, which would otherwise crash on .strip()
    if not text_batch or not text_batch.strip():
        return "Please enter text to analyze."

    if not model_key or model_key not in trained_models:
        return "Please train a model first."

    # The guard above guarantees at least one non-blank line here
    sentences = [line.strip() for line in text_batch.split('\n') if line.strip()]

    model_info = trained_models[model_key]

    # Vectorize and predict the whole batch in one call instead of per sentence
    processed = [preprocessor.preprocess(sentence) for sentence in sentences]
    features = model_info['vectorizer'].transform(processed)
    predictions = model_info['classifier'].predict(features)
    probabilities = model_info['classifier'].predict_proba(features)

    results = []
    for position, (sentence, prediction, probs) in enumerate(
            zip(sentences, predictions, probabilities), 1):
        results.append({
            'Sentence': position,
            'Text': sentence,
            'Prediction': 'Question' if prediction == 1 else 'Statement',
            'Confidence': f"{max(probs):.1%}",
            'Question_Prob': probs[1]
        })

    # Summary statistics
    total_sentences = len(results)
    questions = sum(1 for r in results if r['Prediction'] == 'Question')
    statements = total_sentences - questions

    summary = (
        f"**Analysis Summary:**\n"
        f"Total Sentences: {total_sentences}\n"
        f"Questions: {questions} ({questions/total_sentences:.1%})\n"
        f"Statements: {statements} ({statements/total_sentences:.1%})\n\n"
    )

    detailed = "**Detailed Results:**\n" + "".join(
        f"{r['Sentence']}. {r['Text']}\n"
        f"   → {r['Prediction']} ({r['Confidence']})\n\n"
        for r in results
    )

    return summary + detailed


In [10]:
def create_roc_plot(model_key):
    """Build a Plotly ROC curve for a stored model.

    Args:
        model_key: Key of an entry in ``trained_models``.

    Returns:
        A ``plotly.graph_objects.Figure`` with the model's ROC curve and the
        chance diagonal, or ``None`` when the key is empty or unknown.
    """
    if not model_key or model_key not in trained_models:
        return None

    stored = trained_models[model_key]
    auc_value = stored['roc_auc']
    false_pos_rate, true_pos_rate, _ = roc_curve(stored['y_test'], stored['y_pred_proba'])

    # Curve of the selected model
    model_trace = go.Scatter(
        x=false_pos_rate,
        y=true_pos_rate,
        mode='lines',
        name=f'ROC Curve (AUC = {auc_value:.3f})',
        line=dict(color='blue', width=2),
    )
    # Reference line of a classifier that guesses at random
    chance_trace = go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        name='Random Classifier',
        line=dict(color='red', dash='dash'),
    )

    figure = go.Figure()
    for trace in (model_trace, chance_trace):
        figure.add_trace(trace)

    pretty_name = model_key.replace("_", " with ")
    figure.update_layout(
        title=f'ROC Curve - {pretty_name}',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        showlegend=True,
        width=600,
        height=500,
    )
    return figure

In [11]:
def create_confusion_matrix_plot(model_key):
    """Build a Plotly confusion-matrix heatmap for a stored model.

    Args:
        model_key: Key of an entry in ``trained_models``.

    Returns:
        A Plotly figure showing actual vs. predicted counts for the
        Statement (0) and Question (1) classes, or ``None`` when the key is
        empty or unknown.
    """
    if not model_key or model_key not in trained_models:
        return None

    model_info = trained_models[model_key]

    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded axis names below, even if a class is absent from the data.
    cm = confusion_matrix(model_info['y_test'], model_info['y_pred'], labels=[0, 1])

    class_names = ['Statement', 'Question']
    fig = px.imshow(
        cm,
        labels=dict(x="Predicted", y="Actual", color="Count"),
        x=class_names,
        y=class_names,
        color_continuous_scale='Blues',
        title=f'Confusion Matrix - {model_key.replace("_", " with ")}'
    )

    # Write the count into each cell; use white text on the darker cells
    n_rows, n_cols = cm.shape
    for i in range(n_rows):
        for j in range(n_cols):
            fig.add_annotation(
                x=j, y=i,
                text=str(cm[i][j]),
                showarrow=False,
                font=dict(color="white" if cm[i][j] > cm.max()/2 else "black", size=16)
            )

    fig.update_layout(width=500, height=400)

    return fig

In [12]:
def compare_all_models():
    """Summarize every model stored in ``trained_models``.

    Returns:
        A text report listing each model's ROC AUC and cross-validation
        score, ordered by ROC AUC from highest to lowest, or a short message
        when no model has been trained yet.
    """
    if not trained_models:
        return "No models trained yet. Please train some models first."

    def _to_record(key, info):
        # Keys have the form "<vectorizer>_<classifier>"
        vec_name, clf_name = key.split('_', 1)
        return {
            'Model': key.replace('_', ' + '),
            'Vectorizer': vec_name,
            'Classifier': clf_name,
            'ROC_AUC': info['roc_auc'],
            'CV_Mean': info['cv_mean'],
            'CV_Std': info['cv_std'],
        }

    ranking = pd.DataFrame(
        [_to_record(key, info) for key, info in trained_models.items()]
    ).sort_values('ROC_AUC', ascending=False)

    sections = [
        f"**{entry['Model']}**\n"
        f"ROC AUC: {entry['ROC_AUC']:.4f}\n"
        f"CV Score: {entry['CV_Mean']:.4f} (±{entry['CV_Std']:.4f})\n\n"
        for entry in ranking.to_dict('records')
    ]

    return "**Model Performance Comparison:**\n\n" + "".join(sections)

In [13]:
def upload_and_analyze_file(file, model_key):
    """Analyze an uploaded text file, one sentence per line.

    Args:
        file: The upload. Raw ``bytes`` are decoded as UTF-8 (a leading BOM
            is dropped). A path, or an object whose ``name`` attribute is a
            path to an existing file, is read from disk. Any other value is
            converted with ``str()`` and treated as the content itself.
        model_key: Key of a model previously stored in ``trained_models``.

    Returns:
        The report produced by ``analyze_text_batch``, or a short message
        when the file or the model is missing, the file has no non-blank
        lines, or reading/decoding fails.
    """
    import os  # local import: only needed to resolve path-style uploads

    if file is None:
        return "Please upload a text file."

    if not model_key or model_key not in trained_models:
        return "Please train a model first."

    try:
        if isinstance(file, (bytes, bytearray)):
            # 'utf-8-sig' behaves like 'utf-8' but strips a leading BOM
            content = bytes(file).decode('utf-8-sig')
        else:
            # Some upload modes hand over a path or a temp-file wrapper
            # instead of bytes; read the file rather than classifying its path.
            if isinstance(file, (str, os.PathLike)):
                candidate_path = file
            else:
                candidate_path = getattr(file, 'name', None)

            if isinstance(candidate_path, (str, os.PathLike)) and os.path.isfile(candidate_path):
                with open(candidate_path, encoding='utf-8-sig') as handle:
                    content = handle.read()
            else:
                content = str(file)

        # Split into sentences, dropping blank lines
        sentences = [s.strip() for s in content.split('\n') if s.strip()]

        if not sentences:
            return "No valid sentences found in the file."

        # Analyze with batch function
        return analyze_text_batch('\n'.join(sentences), model_key)

    except Exception as e:
        return f"Error processing file: {str(e)}"


In [15]:
def create_gradio_interface():
    """Create the main Gradio interface.

    Builds a tabbed ``gr.Blocks`` app: model training, single-text, batch
    and file analysis, visualization, model comparison and a help tab.
    After each training run the model dropdowns on the other tabs are
    refreshed from ``trained_models``; the refresh is chained to the
    training event so it runs only once training has finished.

    Returns:
        The assembled ``gr.Blocks`` object (not yet launched).
    """

    with gr.Blocks(title="Question Classification System", theme=gr.themes.Soft()) as interface:

        gr.Markdown("""
        # 🎓 Question Classification System
        ### EdTech ML Pipeline for Lecture Transcript Analysis
        
        This system helps identify questions within lecture transcripts using various machine learning approaches.
        """)

        # Model training section
        with gr.Tab("🔧 Model Training"):
            gr.Markdown("### Train Classification Models")
            gr.Markdown("Select a vectorizer and classifier combination to train a new model.")

            with gr.Row():
                with gr.Column():
                    vectorizer_choice = gr.Dropdown(
                        choices=['TF-IDF Unigram', 'TF-IDF Bigram', 'Bag of Words', 'Character N-grams'],
                        label="Vectorization Method",
                        value='TF-IDF Unigram'
                    )
                    classifier_choice = gr.Dropdown(
                        choices=['Logistic Regression', 'Random Forest', 'SVM Linear', 'Naive Bayes'],
                        label="Classification Algorithm",
                        value='Logistic Regression'
                    )
                    train_btn = gr.Button("🚀 Train Model", variant="primary")

                with gr.Column():
                    training_output = gr.Textbox(
                        label="Training Results",
                        lines=10,
                        interactive=False
                    )
                    # Hidden holder for the key of the most recently trained model
                    current_model = gr.Textbox(
                        label="Current Model Key",
                        visible=False
                    )

            # Keep the event so the dropdown refresh can be chained after it
            train_event = train_btn.click(
                fn=train_models,
                inputs=[vectorizer_choice, classifier_choice],
                outputs=[training_output, current_model]
            )

        # Single prediction section
        with gr.Tab("🔍 Single Text Analysis"):
            gr.Markdown("### Analyze Individual Sentences")
            gr.Markdown("Enter a sentence to determine if it's a question or statement.")

            with gr.Row():
                with gr.Column():
                    single_text_input = gr.Textbox(
                        label="Enter text to analyze",
                        placeholder="What is machine learning?",
                        lines=3
                    )
                    model_selector1 = gr.Dropdown(
                        label="Select Trained Model",
                        choices=[],
                        interactive=True
                    )
                    analyze_single_btn = gr.Button("🔍 Analyze Text", variant="primary")

                with gr.Column():
                    single_analysis_output = gr.Textbox(
                        label="Analysis Results",
                        lines=8,
                        interactive=False
                    )

            analyze_single_btn.click(
                fn=predict_single_text,
                inputs=[single_text_input, model_selector1],
                outputs=single_analysis_output
            )

        # Batch analysis section
        with gr.Tab("📄 Batch Text Analysis"):
            gr.Markdown("### Analyze Multiple Sentences")
            gr.Markdown("Enter multiple sentences (one per line) for batch analysis.")

            with gr.Row():
                with gr.Column():
                    batch_text_input = gr.Textbox(
                        label="Enter sentences (one per line)",
                        placeholder="What is AI?\nAI is artificial intelligence.\nHow does it work?",
                        lines=10
                    )
                    model_selector2 = gr.Dropdown(
                        label="Select Trained Model",
                        choices=[],
                        interactive=True
                    )
                    analyze_batch_btn = gr.Button("📊 Analyze Batch", variant="primary")

                with gr.Column():
                    batch_analysis_output = gr.Textbox(
                        label="Batch Analysis Results",
                        lines=15,
                        interactive=False
                    )

            analyze_batch_btn.click(
                fn=analyze_text_batch,
                inputs=[batch_text_input, model_selector2],
                outputs=batch_analysis_output
            )

        # File upload section
        with gr.Tab("📁 File Analysis"):
            gr.Markdown("### Upload and Analyze Text Files")
            gr.Markdown("Upload a text file containing sentences to analyze.")

            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(
                        label="Upload Text File",
                        file_types=[".txt"],
                        type="binary"
                    )
                    model_selector3 = gr.Dropdown(
                        label="Select Trained Model",
                        choices=[],
                        interactive=True
                    )
                    analyze_file_btn = gr.Button("📁 Analyze File", variant="primary")

                with gr.Column():
                    file_analysis_output = gr.Textbox(
                        label="File Analysis Results",
                        lines=15,
                        interactive=False
                    )

            analyze_file_btn.click(
                fn=upload_and_analyze_file,
                inputs=[file_upload, model_selector3],
                outputs=file_analysis_output
            )

        # Visualization section
        with gr.Tab("📈 Model Visualization"):
            gr.Markdown("### Model Performance Visualization")

            with gr.Row():
                model_selector4 = gr.Dropdown(
                    label="Select Model for Visualization",
                    choices=[],
                    interactive=True
                )
                viz_btn = gr.Button("📈 Generate Plots", variant="primary")

            with gr.Row():
                with gr.Column():
                    roc_plot = gr.Plot(label="ROC Curve")
                with gr.Column():
                    confusion_plot = gr.Plot(label="Confusion Matrix")

            viz_btn.click(
                fn=lambda model_key: (create_roc_plot(model_key), create_confusion_matrix_plot(model_key)),
                inputs=model_selector4,
                outputs=[roc_plot, confusion_plot]
            )

        # Model comparison section
        with gr.Tab("⚖️ Model Comparison"):
            gr.Markdown("### Compare All Trained Models")

            compare_btn = gr.Button("⚖️ Compare Models", variant="primary")
            comparison_output = gr.Textbox(
                label="Model Comparison Results",
                lines=15,
                interactive=False
            )

            compare_btn.click(
                fn=compare_all_models,
                outputs=comparison_output
            )

        model_selectors = [model_selector1, model_selector2, model_selector3, model_selector4]

        def update_model_choices(latest_key=None):
            """Refresh every model dropdown and pre-select the newest model."""
            choices = list(trained_models.keys())
            # gr.update works across Gradio 3.x and 4.x;
            # the per-component Dropdown.update was removed in 4.x.
            if latest_key in choices:
                return tuple(gr.update(choices=choices, value=latest_key) for _ in model_selectors)
            return tuple(gr.update(choices=choices) for _ in model_selectors)

        # Chain the refresh AFTER training: a second independent click handler
        # could run before the new model has been stored.
        train_event.then(
            fn=update_model_choices,
            inputs=current_model,
            outputs=model_selectors
        )

        # Example section
        with gr.Tab("💡 Examples & Help"):
            gr.Markdown("""
            ### How to use this app

            1. **Model Training** – choose a vectorizer and a classifier, then click *Train Model*.
            2. **Single Text / Batch Text / File Analysis** – select a trained model and submit text (one sentence per line for batch and file input).
            3. **Model Visualization** – plot the ROC curve and confusion matrix of a trained model.
            4. **Model Comparison** – list every model trained in this session, ordered by ROC AUC.

            ### Example inputs

            - `What is machine learning?` (a question)
            - `Machine learning is a subset of artificial intelligence.` (a statement)
            """)

    return interface

In [None]:
# Entry point: builds the UI and serves it. When this cell is executed in a
# notebook kernel __name__ is "__main__" as well, so the app launches there too.
# NOTE(review): share=True together with server_name="0.0.0.0" makes the app
# (including model training and file upload) reachable from outside this
# machine without authentication — confirm this is intended; launch_local()
# below is the loopback-only alternative.
if __name__ == "__main__":
    # Create and launch the interface
    demo = create_gradio_interface()
    
    # Launch with sharing enabled for external access
    demo.launch(
        share=True,  # Creates a public link
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,  # Default Gradio port
        show_api=True,  # Show API documentation
        favicon_path=None,
        ssl_verify=False
    )

# Loopback-only alternative for local development
def launch_local():
    """Build the interface and serve it on 127.0.0.1:7860 without a public share link."""
    local_options = {
        "share": False,
        "server_name": "127.0.0.1",
        "server_port": 7860,
        "show_api": True,
    }
    app = create_gradio_interface()
    app.launch(**local_options)

# Instructions for deployment
"""
DEPLOYMENT INSTRUCTIONS:
=======================

1. Local Development:
   - Run: python gradio_ui.py
   - Access: http://localhost:7860

2. Install Required Packages:
   pip install gradio pandas numpy matplotlib seaborn plotly scikit-learn nltk

3. For Production Deployment:
   - Use launch_local() for local-only access
   - Use demo.launch(share=True) for public sharing
   - Consider using Hugging Face Spaces for cloud deployment

4. Features Available:
   - Interactive model training
   - Real-time text classification
   - Batch processing
   - File upload and analysis
   - Performance visualization
   - Model comparison

5. Customization:
   - Modify create_sample_dataset() to load your own dataset (a DataFrame with 'text' and 'label' columns)
   - Add more vectorizers or classifiers
   - Customize the UI theme and layout
   - Add additional evaluation metrics
"""