In [None]:
import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox
import re # Keep re for robustness, but prioritize NLTK tokenizer

# --- NLTK Imports Re-enabled for requested features (Stemming, Lemmatization, POS) ---
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# --- Sklearn Imports Re-enabled for Vectorization ---
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

# --- NLTK Data Downloads ---
# Ensure the NLTK data this script needs is available: the helper below checks
# for each resource locally and downloads any that are missing.
def _nltk_resource_installed(resource_path):
    """Return True if ``resource_path`` (or its zipped form) is on the NLTK data path."""
    # Some packages are kept as an un-extracted ``<name>.zip``; accept either form.
    for candidate in (resource_path, f"{resource_path}.zip"):
        try:
            nltk.data.find(candidate)
        except LookupError:
            continue
        return True
    return False


def ensure_nltk_resources():
    """Make sure the NLTK data used by this script is installed locally.

    Each required resource is looked up on the NLTK data path and downloaded
    when it is missing. If a download fails, the failure is printed and a
    warning dialog tells the user how to install the resource manually.
    The function never raises on a failed download; it is best-effort.
    """
    # Resources live under different category directories of the NLTK data
    # path. Looking everything up under 'tokenizers/' only ever finds 'punkt',
    # which made the corpora and the tagger re-download on every start.
    # NOTE(review): newer NLTK releases may additionally need 'punkt_tab' and
    # 'averaged_perceptron_tagger_eng' — confirm against the installed version.
    required_resources = {
        "punkt": "tokenizers/punkt",
        "stopwords": "corpora/stopwords",
        "wordnet": "corpora/wordnet",
        "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    }

    for resource, resource_path in required_resources.items():
        if _nltk_resource_installed(resource_path):
            continue

        print(f"Downloading missing NLTK data: {resource}...")

        # nltk.download() signals failure through its boolean return value
        # rather than by raising; OSError is caught only as a safety net for
        # network/filesystem errors that might still propagate.
        try:
            downloaded = nltk.download(resource, quiet=True)
            failure_detail = "download reported failure"
        except OSError as exc:
            downloaded = False
            failure_detail = str(exc)

        if downloaded:
            continue

        print(f"Error downloading {resource}: {failure_detail}")
        # Tell the user that the resource is needed and how to get it.
        messagebox.showwarning(
            "NLTK Data Missing",
            f"Required NLTK resource '{resource}' could not be automatically downloaded. Please ensure you have network access or run 'import nltk; nltk.download(\"{resource}\")' manually."
        )

# Execute the check/download once when the script starts, before any pipeline
# code needs the data. This is a module-level side effect: depending on what is
# missing it may print progress, download data, or show a warning dialog.
ensure_nltk_resources()


# --- Core NLP Pipeline Logic ---

def execute_nlp_pipeline(input_text, steps_config):
    """Run the selected NLP steps over ``input_text`` and collect every stage's output.

    Steps run in a fixed order: tokenization (always), lowercasing,
    stopword/non-alphabetic filtering, lemmatization, stemming, POS tagging,
    then Bag-of-Words and TF-IDF vectorization. Each enabled token step feeds
    its output into the next one.

    Args:
        input_text: Raw text to analyse.
        steps_config: Mapping of step flag to bool. Recognised flags are
            ``use_lowercase``, ``filter_stopwords``, ``use_lemmatization``,
            ``use_stemming``, ``compute_pos``, ``compute_bow`` and
            ``compute_tfidf``. A missing flag is treated as disabled.

    Returns:
        A dict keyed by numbered stage labels (``"01_Initial Tokens"``,
        ``"02_Lowercase Tokens"``, ...) holding the output of each stage that
        ran. Token stages hold lists of strings, ``"06_POS Tags"`` holds
        ``(word, tag)`` pairs, and the vector stages hold a one-row matrix
        as a list of lists.

    Side effects:
        Shows a message box when lemmatizer/tagger data is missing or when
        vectors were requested but no usable tokens remain.
    """

    def enabled(step):
        # Unknown/missing flags count as "off" instead of raising KeyError.
        return bool(steps_config.get(step, False))

    # 1. Initial Tokenization
    current_tokens = word_tokenize(input_text)
    pipeline_results = {"01_Initial Tokens": current_tokens}

    # 2. Lowercasing (Part of "Tokens and Clean")
    if enabled("use_lowercase"):
        current_tokens = [token.lower() for token in current_tokens]
        pipeline_results["02_Lowercase Tokens"] = current_tokens

    # 3. Stopword Removal (Part of "Tokens and Clean")
    if enabled("filter_stopwords"):
        english_stops = set(stopwords.words("english"))
        # Drop non-alphabetic tokens and stopwords. The stopword list is
        # lowercase, so compare case-insensitively; otherwise "The" survives
        # whenever the lowercase step is switched off.
        current_tokens = [
            word for word in current_tokens
            if word.isalpha() and word.lower() not in english_stops
        ]
        pipeline_results["03_Filtered Tokens"] = current_tokens

    # 4. Lemmatization
    if enabled("use_lemmatization"):
        # No POS hints are passed, so this uses the lemmatizer's default behaviour.
        lemmatizer = WordNetLemmatizer()
        try:
            lemmatized_tokens = [lemmatizer.lemmatize(word) for word in current_tokens]
        except LookupError:
            # WordNet data is missing: report it and carry on with the
            # un-lemmatized tokens instead of aborting the whole run.
            messagebox.showerror("NLTK Error", "WordNet data is missing. Lemmatization failed.")
            pipeline_results["04_Lemmatized Tokens"] = ["Error: WordNet data missing."]
        else:
            pipeline_results["04_Lemmatized Tokens"] = lemmatized_tokens
            current_tokens = lemmatized_tokens

    # 5. Stemming
    if enabled("use_stemming"):
        stemmer = PorterStemmer()
        current_tokens = [stemmer.stem(word) for word in current_tokens]
        pipeline_results["05_Stemmed Tokens"] = current_tokens

    # 6. Part-of-Speech (POS) Tagging
    if enabled("compute_pos"):
        try:
            pipeline_results["06_POS Tags"] = pos_tag(current_tokens)
        except LookupError:
            messagebox.showerror("NLTK Error", "POS Tagger data is missing. POS Tagging failed.")
            pipeline_results["06_POS Tags"] = ["Error: POS Tagger data missing."]

    # --- Vectorization Steps ---

    wants_vectors = enabled("compute_bow") or enabled("compute_tfidf")

    # The vectorizers re-tokenize with \b\w+\b, so they need at least one word
    # character. This covers both "no tokens left" (e.g. text was "The a an")
    # and "only punctuation left" (e.g. "!!!"), which would otherwise make
    # scikit-learn raise an empty-vocabulary ValueError and lose every result.
    has_word_tokens = any(re.search(r"\w", token) for token in current_tokens)
    if not has_word_tokens:
        if wants_vectors:
            messagebox.showwarning("Empty Tokens", "No tokens remain after cleaning; cannot compute vectors.")
        return pipeline_results

    # The vectorizers expect a list of documents; we have exactly one.
    processed_doc = " ".join(current_tokens)

    # 7. Bag of Words (BoW)
    if enabled("compute_bow"):
        # lowercase=False: casing was already decided by the lowercase step,
        # so the vectorizer must not silently lowercase again.
        vectorizer_bow = CountVectorizer(token_pattern=r'\b\w+\b', lowercase=False)
        X_bow = vectorizer_bow.fit_transform([processed_doc])

        pipeline_results["07_BoW Vocabulary"] = vectorizer_bow.get_feature_names_out().tolist()
        pipeline_results["08_BoW Vector (Counts)"] = X_bow.toarray().tolist()

    # 8. TF-IDF
    if enabled("compute_tfidf"):
        # NOTE(review): with a single document every term shares the same IDF,
        # so these scores are effectively normalised term frequencies.
        vectorizer_tfidf = TfidfVectorizer(token_pattern=r'\b\w+\b', lowercase=False)
        X_tfidf = vectorizer_tfidf.fit_transform([processed_doc])

        pipeline_results["09_TF-IDF Vocabulary"] = vectorizer_tfidf.get_feature_names_out().tolist()
        # .tolist() yields plain Python floats; round them for cleaner display.
        pipeline_results["10_TF-IDF Vector (Scores)"] = [
            [round(score, 4) for score in row]
            for row in X_tfidf.toarray().tolist()
        ]

    return pipeline_results


# --- Tkinter GUI Implementation ---

class NlpApp(tk.Tk):
    """Main window: text input, step checkboxes, a run button and a results pane."""

    # Checkbox captions, keyed by the pipeline flag each checkbox controls.
    _DISPLAY_NAMES = {
        "use_lowercase": "Tokens & Clean: Lowercase",
        "filter_stopwords": "Tokens & Clean: Stopword Removal",
        "use_stemming": "Stemming (Porter)",
        "use_lemmatization": "Lemmatization",
        "compute_pos": "POS Tagging (Part-of-Speech)",
        "compute_bow": "BoW (Frequency Vector)",
        "compute_tfidf": "TF-IDF (Score Vector)",
    }

    # Number of checkboxes laid out per row in the options grid.
    _OPTIONS_PER_ROW = 3

    def __init__(self):
        """Create the window, configure ttk styles and build all widgets."""
        super().__init__()
        self.title("Advanced NLP Feature Explorer")
        self.geometry("700x750")
        self.configure(bg='#e8f0f8')

        # Shared look for the ttk widgets.
        style = ttk.Style(self)
        style.theme_use('clam')
        style.configure('TFrame', background='#e8f0f8')
        style.configure('TLabel', background='#e8f0f8', font=('Helvetica', 10))
        style.configure('TCheckbutton', background='#e8f0f8', font=('Helvetica', 10))
        style.configure('TButton', font=('Helvetica', 10, 'bold'), background='#007bff', foreground='white')
        style.map('TButton', background=[('active', '#0056b3')])

        # One BooleanVar per pipeline flag; the keys are passed straight to
        # execute_nlp_pipeline as its steps_config.
        self.option_vars = {
            "use_lowercase": tk.BooleanVar(value=True),
            "filter_stopwords": tk.BooleanVar(value=True),
            "use_stemming": tk.BooleanVar(value=False),
            "use_lemmatization": tk.BooleanVar(value=False),
            "compute_pos": tk.BooleanVar(value=False),
            "compute_bow": tk.BooleanVar(value=False),
            "compute_tfidf": tk.BooleanVar(value=False),
        }

        self.create_widgets()

    def create_widgets(self):
        """Build the title bar, input box, option checkboxes, button and output box."""
        # Title
        title_label = tk.Label(
            self,
            text="Advanced NLP Feature Explorer",
            font=("Helvetica", 18, "bold"),
            bg="#007bff",
            fg="white",
            pady=10
        )
        title_label.pack(fill='x', pady=(10, 5), padx=10)

        main_frame = ttk.Frame(self, padding="10 10 10 10")
        main_frame.pack(pady=10, padx=10, fill='both', expand=True)

        # 1. Input Section
        input_frame = ttk.LabelFrame(main_frame, text="Input Text", padding="10")
        input_frame.pack(fill='x', pady=5)

        self.text_input = scrolledtext.ScrolledText(
            input_frame,
            wrap=tk.WORD,
            width=80,
            height=6,
            font=("Arial", 10),
            padx=5, pady=5,
            bd=1, relief=tk.SUNKEN
        )
        # Pre-fill with a sample sentence so the tool can be tried immediately.
        self.text_input.insert(tk.END, "The large computers are rapidly changing the world of data science.")
        self.text_input.pack(fill='x', expand=True)

        # 2. Options Section
        options_frame = ttk.LabelFrame(main_frame, text="Select Linguistic Processing Steps (Requires NLTK & Scikit-learn)", padding="10")
        options_frame.pack(fill='x', pady=10)

        # Lay the checkboxes out left-to-right, wrapping after each full row.
        for position, (key, var) in enumerate(self.option_vars.items()):
            row_idx, col_idx = divmod(position, self._OPTIONS_PER_ROW)
            ttk.Checkbutton(
                options_frame,
                text=self._DISPLAY_NAMES.get(key, key),
                variable=var,
                style='TCheckbutton'
            ).grid(row=row_idx, column=col_idx, padx=15, pady=5, sticky="w")

        # 3. Process Button
        process_button = ttk.Button(
            main_frame,
            text="Execute Linguistic Analysis",
            command=self.run_analysis,
            style='TButton'
        )
        process_button.pack(pady=10)

        # 4. Output Section
        output_frame = ttk.LabelFrame(main_frame, text="Analysis Results", padding="10")
        output_frame.pack(fill='both', expand=True, pady=5)

        self.output_text = scrolledtext.ScrolledText(
            output_frame,
            wrap=tk.WORD,
            width=80,
            height=15,
            font=("Consolas", 9),
            padx=5, pady=5,
            bd=1, relief=tk.SUNKEN
        )
        self.output_text.pack(fill='both', expand=True)
        # The section-header tag is configured once here rather than on every render.
        self.output_text.tag_config('header', font=('Consolas', 10, 'bold'), foreground='#0056b3')
        self.output_text.insert(tk.END, "Results will appear here after execution.\n\nNote: This tool uses NLTK and scikit-learn features.")

    def run_analysis(self):
        """Read the input and selected options, run the pipeline and show the results.

        Shows an error dialog for empty input, a warning when no step is
        selected, and an error dialog if the pipeline or rendering raises.
        """
        input_text = self.text_input.get("1.0", tk.END).strip()

        if not input_text:
            messagebox.showerror("Input Error", "Please enter text to analyze.")
            return

        steps_config = {key: var.get() for key, var in self.option_vars.items()}

        if not any(steps_config.values()):
            messagebox.showwarning("Selection Warning", "Please select at least one preprocessing step.")
            return

        # Top-level UI boundary: report any failure to the user instead of
        # letting it disappear inside the Tk callback machinery.
        try:
            results = execute_nlp_pipeline(input_text, steps_config)
            self.display_results(results)
        except Exception as e:
            messagebox.showerror("Processing Error", f"An error occurred during text processing: {e}")
            print(f"Error details: {e}")

    @staticmethod
    def _format_result(key, value):
        """Return the display text for one pipeline stage (without its header)."""
        if not isinstance(value, list):
            return f"  {str(value)}\n\n"

        # Vector/Matrix outputs (BoW and TF-IDF) are non-empty lists of rows.
        # The non-empty check matters: all() over an empty list is True, which
        # used to make empty token lists render as a matrix.
        if value and all(isinstance(row, list) for row in value):
            lines = ["Vector (Document 1):\n"]
            lines.extend(f"  [ {', '.join(str(item) for item in row)} ]\n" for row in value)
            lines.append("\n")
            return "".join(lines)

        # POS tags are (word, tag) pairs. The pipeline stores a plain error
        # string instead when the tagger data is missing, so only unpack when
        # every item really is a pair.
        is_pair_list = all(isinstance(item, tuple) and len(item) == 2 for item in value)
        if key == "06_POS Tags" and is_pair_list:
            pos_text = ' '.join(f"({word}, {tag})" for word, tag in value)
            return f"  {pos_text}\n\n"

        # Token and vocabulary lists.
        return f"  {', '.join(map(str, value))}\n\n"

    def display_results(self, results):
        """Replace the output pane's content with a formatted view of ``results``.

        Args:
            results: Mapping of numbered stage label (e.g. ``"01_Initial Tokens"``)
                to that stage's output, as returned by the pipeline.
        """
        self.output_text.delete("1.0", tk.END)

        # Labels carry a zero-padded numeric prefix, so a plain sort yields
        # pipeline order.
        for key in sorted(results):
            # Strip the numeric prefix for display.
            clean_key = key.split('_', 1)[-1].replace('_', ' ')
            self.output_text.insert(tk.END, f"--- {clean_key} ---\n", 'header')
            self.output_text.insert(tk.END, self._format_result(key, results[key]))

if __name__ == '__main__':
    # Build the main window and hand control to Tk's event loop.
    NlpApp().mainloop()

Downloading missing NLTK data: stopwords...
Downloading missing NLTK data: wordnet...
Downloading missing NLTK data: averaged_perceptron_tagger...
