### Imports

In [1]:
import pandas as pd
import gradio as gr
import spacy
from nltk.stem import PorterStemmer

# Load the spaCy pipeline once at module level; the NLP helpers in this
# notebook all reuse this single `nlp` object instead of reloading the model.
nlp = spacy.load("en_core_web_sm")
# Shared NLTK Porter stemmer instance used by stemming().
stemmer = PorterStemmer()

### Tokenization

In [2]:
def tokenize(text):
    """Run ``text`` through the shared spaCy pipeline and list its tokens.

    Returns:
        list[str]: the ``text`` attribute of every token, in document order.
    """
    tokens = []
    for tok in nlp(text):
        tokens.append(tok.text)
    return tokens

### Stop Words

In [3]:
def stop_words(text):
    """Collect the stop words that occur in ``text``.

    A token is kept when spaCy flags it as a stop word (``is_stop``) and does
    not flag it as punctuation (``is_punct``).

    Returns:
        list[str]: matching token texts, in document order (duplicates kept).
    """
    found = []
    for tok in nlp(text):
        if tok.is_punct or not tok.is_stop:
            continue
        found.append(tok.text)
    return found

### Stemming

In [4]:
def stemming(text):
    """Stem every spaCy token of ``text`` with the shared Porter stemmer.

    Returns:
        list[str]: one stem per token, in document order.
    """
    parsed = nlp(text)
    stems = []
    for tok in parsed:
        stems.append(stemmer.stem(tok.text))
    return stems

### Lemmatization

In [5]:
def lemmatization(text):
    """Return the spaCy lemma (``lemma_``) of every token in ``text``.

    Returns:
        list[str]: one lemma per token, in document order.
    """
    parsed = nlp(text)
    return list(tok.lemma_ for tok in parsed)

### Parts Of Speech Tagging

In [6]:
def pos_tagging(text):
    """Pair every token of ``text`` with its spaCy part-of-speech tag.

    Returns:
        list[tuple[str, str]]: ``(token text, token.pos_)`` pairs in document
        order.
    """
    tagged = []
    for tok in nlp(text):
        pair = (tok.text, tok.pos_)
        tagged.append(pair)
    return tagged

### Named Entity Recognition

In [7]:
def ner(text):
    """List the named entities spaCy detects in ``text``.

    Returns:
        list[tuple[str, str]]: ``(entity text, entity label_)`` pairs taken
        from ``doc.ents``, in the order spaCy yields them.
    """
    entities = nlp(text).ents
    found = []
    for span in entities:
        found.append((span.text, span.label_))
    return found

### Full Pipeline - stemming OR lemmatization

In [8]:
def run_all(text, mode):
    """Run every NLP helper on ``text`` and bundle the results in one dict.

    Args:
        text: the string to analyse.
        mode: ``"stemming"`` adds a ``"stemmed"`` entry; any other value
            (``"lemmatization"``, ``None``, ...) adds a ``"lemmatized"`` entry.

    Returns:
        dict: keys ``"tokenize"``, ``"stop_words"``, ``"pos_tagging"``,
        ``"ner"`` plus either ``"stemmed"`` or ``"lemmatized"``.
    """
    result = {}
    result["tokenize"] = tokenize(text)
    result["stop_words"] = stop_words(text)
    result["pos_tagging"] = pos_tagging(text)
    result["ner"] = ner(text)

    if mode == "stemming":
        result["stemmed"] = stemming(text)
        return result

    # Lemmatization is both the explicit choice and the fallback for any
    # unrecognised mode value.
    result["lemmatized"] = lemmatization(text)
    return result

### Visibility
On the 'Dataset' tab, the 'Stemming' / 'Lemmatization' choice is shown only when the user selects 'Full Pipeline'.

In [9]:
def toggle_visibility(nlp_choice):
    """Build a Gradio update that shows a component only for 'Full Pipeline'.

    Args:
        nlp_choice: the currently selected NLP function name.

    Returns:
        The value of ``gr.update(visible=...)``; visible is ``True`` exactly
        when ``nlp_choice`` equals ``"Full Pipeline"``.
    """
    show_mode = nlp_choice == "Full Pipeline"
    return gr.update(visible=show_mode)

### Reading Columns of Dataset

In [10]:
def get_columns(file):
    """Refresh the column selector after the uploaded file changes.

    Args:
        file: the uploaded file, either a path string or an object exposing
            the path through a ``name`` attribute; ``None`` when the upload
            was cleared.

    Returns:
        A ``gr.update(...)`` for the column selector. For a readable CSV or
        Excel file the choices are ``"All Columns"`` followed by the file's
        column names. For a missing or unsupported file the choices are reset
        to just ``"All Columns"``. The selection is always reset to
        ``"All Columns"`` so that a column picked for a previous file cannot
        linger.
    """
    # This function feeds a choice component, so every path must return a
    # component update rather than an error dict.
    if file is None:
        return gr.update(choices=["All Columns"], value="All Columns")

    # Accept both a plain path string and an object carrying `.name`.
    path = getattr(file, "name", file)
    lowered = str(path).lower()

    if lowered.endswith(".csv"):
        df = pd.read_csv(path)
    elif lowered.endswith((".xlsx", ".xls")):
        df = pd.read_excel(path)
    else:
        # Same set of supported extensions as process_file(); anything else
        # offers no per-column choices.
        return gr.update(choices=["All Columns"], value="All Columns")

    return gr.update(
        choices=["All Columns"] + df.columns.tolist(),
        value="All Columns",
    )

### File Processing

In [11]:
def process_file(file, column, nlp_choice, mode="lemmatization"):
    """Apply one NLP function to one or all columns of an uploaded file.

    Args:
        file: the uploaded file, either a path string or an object exposing
            the path through a ``name`` attribute.
        column: a column name, or ``"All Columns"`` to process every column.
        nlp_choice: ``"Full Pipeline"`` or one of ``"Tokenization"``,
            ``"Stop Words"``, ``"Stemming"``, ``"Lemmatization"``,
            ``"POS Tagging"``, ``"NER"``.
        mode: forwarded to ``run_all`` when ``nlp_choice`` is
            ``"Full Pipeline"``; ignored otherwise.

    Returns:
        list[dict]: one record per row (``DataFrame.to_dict("records")``),
        where each processed column ``c`` gains a ``"<c>_processed"`` entry.
        On invalid input a single ``{"error": message}`` dict is returned
        instead. A failure on an individual cell is reported as
        ``{"error": ...}`` in that cell only.
    """
    if not file:
        return {"error": "No file uploaded."}
    if not column:
        return {"error": "No column selected."}
    if not nlp_choice:
        return {"error": "No NLP function selected."}

    # Load file. Accept a plain path string as well as an object with `.name`,
    # and compare the extension case-insensitively.
    path = getattr(file, "name", file)
    lowered = str(path).lower()
    try:
        if lowered.endswith(".csv"):
            df = pd.read_csv(path)
        elif lowered.endswith((".xlsx", ".xls")):
            df = pd.read_excel(path)
        else:
            return {"error": "Unsupported file type."}
    except Exception as exc:
        # Report unreadable/corrupt files in the same error-dict style as the
        # other input problems instead of letting the exception escape.
        return {"error": f"Could not read file: {exc}"}

    # Determine columns, rejecting a selection that this file does not have
    # (e.g. a choice left over from a previously uploaded file).
    if column == "All Columns":
        columns = df.columns.tolist()
    elif column in df.columns:
        columns = [column]
    else:
        return {"error": f"Column '{column}' not found in file."}

    # Resolve the NLP callable once, not once per cell.
    if nlp_choice == "Full Pipeline":
        def analyse(text):
            return run_all(text, mode)
    else:
        nlp_map = {
            "Tokenization": tokenize,
            "Stop Words": stop_words,
            "Stemming": stemming,
            "Lemmatization": lemmatization,
            "POS Tagging": pos_tagging,
            "NER": ner,
        }
        analyse = nlp_map.get(nlp_choice)
        if analyse is None:
            return {"error": f"Unknown NLP function: {nlp_choice}"}

    def process_text(value):
        # Empty cells arrive as NaN/None; analyse them as empty text rather
        # than as the literal string "nan".
        text = "" if pd.isna(value) else str(value)
        try:
            return analyse(text)
        except Exception as e:
            # Best effort: one bad cell must not abort the whole file.
            return {"error": str(e)}

    for col in columns:
        df[f"{col}_processed"] = df[col].apply(process_text)

    return df.to_dict(orient="records")

### Tabs UI - Gradio

In [12]:
# "Text" section: one sub-tab per NLP helper. Each sub-tab is a standalone
# Interface whose result is rendered as JSON.
text_tabs = gr.TabbedInterface(
    [
        # Full pipeline takes the text plus the stemming/lemmatization switch.
        gr.Interface(
            fn=lambda text, mode: run_all(text, mode),
            inputs=[
                gr.Textbox(label="Text"),
                gr.Radio(["stemming", "lemmatization"]),
            ],
            outputs="json",
            title="Full Pipeline",
        ),
        # Single-function tabs: plain text in, JSON out.
        gr.Interface(
            fn=tokenize, inputs="text", outputs="json", title="Tokenization"
        ),
        gr.Interface(
            fn=stop_words, inputs="text", outputs="json", title="Stop Words"
        ),
        gr.Interface(
            fn=stemming, inputs="text", outputs="json", title="Stemming"
        ),
        gr.Interface(
            fn=lemmatization, inputs="text", outputs="json", title="Lemmatization"
        ),
        gr.Interface(
            fn=pos_tagging, inputs="text", outputs="json", title="POS_Tagging"
        ),
        gr.Interface(
            fn=ner, inputs="text", outputs="json", title="NER"
        ),
    ],
    # Tab labels, listed in the same order as the interfaces above.
    tab_names=[
        "Full Pipeline",
        "Tokenization",
        "Stop Words",
        "Stemming",
        "Lemmatization",
        "POS_Tagging",
        "NER",
    ],
)

### Block UI - Gradio

In [13]:
# "Dataset" section: upload a file, pick a column and an NLP function, then
# run it and show the processed rows as JSON.
with gr.Blocks() as dataset_tab:
    gr.Markdown("## Run NLP on Uploaded File")

    # --- inputs ---
    file_input = gr.File(label="Upload Your File", type="filepath")
    column_dropdown = gr.Radio(
        ["All Columns"],
        value="All Columns",
        label="Select Column to Process",
    )
    nlp_choice = gr.Radio(
        [
            "Full Pipeline",
            "Tokenization",
            "Stop Words",
            "POS Tagging",
            "NER",
            "Stemming",
            "Lemmatization",
        ],
        label="Select NLP Function",
    )
    # Hidden until "Full Pipeline" is selected (see toggle_visibility).
    mode = gr.Radio(["stemming", "lemmatization"], visible=False)

    # --- output and trigger ---
    output = gr.JSON(label="Processed Data")
    run_btn = gr.Button("Run NLP")

    # --- event wiring ---
    nlp_choice.change(
        fn=toggle_visibility,
        inputs=nlp_choice,
        outputs=mode,
    )
    file_input.change(
        fn=get_columns,
        inputs=file_input,
        outputs=column_dropdown,
    )
    run_btn.click(
        fn=process_file,
        inputs=[file_input, column_dropdown, nlp_choice, mode],
        outputs=output,
    )

# Top-level app: the free-text tabs next to the dataset tab.
app = gr.TabbedInterface(
    [text_tabs, dataset_tab],
    ["Text", "Dataset"],
)

### Launch

In [14]:
# Start the Gradio server for the combined "Text" / "Dataset" app.
# NOTE(review): the captured launch log suggests passing debug=True to
# surface errors when running in Colab.
app.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a7032e7d1edb3f28dd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


