<a href="https://colab.research.google.com/github/AJaysAIk/AI-Telegram-chatbot--Clarity-x1/blob/main/NLP_Enginner_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy datasets fastapi uvicorn pyngrok

In [None]:
from datasets import load_dataset

# Pull the CoNLL-2003 corpus through the `datasets` loader
dataset = load_dataset("conll2003")

# Bind each named split to its own variable for the cells below
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [None]:
import spacy
from spacy.training import Example

def convert_to_spacy_format(data, label_names=None):
    """Convert token/tag records into spaCy-style ``(text, annotations)`` tuples.

    Tokens of each record are joined with single spaces. Runs of ``B-X``
    followed by ``I-X`` tags are merged into one character-offset span
    labelled ``X`` (prefix stripped), so the output describes whole entities
    rather than one prefixed span per token.

    Args:
        data: Iterable of records exposing ``"tokens"`` (list of str) and
            ``"ner_tags"`` (list of int tag ids).
        label_names: Optional sequence mapping a tag id to its tag string
            (e.g. ``"O"``, ``"B-PER"``). When omitted, the names are read from
            ``data.features`` if present, otherwise from the module-level
            ``train_data``.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per record, with ``end`` exclusive.
    """
    if label_names is None:
        # Resolve the tag vocabulary once instead of once per token.
        schema_source = data if hasattr(data, "features") else train_data
        label_names = schema_source.features["ner_tags"].feature.names

    spacy_data = []
    for item in data:
        text = " ".join(item["tokens"])
        entities = []
        open_span = None  # entity being built, as [start, end, label]
        start = 0
        for token, tag in zip(item["tokens"], item["ner_tags"]):
            end = start + len(token)
            tag_name = label_names[tag]
            is_begin = tag_name.startswith("B-")
            is_inside = tag_name.startswith("I-")
            label = tag_name[2:]
            if is_inside and open_span is not None and open_span[2] == label:
                # Continuation of the entity that is currently open.
                open_span[1] = end
            else:
                if open_span is not None:
                    entities.append(tuple(open_span))
                    open_span = None
                if is_begin or is_inside:
                    # An "I-" tag with no matching open entity starts a new
                    # one, so its token is not silently dropped.
                    open_span = [start, end, label]
            start = end + 1  # skip the single space inserted by the join
        if open_span is not None:
            entities.append(tuple(open_span))
        spacy_data.append((text, {"entities": entities}))
    return spacy_data

# Run every split through the converter, in train / validation / test order
train_spacy, valid_spacy, test_spacy = map(
    convert_to_spacy_format, (train_data, valid_data, test_data)
)

In [None]:
!pip install spacy datasets

from datasets import load_dataset
import spacy
import random
from spacy.util import minibatch
from spacy.training import Example
from spacy.scorer import Scorer

# Fetch CoNLL-2003, opting in to the dataset's remote loading script
dataset = load_dataset("conll2003", trust_remote_code=True)

# One variable per split, named after its role
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Preprocessing
def convert_to_spacy_format(data, label_names=None):
    """Convert token/tag records into spaCy-style ``(text, annotations)`` tuples.

    Tokens of each record are joined with single spaces and BIO tags are
    collapsed into character-offset spans: a ``B-X`` tag opens an entity of
    type ``X`` and following ``I-X`` tags extend it.

    Args:
        data: Iterable of records exposing ``"tokens"`` (list of str) and
            ``"ner_tags"`` (list of int tag ids).
        label_names: Optional sequence mapping a tag id to its tag string
            (e.g. ``"O"``, ``"B-PER"``). When omitted, the names are read from
            ``data.features`` if present, otherwise from the module-level
            ``train_data``.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per record, with ``end`` exclusive.
    """
    if label_names is None:
        # Look the tag vocabulary up once rather than for every token.
        schema_source = data if hasattr(data, "features") else train_data
        label_names = schema_source.features["ner_tags"].feature.names

    spacy_data = []
    for item in data:
        text = " ".join(item["tokens"])
        entities = []
        start = 0
        current_entity = None  # open span as (start, end, label)
        for token, tag in zip(item["tokens"], item["ner_tags"]):
            end = start + len(token)
            label = label_names[tag]
            entity_type = label[2:]
            continues_current = (
                label.startswith("I-")
                and current_entity is not None
                and current_entity[2] == entity_type
            )
            if continues_current:
                current_entity = (current_entity[0], end, entity_type)
            else:
                if current_entity is not None:
                    entities.append(current_entity)
                    current_entity = None
                if label.startswith(("B-", "I-")):
                    # "B-" always opens a span; an "I-" that has nothing to
                    # continue also opens one instead of being discarded.
                    current_entity = (start, end, entity_type)
            start = end + 1  # account for the joining space
        if current_entity is not None:
            entities.append(current_entity)
        spacy_data.append((text, {"entities": entities}))
    return spacy_data

# Convert the three splits in train / validation / test order
train_spacy, valid_spacy, test_spacy = map(
    convert_to_spacy_format, (train_data, valid_data, test_data)
)

# Training
N_EPOCHS = 30    # full passes over the training examples
BATCH_SIZE = 8   # examples handed to each nlp.update call
random.seed(0)   # makes the per-epoch shuffle order repeatable

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
# Entity types are the tag names without their "B-"/"I-" prefix; "O" has no type.
labels = set(label[2:] for label in train_data.features["ner_tags"].feature.names if label != "O")
# Register in sorted order so the label order does not depend on set iteration.
for label in sorted(labels):
    ner.add_label(label)

examples = [Example.from_dict(nlp.make_doc(text), annot) for text, annot in train_spacy]
# Restrict training to the NER component (select_pipes is the spaCy v3 name
# for the deprecated disable_pipes).
with nlp.select_pipes(enable="ner"):
    optimizer = nlp.initialize()
    for epoch in range(N_EPOCHS):
        random.shuffle(examples)
        losses = {}  # fresh dict so the printed loss covers this epoch only
        for batch in minibatch(examples, size=BATCH_SIZE):
            nlp.update(batch, sgd=optimizer, losses=losses)
        print(f"Epoch {epoch}, Losses: {losses}")

nlp.to_disk("/content/ner_model")

# Debug predictions: reload the pipeline from disk and inspect one sentence
nlp = spacy.load("/content/ner_model")
text = "John lives in New York"
doc = nlp(text)
print("Sample Predictions:")
for ent in doc.ents:
    # one line per recognised entity: surface text, then its label
    print(f"{ent.text} {ent.label_}")

# Evaluation: score the reloaded pipeline on the first 100 converted test sentences
scorer = Scorer()
# Pair each predicted doc with its gold annotations
examples = [Example.from_dict(nlp(text), annot) for text, annot in test_spacy[:100]]

scores = scorer.score(examples)
for metric_name, score_key in (
    ("Precision", "ents_p"),
    ("Recall", "ents_r"),
    ("F1-Score", "ents_f"),
):
    print(f"{metric_name}: {scores[score_key]:.2f}")

Epoch 0, Losses: {'ner': 13388.059176656068}
Epoch 1, Losses: {'ner': 5787.428180922152}
Epoch 2, Losses: {'ner': 4035.1305129183015}
Epoch 3, Losses: {'ner': 3146.1011637135603}
Epoch 4, Losses: {'ner': 2526.496547476645}
Epoch 5, Losses: {'ner': 2117.071306945934}
Epoch 6, Losses: {'ner': 1953.8799516727697}
Epoch 7, Losses: {'ner': 1665.241062070864}
Epoch 8, Losses: {'ner': 1395.0527904438675}
Epoch 9, Losses: {'ner': 1264.217857148425}
Epoch 10, Losses: {'ner': 1190.2713842074074}
Epoch 11, Losses: {'ner': 1130.5566890687883}
Epoch 12, Losses: {'ner': 944.6374352266769}
Epoch 13, Losses: {'ner': 996.6170158904142}
Epoch 14, Losses: {'ner': 863.4668836630956}
Epoch 15, Losses: {'ner': 913.6614795557617}
Epoch 16, Losses: {'ner': 830.9698171600661}
Epoch 17, Losses: {'ner': 811.3245761300665}


In [13]:
from fastapi import FastAPI, HTTPException, Depends
from fastapi.security import HTTPBasic, HTTPBasicCredentials
from pydantic import BaseModel
import spacy

# Load the NER pipeline that the training cell wrote to disk
nlp = spacy.load("/content/ner_model")

# HTTP Basic scheme used by the credential check, and the application object
security = HTTPBasic()
app = FastAPI()

# Request body schema for the /predict endpoint
class TextInput(BaseModel):
    # Text to run entity recognition on; the /predict handler answers
    # HTTP 400 when this is an empty string.
    text: str

# Authentication function
def verify_credentials(credentials: HTTPBasicCredentials = Depends(security)):
    """Check HTTP Basic credentials for protected endpoints.

    The expected username and password are read from the ``API_USERNAME`` and
    ``API_PASSWORD`` environment variables. When those are unset the previous
    hard-coded pair (``admin`` / ``password``) is used so existing callers keep
    working; set the variables for anything reachable from the internet.

    Args:
        credentials: Username/password parsed from the request by the
            ``security`` HTTP Basic dependency.

    Returns:
        ``True`` when both username and password match.

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Basic`` header when
            either value does not match.
    """
    import os
    import secrets

    expected_username = os.environ.get("API_USERNAME", "admin").encode("utf8")
    expected_password = os.environ.get("API_PASSWORD", "password").encode("utf8")

    # compare_digest runs in constant time, so response timing does not leak
    # how much of a guess was correct. Both checks are always evaluated.
    username_ok = secrets.compare_digest(credentials.username.encode("utf8"), expected_username)
    password_ok = secrets.compare_digest(credentials.password.encode("utf8"), expected_password)

    if not (username_ok and password_ok):
        raise HTTPException(
            status_code=401,
            detail="Unauthorized",
            # Tells clients (and browsers) which scheme to retry with.
            headers={"WWW-Authenticate": "Basic"},
        )
    return True

@app.get("/hello")
async def read_hello():
    # Unauthenticated smoke-test route that always returns the same greeting.
    greeting = {"message": "Hello, world!"}
    return greeting


# Prediction endpoint (requires valid credentials)
@app.post("/predict")
def predict(input: TextInput, auth: bool = Depends(verify_credentials)):
    # Runs the loaded pipeline over the submitted text and reports each
    # recognised entity with its label; an empty string is rejected with 400.
    text = input.text
    if text:
        entities = []
        for ent in nlp(text).ents:
            entities.append({"text": ent.text, "label": ent.label_})
        return {"entities": entities}
    raise HTTPException(status_code=400, detail="Text input required")

In [19]:
import uvicorn
from pyngrok import ngrok
import threading

# Supply the ngrok authtoken at run time instead of storing it in the notebook:
# it is read from the NGROK_AUTHTOKEN environment variable, with an interactive
# prompt as fallback. NOTE(review): the token previously committed on this line
# should be treated as leaked and revoked in the ngrok dashboard.
import os
from getpass import getpass

ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

# Define your app if you haven't already (make sure `app` is defined)
# For example:
# from fastapi import FastAPI
# app = FastAPI()

# Run Uvicorn in a separate thread
def run():
    # Serve the FastAPI app on every interface at port 8005; the same port
    # number is passed to ngrok.connect further down.
    server_options = {"host": "0.0.0.0", "port": 8005}
    uvicorn.run(app, **server_options)

# Start the server in a background thread so this cell can continue
thread = threading.Thread(target=run)
thread.start()

# Open an ngrok tunnel to the port Uvicorn listens on (8005, see run())
public_url = ngrok.connect(8005)
# ngrok.connect returns a tunnel object; print its URL rather than its repr
print(f"Public URL: {public_url.public_url}")


INFO:     Started server process [8823]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8005 (Press CTRL+C to quit)


Public URL: NgrokTunnel: "https://3788-34-106-144-35.ngrok-free.app" -> "http://localhost:8005"


In [18]:
from pyngrok import ngrok
# Shut ngrok down via pyngrok (presumably this also closes the tunnel opened
# by the serving cell — confirm against the pyngrok docs).
# NOTE(review): this cell's recorded execution count (18) is lower than the
# serving cell's (19), so it was last run before the tunnel was opened.
ngrok.kill()
