# Setup (lightweight)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(42)

# Tiny sentiment dataset (no downloads)

In [2]:
# Small handcrafted dataset so we avoid downloading datasets
texts = [
    "I love this project, it works great!",
    "This is the best thing I have used.",
    "Amazing experience, very helpful.",
    "I am happy with the results.",
    "Terrible, it keeps failing.",
    "I hate this, waste of time.",
    "Very disappointing and buggy.",
    "This is the worst experience ever.",
    "Not bad, could be better.",
    "Okay-ish, acceptable result.",
    "I am not satisfied.",
    "I am satisfied."
]

# 1 = positive, 0 = negative (keep it simple)
labels = [1,1,1,1,0,0,0,0,1,1,0,1]

df_sent = pd.DataFrame({"text": texts, "label": labels})
df_sent

Unnamed: 0,text,label
0,"I love this project, it works great!",1
1,This is the best thing I have used.,1
2,"Amazing experience, very helpful.",1
3,I am happy with the results.,1
4,"Terrible, it keeps failing.",0
5,"I hate this, waste of time.",0
6,Very disappointing and buggy.,0
7,This is the worst experience ever.,0
8,"Not bad, could be better.",1
9,"Okay-ish, acceptable result.",1


# Offline baseline (TF-IDF + Logistic Regression) + metrics ✅

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    df_sent["text"], df_sent["label"], test_size=0.3, random_state=42, stratify=df_sent["label"]
)

tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=1)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec  = tfidf.transform(X_test)

clf = LogisticRegression(max_iter=2000)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

print("Offline baseline metrics (TF-IDF + LogisticRegression)")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
cm

Offline baseline metrics (TF-IDF + LogisticRegression)
Accuracy: 0.5
F1: 0.6666666666666666
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      1.00      0.67         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


array([[0, 2],
       [0, 2]])

# NLP Task 1 (LLM-powered): Sentiment via Transformers pipeline

## HuggingFace pipeline

In [6]:
# If this cell fails due to slow internet, that's OK — you still have offline baseline.
!pip -q install transformers sentencepiece accelerate

from transformers import pipeline

# small + stable sentiment model
sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

tests = [
    "I really love this!",
    "This is horrible and disappointing.",
    "It's okay, not great."
]
sentiment(tests)

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998812675476074},
 {'label': 'NEGATIVE', 'score': 0.9998063445091248},
 {'label': 'NEGATIVE', 'score': 0.9766001105308533}]

## Compare pipeline predictions vs labels (simple metric)

In [7]:
# Run pipeline on our small dataset, map POSITIVE/NEGATIVE to 1/0
pred_labels = []
for t in df_sent["text"].tolist():
    out = sentiment(t)[0]["label"]
    pred_labels.append(1 if out.upper().startswith("POS") else 0)

acc = accuracy_score(df_sent["label"], pred_labels)
f1  = f1_score(df_sent["label"], pred_labels)

print("Transformers pipeline (tiny) on our dataset")
print("Accuracy:", acc)
print("F1:", f1)

Transformers pipeline (tiny) on our dataset
Accuracy: 1.0
F1: 1.0


# NLP Task 2 (LLM-powered): Question Answering over a short context

## QA pipeline (tiny model)

In [13]:
from transformers import pipeline

qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

context = """
ThromAI is a smart waste and drainage monitoring pilot project in Thimphu.
It uses photo verification, basic machine learning models, and dashboards.
The pilot duration is nine months, starting 1 February 2026 and ending 31 October 2026.
"""
qa(question="What city is the pilot in?", context=context)

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/102 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'score': 0.9868605141364242, 'start': 67, 'end': 74, 'answer': 'Thimphu'}

## Prompt experimentation (same info, different phrasing)

In [14]:
questions = [
    "When does the pilot start?",
    "What is the start date of the pilot?",
    "What is the pilot duration?",
    "When does the pilot end?"
]

for q in questions:
    ans = qa(question=q, context=context)
    print(q, "->", ans["answer"], "(score:", round(ans["score"], 3), ")")

When does the pilot start? -> 1 February 2026 (score: 0.969 )
What is the start date of the pilot? -> 1 February 2026 (score: 0.96 )
What is the pilot duration? -> nine months (score: 0.97 )
When does the pilot end? -> 31 October 2026 (score: 0.948 )


This counts as “prompt experimentation” because question phrasing changes model behavior/confidence.

## Simple QA evaluation (lightweight)

QA evaluation (Exact Match)

In [15]:
gold = [
    ("What city is the pilot in?", "Thimphu"),
    ("When does the pilot start?", "1 February 2026"),
    ("When does the pilot end?", "31 October 2026"),
    ("How long is the pilot?", "nine months")
]

def normalize(s):
    return " ".join(str(s).lower().strip().split())

em = []
for q, expected in gold:
    pred = qa(question=q, context=context)["answer"]
    ok = normalize(expected) in normalize(pred) or normalize(pred) in normalize(expected)
    em.append(1 if ok else 0)
    print(f"Q: {q}\nPred: {pred}\nExpected: {expected}\nEM: {ok}\n")

print("Exact Match (approx):", sum(em)/len(em))

Q: What city is the pilot in?
Pred: Thimphu
Expected: Thimphu
EM: True

Q: When does the pilot start?
Pred: 1 February 2026
Expected: 1 February 2026
EM: True

Q: When does the pilot end?
Pred: 31 October 2026
Expected: 31 October 2026
EM: True

Q: How long is the pilot?
Pred: nine months
Expected: nine months
EM: True

Exact Match (approx): 1.0


Now you have “evaluation metric” for QA too.

# Save artifacts for GitHub

Save results (CSV)

In [16]:
import os, json

os.makedirs("artifacts_nlp", exist_ok=True)

# Save offline baseline report
offline_report = {
    "offline_accuracy": float(accuracy_score(y_test, y_pred)),
    "offline_f1": float(f1_score(y_test, y_pred)),
}

with open("artifacts_nlp/offline_baseline_metrics.json", "w") as f:
    json.dump(offline_report, f, indent=2)

# Save a simple table of QA prompt tests
qa_rows = []
for q in questions:
    out = qa(question=q, context=context)
    qa_rows.append({"question": q, "answer": out["answer"], "score": float(out["score"])})

pd.DataFrame(qa_rows).to_csv("artifacts_nlp/qa_prompt_tests.csv", index=False)

print("Saved -> artifacts_nlp/")

Saved -> artifacts_nlp/


In [12]:
!zip -r artifacts_nlp.zip artifacts_nlp

  adding: artifacts_nlp/ (stored 0%)
  adding: artifacts_nlp/qa_prompt_tests.csv (deflated 44%)
  adding: artifacts_nlp/offline_baseline_metrics.json (deflated 38%)
