# In [2]:
# predict_and_api.py

import torch
import joblib
import shutil
from fastapi import FastAPI, Request
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import uvicorn
import re, emoji, string
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# ASGI application object; this is the `app` that `uvicorn predict_and_api:app` serves.
app = FastAPI()

# ======= Clear Hugging Face Transformers Cache =======
# Best-effort removal of the Transformers download cache at import time.
# Every model in this file is loaded from an explicit local path, so failing to
# delete the cache (a locked file, or another worker removing the directory at
# the same moment) must not stop the API from starting.
# NOTE(review): recent transformers releases appear to cache under
# ~/.cache/huggingface/hub — confirm this directory is still the intended target.
hf_cache_dir = os.path.expanduser("~/.cache/huggingface/transformers")
if os.path.exists(hf_cache_dir):
    print("\n🧹 Clearing Hugging Face Transformers cache...")
    try:
        shutil.rmtree(hf_cache_dir)
    except OSError as exc:
        # Report and carry on: the cache is not needed to serve predictions.
        print(f"⚠️ Could not clear cache ({exc}); continuing startup.\n")
    else:
        print("✅ Cache cleared successfully.\n")

# ======= Preprocessing Utils =======
# Shared normalizers: preprocess() lemmatizes each token and then stems it.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# NLTK's English stop-word list minus a few negation terms, so that those
# tokens are not filtered out of the text in preprocess().
stop_words = {
    word
    for word in stopwords.words("english")
    if word not in ("no", "not", "don", "shouldn", "wasn", "mustn")
}

# Maps ASCII emoticons to the word(s) substituted for them in preprocess().
# The ":'-(" key used to be written with a stray backslash before the
# parenthesis: an invalid escape sequence (SyntaxWarning on Python 3.12+) that
# also put a literal backslash in the key, so the emoticon could never match.
emoticon_dict = {
    ":)": "smiley", ":-)": "smiley", ":D": "laugh", ":-D": "laugh",
    ":(": "sad", ":-(": "sad", ":'(": "cry", ":'-(": "cry",
    ":P": "playful", ":-P": "playful", ";)": "wink", ";-)": "wink",
    ":/": "skeptical", ":-/": "skeptical", ":|": "neutral", ":-|": "neutral",
    ":O": "surprised", ":-O": "surprised", "XD": "laugh", "<3": "love",
    ">:(": "angry", "D:": "horrified", ":-*": "kiss", ":3": "cute",
    ":-X": "sealed lips", "B-)": "cool", "O:)": "angel", ">:)": "evil smile"
}

# Compiled lazily on first use so that importing this module stays cheap.
_emoticon_regex_cache = None


def _emoticon_regex():
    """Return one compiled pattern that matches any key of ``emoticon_dict``."""
    global _emoticon_regex_cache
    if _emoticon_regex_cache is None:
        alternatives = []
        # Longest first, so ">:(" is tried before its own substring ":(".
        for emot in sorted((e for e in emoticon_dict if e), key=len, reverse=True):
            pattern = re.escape(emot)
            # Emoticons that start or end with a letter/digit ("XD", "D:", ":3")
            # must not be glued to a word; otherwise "ID:" or "10:30" would match.
            if emot[0].isalnum():
                pattern = r"(?<!\w)" + pattern
            if emot[-1].isalnum():
                pattern += r"(?!\w)"
            alternatives.append(pattern)
        # "(?!)" never matches; it stops an empty table from matching everywhere.
        _emoticon_regex_cache = re.compile("|".join(alternatives) or r"(?!)")
    return _emoticon_regex_cache


def preprocess(text: str) -> str:
    """Normalize raw text into the space-joined token string fed to the models.

    Steps, in order:
      1. URLs become "link", @mentions become "user_mention", and hashtags
         lose the "#" (underscores inside them become spaces).
      2. Emoticons from ``emoticon_dict`` are replaced by their meaning.
      3. The text is lowercased and digit runs are removed.
      4. Emoji are converted to their names; ":" and "_" are dropped/spaced.
      5. ASCII punctuation is removed and any character repeated three or
         more times is squeezed to two.
      6. Tokens are filtered against ``stop_words``, lemmatized, then stemmed.

    Emoticons are handled *before* lowercasing and digit removal. Previously
    they were matched afterwards, which made every entry containing an
    uppercase letter or a digit (":D", "XD", "<3", ":3", ...) unreachable, and
    sequential replacement let ":(" / ":)" consume ">:(", ">:)" and "O:)".

    NOTE(review): the output must mirror the preprocessing used when the models
    were trained — confirm the training pipeline handles emoticons the same way.

    Args:
        text: Raw input text.

    Returns:
        The processed tokens joined by single spaces (possibly empty).
    """
    # Rewrite URLs first so the ":/" in "http://" is not read as an emoticon.
    # IGNORECASE stands in for the lowercasing that used to precede this step.
    text = re.sub(r"http\S+", "link", text, flags=re.IGNORECASE)
    text = re.sub(r"@\w+", "user_mention", text)
    text = re.sub(r"#(\w+)", lambda m: m.group(1).replace("_", " "), text)

    # Single pass over the still-cased text; padding keeps the word separate.
    text = _emoticon_regex().sub(lambda m: f" {emoticon_dict[m.group(0)]} ", text)

    text = text.lower()
    text = re.sub(r"\d+", "", text)
    text = emoji.demojize(text).replace(":", "").replace("_", " ")
    # re.escape keeps "]", "\" and "^" literal inside the character class;
    # unescaped, "\]" was parsed as an escaped bracket and "\" was never removed.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # Squeeze elongations such as "soooo" down to "soo".
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)

    tokens = word_tokenize(text)
    return " ".join(
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in tokens
        if token not in stop_words
    )

# ======= Load Models =======
# Root directory of the saved artifacts. Defaults to the original hard-coded
# location; set SENTIMENT_MODEL_DIR to serve the same directory layout from
# another place without editing this file.
MODEL_DIR = os.environ.get("SENTIMENT_MODEL_DIR") or "C:/DCS/saved_models/saved_models"

# NOTE: joblib.load unpickles arbitrary Python objects — only point MODEL_DIR
# at trusted files. predict_sentiment() unpacks each loaded entry as
# (model, vectorizer).
traditional_models = {
    key: joblib.load(f"{MODEL_DIR}/traditional/{key}_bi.joblib")
    for key in ("nb_count", "lr_count", "svm_count")
}

# API model key -> checkpoint directory under MODEL_DIR/transformers.
# "bert-large" ("bert-large-uncased") and "roberta" ("roberta-base") are
# currently disabled; add them to this mapping to load them as well.
_TRANSFORMER_CHECKPOINTS = {
    "distilbert": "distilbert-base-uncased",
    "bert-base": "bert-base-uncased",
}

# Each entry holds the tokenizer/model pair for one checkpoint.
# local_files_only=True is passed to every from_pretrained call, as before.
transformer_models = {}
for _key, _checkpoint in _TRANSFORMER_CHECKPOINTS.items():
    _checkpoint_path = f"{MODEL_DIR}/transformers/{_checkpoint}"
    transformer_models[_key] = {
        "tokenizer": AutoTokenizer.from_pretrained(
            _checkpoint_path, local_files_only=True
        ),
        "model": AutoModelForSequenceClassification.from_pretrained(
            _checkpoint_path, local_files_only=True
        ),
    }

# ======= API Schema =======
class TextInput(BaseModel):
    """Request body accepted by the /predict endpoint."""

    # Raw text to classify; the endpoint passes it through preprocess() first.
    text: str
    # Key selecting the model; looked up in traditional_models, then in
    # transformer_models.
    model: str  # e.g., "nb_count", "lr_count", "svm_count", "distilbert", "bert-base"

# ======= Prediction Endpoint =======
def _sentiment_label(pred):
    """Map a predicted class id to the response label (class 1 is positive)."""
    return "positive" if pred == 1 else "negative"


@app.post("/predict")
def predict_sentiment(input_data: TextInput):
    """Classify the sentiment of ``input_data.text`` with the requested model.

    Args:
        input_data: Request body carrying the raw text and the model key.

    Returns:
        ``{"sentiment": "positive" | "negative"}`` on success. For an unknown
        model key, a 404 response whose body is
        ``{"error": "Model not found or not supported."}``. The body is the
        same as before; only the status code changed (it used to be 200), so
        clients can detect the failure from the status line.
    """
    model_key = input_data.model
    is_traditional = model_key in traditional_models

    # Validate the key first so an invalid request costs no preprocessing.
    if not is_traditional and model_key not in transformer_models:
        from fastapi.responses import JSONResponse

        return JSONResponse(
            status_code=404,
            content={"error": "Model not found or not supported."},
        )

    text = preprocess(input_data.text)

    if is_traditional:
        # Each traditional artifact is unpacked as a (model, vectorizer) pair.
        model, vectorizer = traditional_models[model_key]
        features = vectorizer.transform([text])
        return {"sentiment": _sentiment_label(model.predict(features)[0])}

    entry = transformer_models[model_key]
    tokenizer = entry["tokenizer"]
    model = entry["model"]
    model.eval()  # inference mode for layers such as dropout
    with torch.no_grad():  # no gradients are needed for prediction
        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, padding=True, max_length=128
        )
        logits = model(**inputs).logits
        pred = torch.argmax(logits, dim=1).item()
    return {"sentiment": _sentiment_label(pred)}

# To run this server:
# uvicorn predict_and_api:app --reload
