<a href="https://colab.research.google.com/github/Deviprasanna-17/audio_analysis_infysp_group1/blob/main/NLP_Text_Classification_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

## Install required libraries

In [None]:
!pip install -q scikit-learn gensim fastapi uvicorn joblib datasets transformers torch


In [None]:
## Imports

In [None]:
import numpy as np
import pandas as pd
import re
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin


In [None]:
## Load dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("imdb")

# The IMDB train split is stored grouped by label, so a plain `[:10000]` slice
# returns reviews of a single class and none of the classifiers below can be
# trained or scored meaningfully. Shuffle with a fixed seed first so the
# 10,000-row sample contains both classes and is reproducible across runs.
train_sample = dataset["train"].shuffle(seed=42).select(range(10000))

df = pd.DataFrame(train_sample[:])
texts = df["text"].tolist()
labels = df["label"].tolist()

# Fail fast here rather than deep inside a model fit if the sample is degenerate.
assert set(labels) == {0, 1}, "training sample must contain both sentiment classes"


In [None]:
## Train-test split

In [None]:
# Hold out 20% of the reviews for evaluation. `stratify=labels` keeps the
# positive/negative ratio the same in both partitions, so accuracy and F1 on
# the test set are not skewed by an unlucky draw; `random_state` fixes the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)


In [None]:
## CountVectorizer model

In [None]:
# Baseline: raw term counts fed into a logistic-regression classifier.
count_steps = [
    ("vectorizer", CountVectorizer()),
    ("clf", LogisticRegression(max_iter=1000)),
]
count_pipeline = Pipeline(steps=count_steps)

# fit() returns the fitted pipeline, so training and inference can be chained.
pred = count_pipeline.fit(X_train, y_train).predict(X_test)

# Report each metric on the held-out reviews, in the same order as before.
for caption, scorer in (
    ("CountVectorizer Accuracy:", accuracy_score),
    ("CountVectorizer F1:", f1_score),
):
    print(caption, scorer(y_test, pred))


In [None]:
## TF-IDF MODEL

In [None]:
# TF-IDF weighted unigrams fed into a logistic-regression classifier.
tfidf_steps = [
    ("vectorizer", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=1000)),
]
tfidf_pipeline = Pipeline(steps=tfidf_steps)

# fit() returns the fitted pipeline, so training and inference can be chained.
pred = tfidf_pipeline.fit(X_train, y_train).predict(X_test)

# Report each metric on the held-out reviews, in the same order as before.
for caption, scorer in (
    ("TF-IDF Accuracy:", accuracy_score),
    ("TF-IDF F1:", f1_score),
):
    print(caption, scorer(y_test, pred))


In [None]:
## TF-IDF WITH N-GRAMS

In [None]:
# TF-IDF over unigrams, bigrams and trigrams fed into logistic regression.
ngram_steps = [
    ("vectorizer", TfidfVectorizer(ngram_range=(1, 3))),
    ("clf", LogisticRegression(max_iter=1000)),
]
ngram_pipeline = Pipeline(steps=ngram_steps)

# fit() returns the fitted pipeline, so training and inference can be chained.
pred = ngram_pipeline.fit(X_train, y_train).predict(X_test)

# Report each metric on the held-out reviews, in the same order as before.
for caption, scorer in (
    ("N-gram TF-IDF Accuracy:", accuracy_score),
    ("N-gram TF-IDF F1:", f1_score),
):
    print(caption, scorer(y_test, pred))


In [None]:
## WORD2VEC TRANSFORMER

In [None]:
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    """Turn raw documents into dense vectors by averaging Word2Vec embeddings.

    ``fit`` trains a gensim ``Word2Vec`` model on the tokenised training
    documents; ``transform`` represents each document as the mean of the
    embeddings of its in-vocabulary tokens. A document with no known token
    is mapped to the zero vector.

    Parameters
    ----------
    size : int, default=100
        Dimensionality of the word (and therefore document) vectors.
    window : int, default=5
        Context window passed to ``Word2Vec``.
    min_count : int, default=1
        Minimum token frequency passed to ``Word2Vec``.
    workers : int, default=3
        Training threads passed to ``Word2Vec`` (gensim's own default).
        Use ``workers=1`` together with a fixed ``seed`` when run-to-run
        reproducibility matters.
    seed : int, default=1
        Random seed passed to ``Word2Vec`` (gensim's own default).

    All constructor arguments are stored unchanged under their own names so
    that ``get_params``/``set_params`` and ``GridSearchCV`` can tune them.
    """

    # Single definition of the token pattern shared by fit and transform,
    # so the two can never drift apart.
    _TOKEN_PATTERN = re.compile(r"\b\w+\b")

    def __init__(self, size=100, window=5, min_count=1, workers=3, seed=1):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.seed = seed

    def _tokenize(self, text):
        """Lower-case ``text`` and split it into word tokens."""
        return self._TOKEN_PATTERN.findall(text.lower())

    def fit(self, X, y=None):
        """Train the Word2Vec model on the documents in ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        sentences = [self._tokenize(text) for text in X]
        self.model = Word2Vec(
            sentences,
            vector_size=self.size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
            seed=self.seed,
        )
        return self

    def transform(self, X):
        """Return an array of shape ``(n_documents, size)`` of mean embeddings.

        The result is always 2-D float32, including when ``X`` is empty or
        when some documents contain no in-vocabulary token.
        """
        keyed_vectors = self.model.wv
        rows = []
        for text in X:
            known = [keyed_vectors[w] for w in self._tokenize(text) if w in keyed_vectors]
            if known:
                rows.append(np.mean(known, axis=0))
            else:
                # Same dtype as the embeddings so one unknown-only document
                # does not silently upcast the whole matrix to float64.
                rows.append(np.zeros(self.size, dtype=np.float32))
        if not rows:
            # np.array([]) would have shape (0,), which downstream
            # estimators reject; keep the feature dimension explicit.
            return np.empty((0, self.size), dtype=np.float32)
        return np.array(rows, dtype=np.float32)


In [None]:
## WORD2VEC MODEL

In [None]:
# Averaged Word2Vec document vectors fed into logistic regression.
w2v_steps = [
    ("vectorizer", Word2VecVectorizer()),
    ("clf", LogisticRegression(max_iter=1000)),
]
w2v_pipeline = Pipeline(steps=w2v_steps)

# fit() returns the fitted pipeline, so training and inference can be chained.
pred = w2v_pipeline.fit(X_train, y_train).predict(X_test)

# Report each metric on the held-out reviews, in the same order as before.
for caption, scorer in (
    ("Word2Vec Accuracy:", accuracy_score),
    ("Word2Vec F1:", f1_score),
):
    print(caption, scorer(y_test, pred))


In [None]:
## GRID SEARCH CV

In [None]:
# Inverse regularisation strengths to try for the pipeline's "clf" step.
param_grid = {"clf__C": [0.01, 0.1, 1, 10]}

# 3-fold cross-validated search over the TF-IDF pipeline, ranked by F1,
# using every available core.
grid = GridSearchCV(
    estimator=tfidf_pipeline,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Show the winning setting and its mean cross-validated F1.
for caption, attribute in (
    ("Best Params:", "best_params_"),
    ("Best F1:", "best_score_"),
):
    print(caption, getattr(grid, attribute))


In [None]:
## BEST MODEL

In [None]:
# Take the best estimator found by the grid search and persist it to disk
# so the API service can load it.
best_model = grid.best_estimator_
joblib.dump(value=best_model, filename="best_text_pipeline.joblib")
print("Model saved as best_text_pipeline.joblib")


In [None]:
## FASTAPI INTERFACE APP

In [None]:
%%writefile app.py
from fastapi import FastAPI
import joblib

app = FastAPI()
model = joblib.load("best_text_pipeline.joblib")

@app.post("/predict")
def predict(text: str):
    prediction = model.predict([text])[0]
    return {"prediction": int(prediction)}


In [None]:
import subprocess
import sys

# A foreground `!uvicorn ...` never returns, so every cell after it would be
# unreachable under "Restart & Run All". Launch the server as a child process
# of the kernel instead; the host/port are unchanged.
server = subprocess.Popen(
    [sys.executable, "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
)
# Call `server.terminate()` to stop the API when you are done with it.


In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the library's implicit
# default for "sentiment-analysis": the default can change between
# transformers releases and triggers a "no model was supplied" warning.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

sentiment_pipeline("This internship assignment is very useful and practical!")
