<a href="https://colab.research.google.com/github/Dattathreya1/SVARA_REPLY_CLASSIFICATION/blob/main/svara.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q pandas scikit-learn joblib matplotlib seaborn transformers datasets torch evaluate fastapi uvicorn python-multipart

In [None]:
import pandas as pd, numpy as np, os, re
# Location of the labelled replies. Upload the CSV through the Colab file
# browser (or mount Google Drive) and point csv_path at the uploaded file.
csv_path = "/content/reply_classification_dataset.csv"  # adjust path if needed

# Report the kernel's working directory to help locate uploaded files.
print("Current working dir:", os.getcwd())

# Load the data, print its (rows, columns) shape and display the first rows.
df = pd.read_csv(csv_path)
print(df.shape)
df.head()

Current working dir: /content
(2129, 2)


Unnamed: 0,reply,label
0,Can we discuss pricing??,NEUTRAL
1,"Im excited to explore this further, plz send c...",POSITIVE
2,We not looking for new solutions.,negative
3,Could u clarify features included?,neutral
4,"lets,, schedule a meeting to dive deeper",positive


In [None]:
# --- Cell 3: Quick EDA
# df.info() writes its summary to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.isnull().sum())
print(df['label'].value_counts())

# Show five random (label, reply) pairs; the fixed seed keeps the sample stable.
for _, row in df.sample(5, random_state=42).iterrows():
    print(row['label'], ":", row['reply'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   reply   2129 non-null   object
 1   label   2129 non-null   object
dtypes: object(2)
memory usage: 33.4+ KB
None
reply    0
label    0
dtype: int64
label
neutral     704
positive    446
NEGATIVE    267
POSITIVE    263
Negative    254
negative    189
Neutral       3
NEUTRAL       2
Positive      1
Name: count, dtype: int64
positive : Im excited to explore!
negative : Please remove me from your mailing list.
POSITIVE : Lets set up product overview!
neutral : Can u provide demo video?
positive : Sounds interesting, let’s set up a call.


In [None]:
# --- Cell 4: Clean text & normalize labels
def clean_text(s):
    """Normalize a reply for the text models.

    The value is stringified and lower-cased, every character other than
    a-z, 0-9 and whitespace is replaced by a space, and runs of whitespace
    are collapsed to a single space with the ends trimmed.

    Missing values (None or a float NaN) yield an empty string instead of the
    literal words "none" / "nan", which would otherwise become real tokens.
    """
    # `s != s` is only true for NaN; the isinstance guard keeps the comparison
    # away from objects with unusual equality semantics.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s).lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


def map_label(x):
    """Normalize a raw label to 'positive', 'negative' or 'neutral'.

    The value is stringified, lower-cased and stripped. The fragments 'pos',
    'neg' and 'neu' are then looked for in that order, and the first one found
    selects the canonical label. If none is found, the normalized string is
    returned unchanged.
    """
    normalized = str(x).lower().strip()
    fragment_to_label = (
        ('pos', 'positive'),
        ('neg', 'negative'),
        ('neu', 'neutral'),
    )
    for fragment, canonical in fragment_to_label:
        if fragment in normalized:
            return canonical
    return normalized


# Model input: the normalized reply text (the raw 'reply' column is kept).
df['text'] = df['reply'].map(clean_text)
# Collapse the mixed-case label variants into canonical class names.
df['label'] = df['label'].map(map_label)

print(df['label'].value_counts())
df.head()

label
positive    710
negative    710
neutral     709
Name: count, dtype: int64


Unnamed: 0,reply,label,text
0,Can we discuss pricing??,neutral,can we discuss pricing
1,"Im excited to explore this further, plz send c...",positive,im excited to explore this further plz send co...
2,We not looking for new solutions.,negative,we not looking for new solutions
3,Could u clarify features included?,neutral,could u clarify features included
4,"lets,, schedule a meeting to dive deeper",positive,lets schedule a meeting to dive deeper


In [None]:
# --- Cell 5: Data split
from sklearn.model_selection import train_test_split
# Features are the cleaned text; targets are the normalized labels.
X, y = df['text'], df['label']

# First cut: 70% train / 30% held out, stratified by label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
# Second cut: split the held-out part in half into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)
print("Train size:", len(X_train), "Val size:", len(X_val), "Test size:", len(X_test))

Train size: 1490 Val size: 319 Test size: 320


In [None]:
# --- Cell 6: Baseline model training
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


# `multi_class='ovr'` is deprecated in recent scikit-learn releases. Wrapping the
# binary liblinear model in OneVsRestClassifier keeps the same one-vs-rest
# scheme without relying on the deprecated argument.
from sklearn.multiclass import OneVsRestClassifier

pipe = make_pipeline(
    # Unigrams + bigrams; ignore terms found in more than 95% of the documents
    # or in fewer than 2 documents.
    TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=2),
    OneVsRestClassifier(LogisticRegression(max_iter=1000, solver='liblinear')),
)
pipe.fit(X_train, y_train)



In [None]:
# --- Cell 7: Evaluate baseline
from sklearn.metrics import classification_report, accuracy_score, f1_score


# Predict on the test split, which was not passed to pipe.fit.
# NOTE(review): near-perfect scores can be caused by (near-)duplicate replies
# shared between splits; worth checking before trusting these numbers.
y_pred = pipe.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Macro F1:", f1_score(y_true=y_test, y_pred=y_pred, average='macro'))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.996875
Macro F1: 0.9968846671761837
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       106
     neutral       1.00      0.99      1.00       107
    positive       0.99      1.00      1.00       107

    accuracy                           1.00       320
   macro avg       1.00      1.00      1.00       320
weighted avg       1.00      1.00      1.00       320



In [None]:
# --- Cell 8: Save baseline model
import joblib
# Persist the fitted pipeline (vectorizer + classifier) as a single file.
joblib.dump(value=pipe, filename="/content/baseline_tfidf_logreg.pkl")
print("Saved baseline model to /content/baseline_tfidf_logreg.pkl")

Saved baseline model to /content/baseline_tfidf_logreg.pkl


In [None]:
# --- Cell 9: Optional - Transformer fine-tuning (requires GPU)
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np


# Deterministic class-name -> integer id mapping (ids follow sorted name order).
label_list = sorted(df['label'].unique())
label_to_id = {l:i for i,l in enumerate(label_list)}


# Use plain lists for both columns. Mixing a list with an index-carrying Series
# makes the frame inherit the shuffled split index, which Dataset.from_pandas
# would then store as an extra `__index_level_0__` column.
train_df = pd.DataFrame({'text': X_train.tolist(), 'label': y_train.map(label_to_id).tolist()})
val_df = pd.DataFrame({'text': X_val.tolist(), 'label': y_val.map(label_to_id).tolist()})


# preserve_index=False: never carry the pandas index into the dataset.
dataset_train = Dataset.from_pandas(train_df, preserve_index=False)
dataset_val = Dataset.from_pandas(val_df, preserve_index=False)


# Tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the 'text' field of a batch.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call returns for the batch.
    """
    encoded = tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded


# Add the tokenizer outputs to both splits, processing rows in batches.
dataset_train = dataset_train.map(tokenize, batched=True)
dataset_val = dataset_val.map(tokenize, batched=True)


# Store the class names in the model config so a saved checkpoint reports
# 'negative' / 'neutral' / 'positive' instead of generic LABEL_0 / LABEL_1 / LABEL_2.
id_to_label = {i: l for l, i in label_to_id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)


def compute_metrics(eval_pred):
    """Compute accuracy and macro-F1 for a (logits, labels) evaluation pair.

    The predicted class is the argmax over the last axis of the logits.
    Returns a dict with the keys "accuracy" and "f1_macro".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    # Imported locally so the function does not rely on an earlier cell's import.
    from sklearn.metrics import accuracy_score, f1_score
    metrics = {
        "accuracy": accuracy_score(labels, predicted),
        "f1_macro": f1_score(labels, predicted, average='macro'),
    }
    return metrics


training_args = TrainingArguments(
    output_dir="/content/distilbert_run",
    # Evaluate and checkpoint once per epoch so the best checkpoint can be
    # restored when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    # Key produced by compute_metrics; used to pick the best checkpoint.
    metric_for_best_model="f1_macro"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    # `tokenizer=` is deprecated for Trainer.__init__; `processing_class` is
    # its replacement and takes the same tokenizer object.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1490 [00:00<?, ? examples/s]

Map:   0%|          | 0/319 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# --- Cell 10: Quick inference demo with baseline model
sample_text = "I'm excited to explore this further, please send details"
print("Sample text:", sample_text)

# The pipeline was fitted on clean_text() output, so apply the same
# normalization to the input before predicting.
sample_clean = clean_text(sample_text)
print("Predicted label:", pipe.predict([sample_clean])[0])


# Use pipeline for probability; the columns follow the order of pipe.classes_.
print("Classes:", pipe.classes_.tolist())
print("Probabilities:", pipe.predict_proba([sample_clean]))

Sample text: I'm excited to explore this further, please send details
Predicted label: positive
Probabilities: [[0.06378999 0.10282414 0.83338587]]
