In [1]:
import pandas as pd

# Read the labelled replies from the CSV in the working directory and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer="reply_classification_dataset.csv")

df.head(n=5)

Unnamed: 0,reply,label
0,Can we discuss pricing??,NEUTRAL
1,"Im excited to explore this further, plz send c...",POSITIVE
2,We not looking for new solutions.,negative
3,Could u clarify features included?,neutral
4,"lets,, schedule a meeting to dive deeper",positive


In [2]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
df_copy = df.copy(deep=True)

In [3]:
# Count rows per raw label value; exposes inconsistent casing of the same class.
df_copy['label'].value_counts()

neutral     704
positive    446
NEGATIVE    267
POSITIVE    263
Negative    254
negative    189
Neutral       3
NEUTRAL       2
Positive      1
Name: label, dtype: int64

In [4]:
# Fold every casing variant of a label onto one lower-case spelling.
lowercased_labels = df_copy['label'].str.lower()
df_copy['label'] = lowercased_labels

In [5]:
# Re-count after lower-casing to confirm the variants collapsed into the expected classes.
df_copy['label'].value_counts()

positive    710
negative    710
neutral     709
Name: label, dtype: int64

In [6]:
# Display the raw reply text column to see what cleaning is needed.
df_copy['reply']

0                                Can we discuss pricing??
1       Im excited to explore this further, plz send c...
2                     We not looking for new solutions.  
3                      Could u clarify features included?
4                lets,, schedule a meeting to dive deeper
                              ...                        
2124                     I’ll forward this to my manager.
2125                      Can you share more information?
2126                 Send me the details and I’ll review.
2127                   What exactly does your product do?
2128                I am not the right person to contact.
Name: reply, Length: 2129, dtype: object

In [7]:
import re

def clean_text(text):
    """Normalise one reply for bag-of-words vectorisation.

    The input is lower-cased, every character that is not an ASCII letter
    ``a``-``z`` or whitespace is deleted (digits, punctuation and non-ASCII
    characters such as curly apostrophes are dropped, so "I’ll" becomes
    "ill"), runs of whitespace are collapsed to a single space and the
    result is trimmed.

    Parameters
    ----------
    text : str or None
        Raw reply text. ``None`` and float NaN yield ``''``; any other
        non-string value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # Missing values used to raise AttributeError on ``.lower()``.
    # ``text != text`` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Delete (rather than space-replace) disallowed characters, so
    # "don't" -> "dont".
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [8]:
# Missing replies are NaN; astype(str) alone would turn them into the literal
# word "nan", which would then be vectorised as a real token. Blank them out
# first so they clean to an empty string instead.
df_copy['clean_reply'] = df_copy['reply'].fillna('').astype(str).apply(clean_text)

# Side-by-side check of raw vs. cleaned text.
df_copy[['reply','clean_reply']].head(5)

Unnamed: 0,reply,clean_reply
0,Can we discuss pricing??,can we discuss pricing
1,"Im excited to explore this further, plz send c...",im excited to explore this further plz send co...
2,We not looking for new solutions.,we not looking for new solutions
3,Could u clarify features included?,could u clarify features included
4,"lets,, schedule a meeting to dive deeper",lets schedule a meeting to dive deeper


In [9]:
# Keep only the modelling columns. Take an explicit copy: a new column is
# assigned to this frame later, and writing into a bare column-subset of
# df_copy triggers pandas' SettingWithCopyWarning.
df_preprocess = df_copy[['clean_reply','label']].copy()

In [10]:
# Preview the reduced frame (cleaned text + normalised label).
df_preprocess.head()

Unnamed: 0,clean_reply,label
0,can we discuss pricing,neutral
1,im excited to explore this further plz send co...,positive
2,we not looking for new solutions,negative
3,could u clarify features included,neutral
4,lets schedule a meeting to dive deeper,positive


In [11]:
# (rows, columns) of the modelling frame.
df_preprocess.shape

(2129, 2)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from every cleaned reply (all rows,
# before any train/test split), then encode those same replies as a sparse
# TF-IDF matrix.
vectorizer = TfidfVectorizer()
vectorizer.fit(df_preprocess['clean_reply'])

X = vectorizer.transform(df_preprocess['clean_reply'])


In [13]:
# One column per vocabulary term, one row per reply. The sparse matrix is
# densified so it can be shown as a regular DataFrame.
feature_names = vectorizer.get_feature_names_out()
X_df = pd.DataFrame(data=X.toarray(), columns=feature_names)

X_df.head(5)

Unnamed: 0,about,account,action,agreement,align,aligned,already,alredy,am,and,...,were,weve,what,work,workflow,works,would,yes,you,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.preprocessing import LabelEncoder

# Map the string labels to integer ids. LabelEncoder numbers the classes in
# sorted order (here: negative=0, neutral=1, positive=2).
le = LabelEncoder().fit(df_preprocess['label'])
df_preprocess['label_encoded'] = le.transform(df_preprocess['label'])

df_preprocess.head(n=3)

Unnamed: 0,clean_reply,label,label_encoded
0,can we discuss pricing,neutral,1
1,im excited to explore this further plz send co...,positive,2
2,we not looking for new solutions,negative,0


In [15]:
from sklearn.model_selection import train_test_split

# Split the cleaned text FIRST, then fit TF-IDF on the training rows only, so
# the vocabulary and IDF weights are never derived from held-out replies.
# The split parameters (20% test, seed 42) are unchanged.
train_text, test_text, y_train, y_test = train_test_split(
    df_preprocess['clean_reply'],
    df_preprocess['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# Rebind `vectorizer` to the train-fitted instance: it is the one whose
# columns match the models trained below.
vectorizer = TfidfVectorizer()
X_train = pd.DataFrame(
    vectorizer.fit_transform(train_text).toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=train_text.index,
)
# Test rows are only transformed; words never seen in training are ignored.
X_test = pd.DataFrame(
    vectorizer.transform(test_text).toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=test_text.index,
)

X_train.shape

(1703, 225)

In [16]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default hyper-parameters on the TF-IDF
# features. fit() returns the estimator itself, so it can be chained.
model_LR = LogisticRegression().fit(X_train, y_train)

# Predicted class ids for the held-out rows.
y_pred = model_LR.predict(X_test)

In [17]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the logistic-regression predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)

print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       150
           1       0.99      1.00      1.00       136
           2       1.00      1.00      1.00       140

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426



In [18]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the same TF-IDF features; the seed is fixed so
# tie-breaking between equally good splits is repeatable.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)




In [19]:
# Per-class precision / recall / F1 for the decision-tree predictions.
dt_report = classification_report(y_true=y_test, y_pred=y_pred_dt)

print(dt_report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       150
           1       0.99      0.99      0.99       136
           2       0.98      0.99      0.99       140

    accuracy                           0.99       426
   macro avg       0.99      0.99      0.99       426
weighted avg       0.99      0.99      0.99       426



In [20]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (10 trees, fixed seed) on the TF-IDF features.
rf_model = RandomForestClassifier(random_state=42, n_estimators=10).fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

rf_report = classification_report(y_true=y_test, y_pred=y_pred_rf)
print(rf_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       150
           1       1.00      1.00      1.00       136
           2       0.99      1.00      1.00       140

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426



In [21]:
import lightgbm as lgb

# Gradient-boosted trees with default settings; verbose=-1 silences
# LightGBM's training log.
lgb_model = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train)

y_pred_lgb = lgb_model.predict(X_test)

lgb_report = classification_report(y_true=y_test, y_pred=y_pred_lgb)
print(lgb_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       150
           1       0.99      1.00      1.00       136
           2       1.00      1.00      1.00       140

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426



In [22]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fresh split for the transformer model: this time the features are the
# cleaned reply strings and the targets are the string labels. Note that
# this rebinds X_train / X_test / y_train / y_test, replacing the values
# used by the classical models above.
reply_texts = df_preprocess['clean_reply']
reply_labels = df_preprocess['label']

X_train, X_test, y_train, y_test = train_test_split(
    reply_texts, reply_labels, test_size=0.2, random_state=42
)





  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Fit the encoder on the training labels only, then apply the same mapping
# to both splits so the integer ids agree.
le = LabelEncoder().fit(y_train)
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [24]:
# Tokenise both splits with the DistilBERT tokenizer. Each call pads to the
# longest sequence within that call and truncates anything beyond the
# tokenizer's maximum length.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(X_train.tolist(), padding=True, truncation=True)
test_encodings = tokenizer(X_test.tolist(), padding=True, truncation=True)



In [25]:
# Pair each tokenised reply (dict of input_ids / attention_mask) with its
# integer label and wrap the pairs as tf.data pipelines.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    tf.convert_to_tensor(y_train, dtype=tf.int32)
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    tf.convert_to_tensor(y_test, dtype=tf.int32)
))

# The shuffle buffer must hold the whole training set to give a uniform
# shuffle; a fixed 1000 was smaller than the number of training rows, so
# examples were only mixed within a sliding window.
train_dataset = train_dataset.shuffle(len(y_train)).batch(16)
# Evaluation order must stay fixed so predictions line up with labels.
test_dataset = test_dataset.batch(16)

In [26]:
# DistilBERT encoder with a freshly initialised 3-way classification head;
# from_pt=True converts the PyTorch checkpoint to TensorFlow weights.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, from_pt=True
)

# The model outputs raw logits, hence from_logits=True; labels are integer
# ids, hence the sparse loss and sparse accuracy metric.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)




  torch.utils._pytree._register_pytree_node(
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pr

In [27]:
# Attach the training configuration defined in the previous cell.
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Fine-tune for 5 epochs. The test split doubles as validation data here,
# so the per-epoch validation numbers are computed on the same rows as the
# final evaluation.
history = model.fit(train_dataset, epochs=5, validation_data=test_dataset)


Epoch 1/5

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# evaluate() returns the loss followed by the compiled metrics.
results = model.evaluate(test_dataset)
print(f"Test Loss: {results[0]:.4f}")
print(f"Test Accuracy: {results[1]:.4f}")

# Class prediction = index of the largest logit for each example.
y_pred_logits = model.predict(test_dataset).logits
y_pred = y_pred_logits.argmax(axis=1)

# Recover the true labels in the same (unshuffled) batch order.
y_true = np.concatenate([batch_labels.numpy() for _, batch_labels in test_dataset])

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

print("\nClassification Report:")
print(classification_report(y_true, y_pred))


Test Loss: 0.0008
Test Accuracy: 1.0000
Confusion Matrix:
[[150   0   0]
 [  0 136   0]
 [  0   0 140]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       150
           1       1.00      1.00      1.00       136
           2       1.00      1.00      1.00       140

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426



In [29]:
save_path = "./saved_model"

# Store the class names in the model config so a reloaded model can report
# "negative"/"neutral"/"positive" instead of LABEL_0/1/2. The ids follow the
# LabelEncoder fitted on the training labels.
id2label = {int(idx): str(name) for idx, name in enumerate(le.classes_)}
model.config.id2label = id2label
model.config.label2id = {name: idx for idx, name in id2label.items()}

# Write the fine-tuned weights + config and the tokenizer files to the same
# directory so both can be reloaded with from_pretrained(save_path).
model.save_pretrained(save_path)

tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved at {save_path}")


Model and tokenizer saved at ./saved_model
