In [None]:
from transformers import AutoModelForSequenceClassification,RobertaTokenizer, Trainer, TrainingArguments
import pandas as pd
import torch
from sklearn.model_selection import KFold,cross_validate
from torch.utils.data import Dataset
from sklearn.metrics import make_scorer,accuracy_score, precision_score, recall_score, f1_score,precision_recall_fscore_support
import numpy as np
import os


class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.label = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
       
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation = True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding =  'max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
   


        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'label': torch.tensor(self.label[index], dtype=torch.float)
        }

def encode_Lables(label):
    temp = np.zeros(5)
    temp[label-1] = 1
    return temp

def encode_Test_Lables(label):
    return int(label-1)

    
def split_data(filename,samp_size):
    current_dir = os.getcwd()
    parent_dir = os.path.dirname(current_dir)
    path = parent_dir +"/model_data/" + filename
    #Sampling data
    if samp_size > 1:
        samp_size = 1
    if samp_size <=0:
      samp_size = .1

    df = pd.read_csv(path)
    if int(len(df) * samp_size) <= 5000:
        df = df.sample(n=int(len(df) * samp_size))
    else:
        df = df.sample(n=5000)  
    
    df.reset_index(inplace=True, Drop=True)
    df = pd.read_csv(path)
    df = df.sample(n=int(len(df) * .50), replace=True)

    df = df.reset_index(drop=True)
    #Renaming for Model
    df.rename(columns={"Rating":"label"},inplace=True)
    df.rename(columns={"Review_text":"text"},inplace=True)


    tree_train_Y = df["label"].apply(encode_Test_Lables)
    
    df["label"] = df["label"].astype(int).apply(encode_Lables)
    
    max_length = int(df["text"].str.split().str.len().mean())
    return df,max_length,tree_train_Y
#evaluation metrix
def compute_metrics(pred):
    
    label = torch.argmax(torch.tensor(pred.label_ids),dim=1).numpy()
    preds = torch.argmax(torch.tensor(pred.predictions),dim=1).numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(label, preds, average='micro')
    acc = accuracy_score(label, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

#hyperParmaters for roberta
samp_size = .3
lr = 2e-5
epochs = 5
train_batch_size = 8
test_batch_size = 8
w_decay = .001

#Getting Data Ready for Models
filename = "data_set_1.csv"
train,max_length,tree_train_Y= split_data(filename,samp_size)

#Sending Data To model
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=5)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Creating dataset
training_set = SentimentData(train, tokenizer, max_len=max_length)


#Setting Training paramaters for Roberta model
training_args = TrainingArguments(

    learning_rate=lr,
    fp16=True,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=test_batch_size,
    num_train_epochs=epochs,
    weight_decay=w_decay,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_set,
    eval_dataset=training_set,
    processing_class=tokenizer,
    compute_metrics = compute_metrics,
)






  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#Training Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
trainer.train()

In [None]:
pred = trainer.predict(training_set)
roberta_pred = pd.DataFrame(pred.predictions)
roberta_pred.to_csv("Roberta_Predictions.csv", index = False)
roberta_pred = roberta_pred.to_numpy()

In [3]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support,accuracy_score,classification_report

#hyper paramaters
tree_epochs = 20
max_depth = 2
learing_rate = .00001
l2reg = .0005

device = "cpu"

gpu_available = torch.cuda.is_available()
if gpu_available:
    device = 'cuda'

# Convert text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_df=.95,min_df=.0125)
tree_train_X = vectorizer.fit_transform(train["text"])

# create model instance
bst = XGBClassifier(n_estimators=tree_epochs, max_depth=max_depth,booster='gblinear', learning_rate=learing_rate, reg_lambda= l2reg, objective='multi:softmax',device = device)
# fit model
bst.fit(tree_train_X,tree_train_Y)

Parameters: { "max_depth" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [4]:
results = bst.predict(tree_train_X)
tree_results = bst.predict_proba(tree_train_X)

In [5]:


print(classification_report(tree_train_Y,results))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00      3426
           1       0.00      0.00      0.00      1054
           2       0.00      0.00      0.00      2208
           3       0.00      0.00      0.00      6542
           4       0.56      1.00      0.72     16927

    accuracy                           0.56     30157
   macro avg       0.31      0.20      0.14     30157
weighted avg       0.43      0.56      0.40     30157



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.neighbors import KNeighborsClassifier

meta_data = np.concatenate((roberta_pred, tree_results), axis=1)
knn = KNeighborsClassifier(n_neighbors=100)

kf = KFold(n_splits=5, shuffle=True, random_state=42)



scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score,average="micro",zero_division=0.0),
    'recall': make_scorer(recall_score,average="micro",zero_division=0.0),
    'f1_score': make_scorer(f1_score,average="micro",zero_division=0.0)
}
cv_results = cross_validate(knn, meta_data, tree_train_Y, cv=kf, scoring=scoring)

In [None]:
print("Accuracy:", np.mean(cv_results['test_accuracy']))
print("Precision:", np.mean(cv_results['test_precision']))
print("Recall:", np.mean(cv_results['test_recall']))
print("F1 Score:", np.mean(cv_results['test_f1_score']))