In [None]:
from tqdm import tqdm
import random
import torch
from transformers import pipeline
import pandas as pd

# Define hierarchical mapping: overall sentiment -> fine-grained emotions to generate
sentiment_mapping = {
    "positive": ["joy", "love", "surprise"],
}

# Seed torch's RNG so the sampled generations can be reproduced on a re-run
torch.manual_seed(42)

# Load FLAN-T5 model for text generation with pipeline (GPU 0 when CUDA is available, else CPU)
generator = pipeline("text2text-generation", model="google/flan-t5-large", device=0 if torch.cuda.is_available() else -1)

# Generation settings
batch_size = 10
samples_per_fine_label = 1000

# Accumulates one record (dict) per generated sample
positive_data = []

# Generate for each fine label in the sentiment mapping
for overall, fine_options in sentiment_mapping.items():

    # For each fine sentiment under the overall sentiment category
    for fine in fine_options:
        print(f"Generating for {fine} sentiment under overall {overall}")

        # The prompt depends only on the labels, so build it once per fine label
        prompt = f"Generate a social media post/tweet that expresses {fine} emotion with an overall {overall} sentiment."

        # Step through the sample budget in batch-sized chunks; stepping by start offset
        # (instead of floor division) keeps the remainder when the budget is not a
        # multiple of batch_size.
        for start in tqdm(range(0, samples_per_fine_label, batch_size), desc=f"Generating for '{fine}'", ncols=100):
            current_batch_size = min(batch_size, samples_per_fine_label - start)

            # Create a batch of identical prompts; diversity comes from sampling
            prompts = [prompt] * current_batch_size

            # batch_size must be passed explicitly: without it the pipeline processes
            # a list of inputs one at a time rather than as a single batch.
            responses = generator(prompts,
                                  max_length=140,
                                  min_length=100,
                                  do_sample=True,
                                  top_k=50,
                                  batch_size=current_batch_size)

            # Attach the labels that were requested in the prompt to each generation
            positive_data.extend(
                {
                    "text": response['generated_text'],
                    "overall_sentiment": overall,
                    "fine_grained_sentiment": fine
                }
                for response in responses
            )

# Create a DataFrame from the generated synthetic data
positive_df = pd.DataFrame(positive_data)

# Save to CSV (optional)
# positive_df.to_csv("positive_data.csv", index=False)

# Display the generated dataframe
positive_df.head()


Device set to use cuda:0


Generating for joy sentiment under overall positive


Generating for 'joy': 100%|█████████████████████████████████████| 100/100 [1:18:46<00:00, 47.27s/it]


Generating for love sentiment under overall positive


Generating for 'love': 100%|████████████████████████████████████| 100/100 [1:18:41<00:00, 47.22s/it]


Generating for surprise sentiment under overall positive


Generating for 'surprise': 100%|████████████████████████████████| 100/100 [1:19:01<00:00, 47.41s/it]


Unnamed: 0,text,overall_sentiment,fine_grained_sentiment
0,@DannyDavidMusic haha my mom will love it and ...,positive,joy
1,is looking forward to his 2nd birthday! what a...,positive,joy
2,uuuuhhhhhh! I feel better today!!..wonder why....,positive,joy
3,@mileycyrus awww omg i love that song!! i got ...,positive,joy
4,@the_candy_girl I know. I am definitely a fan ...,positive,joy


In [None]:
# Preview the first rows; head must be called, otherwise the bound method is displayed
positive_df.head()

In [None]:
# Mount Google Drive (Colab only) at /content/drive; the CSV paths used in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist the generated positive-sentiment samples to Drive, without the index column.
positive_df.to_csv("/content/drive/MyDrive/Mini Project/Generated/positive_data.csv", index=False)


## Merge the generated datasets into a single one

In [None]:
# Load the previously saved combined dataset so the new samples can be merged into it.
old_df = pd.read_csv("/content/drive/MyDrive/Mini Project/Generated/combined_synthetic_sentiment_data.csv")

In [None]:
import pandas as pd

# Stack the existing and newly generated rows, keep only the first occurrence
# of each text, and renumber the index from zero.
combined_df = (
    pd.concat([old_df, positive_df], ignore_index=True)
    .drop_duplicates(subset=["text"])
    .reset_index(drop=True)
)

# Sanity check: total size and distribution of the overall sentiment labels
print("Final record count:", len(combined_df))
print(combined_df["overall_sentiment"].value_counts())


Final record count: 8800
overall_sentiment
positive    3700
negative    3700
neutral     1400
Name: count, dtype: int64


In [None]:
# Write the merged, de-duplicated dataset back to Drive. NOTE: this is the same
# path old_df was read from, so the previous file is overwritten.
combined_df.to_csv("/content/drive/MyDrive/Mini Project/Generated/combined_synthetic_sentiment_data.csv", index=False)


In [None]:
# Preview the first rows of the merged dataset.
combined_df.head()

Unnamed: 0,text,overall_sentiment,fine_grained_sentiment
0,"@Jimmy_Knymer, we have not been too good but i...",neutral,neutral
1,@JerzyKrzewczynski http://twitpic.com/7ds6l - ...,neutral,neutral
2,@Miranda_Dawson What did you get them for for ...,neutral,neutral
3,"@rukiyuldoshi Ah - good point, my sister is in...",neutral,neutral
4,has become a bit tired and confused from the h...,neutral,neutral


In [None]:
# Count missing values per column instead of displaying the full boolean mask
combined_df.isnull().sum()

AttributeError: 'DataFrame' object has no attribute 'isnull1'

In [None]:
# shape is an attribute holding (rows, columns); calling it raises TypeError
combined_df.shape

## Final data evaluation function

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [None]:
import pandas as pd


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
from datasets import Dataset
import torch


def train_and_predict_all_models(
    df,
    target_col="label",
    transformers_models=(
        "distilbert-base-uncased",
        "bert-base-uncased",
        "roberta-base",
        "cardiffnlp/twitter-roberta-base-sentiment",
        "xlm-roberta-base",
    ),
    test_size=0.2,
    max_length=128,
    use_cuda=torch.cuda.is_available(),
    seed=42,
    run_ml=False,
    run_transformers=True,
):
    """Train text classifiers on ``df`` and print a classification report for each.

    Two families of models are available: TF-IDF based scikit-learn classifiers
    and fine-tuned Hugging Face sequence-classification models. Every model is
    trained on a train split of ``df`` and evaluated on the held-out test split.

    Args:
        df: DataFrame with a ``"text"`` column and the column named by
            ``target_col``.
        target_col: Name of the column holding the class labels.
        transformers_models: Hugging Face checkpoint names to fine-tune. A
            single string is treated as a one-element list.
        test_size: Fraction of rows held out for evaluation.
        max_length: Token length every text is padded/truncated to for the
            transformer models.
        use_cuda: Not read by this function; retained so existing callers
            that pass it keep working.
        seed: Seed for the train/test splits, the random forest, and the
            Trainer, so that repeated runs evaluate on the same split.
        run_ml: Whether to train the TF-IDF + scikit-learn models.
        run_transformers: Whether to fine-tune the transformer models.

    Returns:
        None. Reports are printed to stdout.

    Raises:
        KeyError: If ``"text"`` or ``target_col`` is not a column of ``df``.
    """
    # Fail early with a clear message instead of deep inside a model run
    missing = [col for col in ("text", target_col) if col not in df.columns]
    if missing:
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    # A bare string would otherwise be iterated character by character
    if isinstance(transformers_models, str):
        transformers_models = [transformers_models]

    def run_ml_models(df):
        """Fit each TF-IDF + classifier pipeline and print its report."""
        print("\n----- Traditional ML Models -----\n")
        X = df["text"]
        y = df[target_col]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=seed
        )

        models = {
            "Logistic Regression": LogisticRegression(max_iter=1000),
            "Naive Bayes": MultinomialNB(),
            "SVM": SVC(),
            "Random Forest": RandomForestClassifier(random_state=seed),
        }

        for name, model in models.items():
            pipe = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', model)
            ])
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            print(f"\nModel: {name}")
            # zero_division=0 reports 0.0 for never-predicted classes without warnings
            print(classification_report(y_test, y_pred, zero_division=0))

    def run_transformer_models(df):
        """Fine-tune each checkpoint and print its report on the test split."""
        print("\n----- Transformer Models -----\n")

        # The encoded dataset and its split do not depend on the checkpoint, so
        # build them once; every model is then evaluated on the same test rows.
        hf_dataset = Dataset.from_pandas(
            df[["text", target_col]].rename(columns={target_col: "label"}),
            preserve_index=False,
        ).class_encode_column("label")
        label_names = hf_dataset.features['label'].names
        num_labels = hf_dataset.features['label'].num_classes

        # Stratify so the class balance of the test split matches the ML split
        raw_splits = hf_dataset.train_test_split(
            test_size=test_size, stratify_by_column="label", seed=seed
        )

        for model_name in transformers_models:
            print(f"\nTransformer: {model_name}\n")
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            def tokenize_function(examples):
                return tokenizer(
                    examples['text'],
                    padding="max_length",
                    truncation=True,
                    max_length=max_length
                )

            tokenized_dataset = raw_splits.map(tokenize_function, batched=True)
            tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

            # ignore_mismatched_sizes lets checkpoints that ship a classification
            # head of a different size be loaded with a freshly initialised head
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                num_labels=num_labels,
                ignore_mismatched_sizes=True
            )

            training_args = TrainingArguments(
                output_dir=f"./results/{model_name.replace('/', '_')}",
                learning_rate=2e-5,
                per_device_train_batch_size=8,
                per_device_eval_batch_size=8,
                num_train_epochs=4,
                weight_decay=0.01,
                save_strategy="no",
                logging_dir="./logs",
                disable_tqdm=True,
                seed=seed,
            )
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_dataset['train'],
                eval_dataset=tokenized_dataset['test']
            )

            trainer.train()
            preds = trainer.predict(tokenized_dataset['test'])
            y_pred = preds.predictions.argmax(axis=-1)
            # label_ids are returned in the same order as the predictions
            y_true = preds.label_ids

            # Report with the original label names rather than encoded integers
            print(classification_report(
                y_true,
                y_pred,
                labels=list(range(num_labels)),
                target_names=label_names,
                zero_division=0,
            ))

            # Drop references to the fine-tuned model before loading the next one
            del trainer, model
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # Run the selected model families
    if run_ml:
        run_ml_models(df.copy())
    if run_transformers:
        run_transformer_models(df.copy())


#### Synthetic df

In [None]:
# Reload the combined synthetic dataset from Drive for evaluation.
synthetic_df=pd.read_csv("/content/drive/MyDrive/Mini Project/Generated/combined_synthetic_sentiment_data.csv")

In [None]:
# (rows, columns) of the loaded dataset.
synthetic_df.shape

(8800, 3)

In [None]:
# Column names: the text column plus the two candidate target columns.
synthetic_df.columns

Index(['text', 'overall_sentiment', 'fine_grained_sentiment'], dtype='object')

## real

In [None]:
# Evaluate on the coarse label (positive / negative / neutral)
train_and_predict_all_models(
    synthetic_df,
    target_col="overall_sentiment",
)



----- Traditional ML Models -----


Model: Logistic Regression
              precision    recall  f1-score   support

    negative       0.71      0.86      0.78       740
     neutral       0.39      0.05      0.09       280
    positive       0.74      0.83      0.78       740

    accuracy                           0.72      1760
   macro avg       0.61      0.58      0.55      1760
weighted avg       0.67      0.72      0.67      1760


Model: Naive Bayes
              precision    recall  f1-score   support

    negative       0.59      0.94      0.72       740
     neutral       0.00      0.00      0.00       280
    positive       0.79      0.62      0.70       740

    accuracy                           0.66      1760
   macro avg       0.46      0.52      0.47      1760
weighted avg       0.58      0.66      0.60      1760



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model: SVM
              precision    recall  f1-score   support

    negative       0.71      0.86      0.78       740
     neutral       0.36      0.02      0.03       280
    positive       0.73      0.83      0.78       740

    accuracy                           0.72      1760
   macro avg       0.60      0.57      0.53      1760
weighted avg       0.66      0.72      0.66      1760


Model: Random Forest
              precision    recall  f1-score   support

    negative       0.67      0.84      0.75       740
     neutral       0.00      0.00      0.00       280
    positive       0.70      0.79      0.75       740

    accuracy                           0.69      1760
   macro avg       0.46      0.54      0.50      1760
weighted avg       0.58      0.69      0.63      1760


----- Transformer Models -----


Transformer: distilbert-base-uncased



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mavantithale0922[0m ([33mavantithale0922-sardar-vallabhbhai-national-institute-of[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'loss': 0.7687, 'grad_norm': 7.541685581207275, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 0.6662, 'grad_norm': 5.949389934539795, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 0.5751, 'grad_norm': 5.264895915985107, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 0.5276, 'grad_norm': 14.633749961853027, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 0.4592, 'grad_norm': 4.273338317871094, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 0.3898, 'grad_norm': 11.118736267089844, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 0.3619, 'grad_norm': 0.8016400337219238, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}
{'train_runtime': 388.1442, 'train_samples_per_second': 72.55, 'train_steps_per_second': 9.069, 'train_loss': 0.5347105169838126, 'epoch': 4.0}
              precision   

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.7485, 'grad_norm': 5.040748596191406, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 0.6442, 'grad_norm': 11.993361473083496, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 0.5277, 'grad_norm': 2.2265682220458984, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 0.4603, 'grad_norm': 24.26836585998535, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 0.4072, 'grad_norm': 15.695857048034668, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 0.3215, 'grad_norm': 11.892480850219727, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 0.2624, 'grad_norm': 20.51689338684082, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}
{'train_runtime': 707.8027, 'train_samples_per_second': 39.785, 'train_steps_per_second': 4.973, 'train_loss': 0.48030833629044617, 'epoch': 4.0}
              precision

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.772, 'grad_norm': 8.11684799194336, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 0.6727, 'grad_norm': 14.141013145446777, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 0.5853, 'grad_norm': 3.46024227142334, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 0.5165, 'grad_norm': 19.383773803710938, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 0.4929, 'grad_norm': 10.87434196472168, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 0.4176, 'grad_norm': 52.53609085083008, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 0.376, 'grad_norm': 77.7061767578125, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}
{'train_runtime': 725.0708, 'train_samples_per_second': 38.838, 'train_steps_per_second': 4.855, 'train_loss': 0.5460752372037281, 'epoch': 4.0}
              precision    reca

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

{'loss': 0.7028, 'grad_norm': 9.93887996673584, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 0.6398, 'grad_norm': 12.103569984436035, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 0.5283, 'grad_norm': 5.401997089385986, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 0.4533, 'grad_norm': 36.073123931884766, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 0.4237, 'grad_norm': 39.36601638793945, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 0.3298, 'grad_norm': 46.601409912109375, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 0.2895, 'grad_norm': 55.35022735595703, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}
{'train_runtime': 725.7609, 'train_samples_per_second': 38.801, 'train_steps_per_second': 4.85, 'train_loss': 0.47976256121288646, 'epoch': 4.0}
              precision   

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.8327, 'grad_norm': 13.41172981262207, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 0.6998, 'grad_norm': 9.210756301879883, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 0.6239, 'grad_norm': 4.032834529876709, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 0.5903, 'grad_norm': 6.609683990478516, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 0.5552, 'grad_norm': 28.842243194580078, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 0.4835, 'grad_norm': 18.147043228149414, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 0.4402, 'grad_norm': 45.32762145996094, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}
{'train_runtime': 958.4798, 'train_samples_per_second': 29.38, 'train_steps_per_second': 3.672, 'train_loss': 0.6022901799191128, 'epoch': 4.0}
              precision    

In [None]:
# Evaluate on the fine-grained emotion label
train_and_predict_all_models(
    synthetic_df,
    target_col="fine_grained_sentiment",
)



----- Traditional ML Models -----


Model: Logistic Regression
              precision    recall  f1-score   support

       anger       0.30      0.34      0.32       235
        fear       0.35      0.32      0.34       235
        hate       0.00      0.00      0.00        35
         joy       0.30      0.30      0.30       247
        love       0.40      0.45      0.42       246
     neutral       0.28      0.28      0.28       280
     sadness       0.46      0.46      0.46       235
    surprise       0.38      0.36      0.37       247

    accuracy                           0.35      1760
   macro avg       0.31      0.31      0.31      1760
weighted avg       0.34      0.35      0.34      1760



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model: Naive Bayes
              precision    recall  f1-score   support

       anger       0.35      0.24      0.29       235
        fear       0.37      0.28      0.32       235
        hate       0.00      0.00      0.00        35
         joy       0.30      0.11      0.16       247
        love       0.38      0.24      0.29       246
     neutral       0.22      0.72      0.33       280
     sadness       0.45      0.32      0.37       235
    surprise       0.47      0.13      0.20       247

    accuracy                           0.29      1760
   macro avg       0.32      0.26      0.25      1760
weighted avg       0.35      0.29      0.28      1760



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model: SVM
              precision    recall  f1-score   support

       anger       0.29      0.34      0.31       235
        fear       0.35      0.31      0.33       235
        hate       0.00      0.00      0.00        35
         joy       0.29      0.28      0.28       247
        love       0.38      0.42      0.40       246
     neutral       0.27      0.33      0.30       280
     sadness       0.47      0.45      0.46       235
    surprise       0.39      0.33      0.36       247

    accuracy                           0.34      1760
   macro avg       0.30      0.31      0.30      1760
weighted avg       0.34      0.34      0.34      1760



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model: Random Forest
              precision    recall  f1-score   support

       anger       0.26      0.24      0.25       235
        fear       0.27      0.27      0.27       235
        hate       0.00      0.00      0.00        35
         joy       0.26      0.30      0.28       247
        love       0.34      0.41      0.38       246
     neutral       0.25      0.30      0.27       280
     sadness       0.44      0.38      0.41       235
    surprise       0.34      0.26      0.29       247

    accuracy                           0.30      1760
   macro avg       0.27      0.27      0.27      1760
weighted avg       0.30      0.30      0.30      1760


----- Transformer Models -----


Transformer: distilbert-base-uncased



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.7207, 'grad_norm': 12.26823616027832, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 1.5454, 'grad_norm': 10.926080703735352, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 1.4362, 'grad_norm': 21.468286514282227, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 1.3316, 'grad_norm': 14.093499183654785, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 1.2178, 'grad_norm': 10.69008731842041, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 1.1057, 'grad_norm': 13.804654121398926, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 1.0148, 'grad_norm': 10.95126724243164, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}
{'train_runtime': 362.5442, 'train_samples_per_second': 77.673, 'train_steps_per_second': 9.709, 'train_loss': 1.337646248665723, 'epoch': 4.0}
              precision  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.7386, 'grad_norm': 7.172275066375732, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 1.5451, 'grad_norm': 9.944816589355469, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 1.3699, 'grad_norm': 6.254717826843262, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 1.2354, 'grad_norm': 19.304275512695312, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 1.0824, 'grad_norm': 18.978992462158203, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 0.9364, 'grad_norm': 12.91866397857666, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 0.825, 'grad_norm': 24.154747009277344, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}
{'train_runtime': 709.0368, 'train_samples_per_second': 39.716, 'train_steps_per_second': 4.964, 'train_loss': 1.2448000875386325, 'epoch': 4.0}
              precision   

Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.7128, 'grad_norm': 12.107043266296387, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 1.5686, 'grad_norm': 13.6038818359375, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 1.4182, 'grad_norm': 12.487224578857422, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 1.3094, 'grad_norm': 13.446805000305176, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 1.2083, 'grad_norm': 20.422609329223633, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 1.0864, 'grad_norm': 23.97846221923828, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 0.9967, 'grad_norm': 36.75578308105469, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}


In [None]:
# Second run on the fine-grained emotion label
train_and_predict_all_models(
    synthetic_df,
    target_col="fine_grained_sentiment",
)



----- Transformer Models -----


Transformer: roberta-base



Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.7494, 'grad_norm': 18.363088607788086, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 1.5646, 'grad_norm': 15.660992622375488, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 1.4408, 'grad_norm': 17.85063362121582, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 1.3482, 'grad_norm': 15.326458930969238, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 1.2199, 'grad_norm': 21.14071273803711, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 1.1138, 'grad_norm': 29.275365829467773, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 1.0417, 'grad_norm': 39.4102783203125, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}
{'train_runtime': 728.3955, 'train_samples_per_second': 38.66, 'train_steps_per_second': 4.833, 'train_loss': 1.352366823499853, 'epoch': 4.0}
              precision    

Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.6364, 'grad_norm': 9.25947380065918, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 1.523, 'grad_norm': 11.370521545410156, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 1.3581, 'grad_norm': 11.44097900390625, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 1.247, 'grad_norm': 19.11360740661621, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 1.1288, 'grad_norm': 23.76291847229004, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 1.0014, 'grad_norm': 23.975919723510742, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 0.9049, 'grad_norm': 21.76906394958496, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}
{'train_runtime': 727.6667, 'train_samples_per_second': 38.699, 'train_steps_per_second': 4.837, 'train_loss': 1.2547910533168098, 'epoch': 4.0}
              precision    re

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Casting to class labels:   0%|          | 0/8800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.8313, 'grad_norm': 11.216156005859375, 'learning_rate': 1.716477272727273e-05, 'epoch': 0.5681818181818182}
{'loss': 1.6503, 'grad_norm': 12.493613243103027, 'learning_rate': 1.4323863636363638e-05, 'epoch': 1.1363636363636362}
{'loss': 1.5434, 'grad_norm': 7.704151153564453, 'learning_rate': 1.1482954545454545e-05, 'epoch': 1.7045454545454546}
{'loss': 1.4517, 'grad_norm': 42.42093276977539, 'learning_rate': 8.642045454545457e-06, 'epoch': 2.2727272727272725}
{'loss': 1.4006, 'grad_norm': 18.43552017211914, 'learning_rate': 5.801136363636364e-06, 'epoch': 2.840909090909091}
{'loss': 1.2954, 'grad_norm': 28.7098388671875, 'learning_rate': 2.960227272727273e-06, 'epoch': 3.409090909090909}
{'loss': 1.231, 'grad_norm': 28.75621795654297, 'learning_rate': 1.1931818181818185e-07, 'epoch': 3.9772727272727275}
{'train_runtime': 962.3548, 'train_samples_per_second': 29.262, 'train_steps_per_second': 3.658, 'train_loss': 1.48441312529824, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.37      0.27      0.32       212
           1       0.30      0.33      0.31       221
           2       0.00      0.00      0.00        39
           3       0.40      0.32      0.36       269
           4       0.43      0.46      0.44       224
           5       0.30      0.36      0.33       297
           6       0.46      0.61      0.53       232
           7       0.43      0.41      0.42       266

    accuracy                           0.38      1760
   macro avg       0.34      0.35      0.34      1760
weighted avg       0.38      0.38      0.38      1760



In [None]:
# Load the sampled real-data CSV into `real_df`.
# NOTE(review): hard-coded absolute path under /content/drive — presumably
# requires a Colab session with Google Drive mounted; confirm.
real_df=pd.read_csv("/content/drive/MyDrive/Mini Project/Generated/sampled.csv")

In [None]:
# Train and evaluate on the real dataset, using `predicted_fine_grained`
# as the classification target.
# NOTE(review): the column name suggests these labels are model predictions
# rather than gold annotations — confirm before interpreting the scores.
train_and_predict_all_models(real_df,target_col="predicted_fine_grained")



----- Transformer Models -----


Transformer: distilbert-base-uncased



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Casting to class labels:   0%|          | 0/21644 [00:00<?, ? examples/s]

Map:   0%|          | 0/21644 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.286, 'grad_norm': 7.038407802581787, 'learning_rate': 1.884757505773672e-05, 'epoch': 0.23094688221709006}
{'loss': 0.9774, 'grad_norm': 12.276163101196289, 'learning_rate': 1.769284064665127e-05, 'epoch': 0.4618937644341801}
{'loss': 0.9015, 'grad_norm': 11.07868480682373, 'learning_rate': 1.6538106235565822e-05, 'epoch': 0.6928406466512702}
{'loss': 0.8799, 'grad_norm': 11.46877670288086, 'learning_rate': 1.538337182448037e-05, 'epoch': 0.9237875288683602}
{'loss': 0.744, 'grad_norm': 8.6715087890625, 'learning_rate': 1.4228637413394922e-05, 'epoch': 1.1547344110854503}
{'loss': 0.6826, 'grad_norm': 29.615251541137695, 'learning_rate': 1.307390300230947e-05, 'epoch': 1.3856812933025404}
{'loss': 0.6783, 'grad_norm': 11.465886116027832, 'learning_rate': 1.191916859122402e-05, 'epoch': 1.6166281755196303}
{'loss': 0.5963, 'grad_norm': 8.319307327270508, 'learning_rate': 1.076443418013857e-05, 'epoch': 1.8475750577367207}
{'loss': 0.565, 'grad_norm': 15.729875564575195, 'lear

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Casting to class labels:   0%|          | 0/21644 [00:00<?, ? examples/s]

Map:   0%|          | 0/21644 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.2358, 'grad_norm': 6.559625148773193, 'learning_rate': 1.884757505773672e-05, 'epoch': 0.23094688221709006}
{'loss': 0.9262, 'grad_norm': 11.82910442352295, 'learning_rate': 1.769284064665127e-05, 'epoch': 0.4618937644341801}
{'loss': 0.8828, 'grad_norm': 11.502676963806152, 'learning_rate': 1.6538106235565822e-05, 'epoch': 0.6928406466512702}
{'loss': 0.8491, 'grad_norm': 9.303016662597656, 'learning_rate': 1.538337182448037e-05, 'epoch': 0.9237875288683602}
{'loss': 0.6942, 'grad_norm': 8.076518058776855, 'learning_rate': 1.4228637413394922e-05, 'epoch': 1.1547344110854503}
{'loss': 0.6101, 'grad_norm': 23.043611526489258, 'learning_rate': 1.307390300230947e-05, 'epoch': 1.3856812933025404}
{'loss': 0.6114, 'grad_norm': 24.531723022460938, 'learning_rate': 1.191916859122402e-05, 'epoch': 1.6166281755196303}
{'loss': 0.5472, 'grad_norm': 10.434219360351562, 'learning_rate': 1.076443418013857e-05, 'epoch': 1.8475750577367207}
{'loss': 0.4779, 'grad_norm': 15.808134078979492,

Casting to class labels:   0%|          | 0/21644 [00:00<?, ? examples/s]

Map:   0%|          | 0/21644 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.3135, 'grad_norm': 18.479000091552734, 'learning_rate': 1.884757505773672e-05, 'epoch': 0.23094688221709006}
{'loss': 0.9196, 'grad_norm': 35.19087600708008, 'learning_rate': 1.769284064665127e-05, 'epoch': 0.4618937644341801}
{'loss': 0.8785, 'grad_norm': 19.284122467041016, 'learning_rate': 1.6538106235565822e-05, 'epoch': 0.6928406466512702}
{'loss': 0.8355, 'grad_norm': 16.874223709106445, 'learning_rate': 1.538337182448037e-05, 'epoch': 0.9237875288683602}
{'loss': 0.7299, 'grad_norm': 25.85964584350586, 'learning_rate': 1.4228637413394922e-05, 'epoch': 1.1547344110854503}
{'loss': 0.6569, 'grad_norm': 43.10336685180664, 'learning_rate': 1.307390300230947e-05, 'epoch': 1.3856812933025404}
{'loss': 0.6657, 'grad_norm': 2.3629214763641357, 'learning_rate': 1.191916859122402e-05, 'epoch': 1.6166281755196303}
{'loss': 0.5846, 'grad_norm': 18.130048751831055, 'learning_rate': 1.076443418013857e-05, 'epoch': 1.8475750577367207}
{'loss': 0.552, 'grad_norm': 18.711891174316406,

Casting to class labels:   0%|          | 0/21644 [00:00<?, ? examples/s]

Map:   0%|          | 0/21644 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.1178, 'grad_norm': 10.049907684326172, 'learning_rate': 1.884757505773672e-05, 'epoch': 0.23094688221709006}
{'loss': 0.8679, 'grad_norm': 12.77180004119873, 'learning_rate': 1.769284064665127e-05, 'epoch': 0.4618937644341801}
{'loss': 0.8295, 'grad_norm': 19.851295471191406, 'learning_rate': 1.6538106235565822e-05, 'epoch': 0.6928406466512702}
{'loss': 0.7929, 'grad_norm': 27.361602783203125, 'learning_rate': 1.538337182448037e-05, 'epoch': 0.9237875288683602}
{'loss': 0.6559, 'grad_norm': 23.503122329711914, 'learning_rate': 1.4228637413394922e-05, 'epoch': 1.1547344110854503}
{'loss': 0.5763, 'grad_norm': 41.7650146484375, 'learning_rate': 1.307390300230947e-05, 'epoch': 1.3856812933025404}
{'loss': 0.5649, 'grad_norm': 9.213062286376953, 'learning_rate': 1.191916859122402e-05, 'epoch': 1.6166281755196303}
{'loss': 0.516, 'grad_norm': 10.547779083251953, 'learning_rate': 1.076443418013857e-05, 'epoch': 1.8475750577367207}
{'loss': 0.4618, 'grad_norm': 19.012845993041992, 

Casting to class labels:   0%|          | 0/21644 [00:00<?, ? examples/s]

Map:   0%|          | 0/21644 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 1.4663, 'grad_norm': 12.611908912658691, 'learning_rate': 1.884757505773672e-05, 'epoch': 0.23094688221709006}
{'loss': 1.1324, 'grad_norm': 19.9086971282959, 'learning_rate': 1.769284064665127e-05, 'epoch': 0.4618937644341801}
{'loss': 1.0193, 'grad_norm': 29.57805824279785, 'learning_rate': 1.6538106235565822e-05, 'epoch': 0.6928406466512702}
{'loss': 0.9869, 'grad_norm': 24.425750732421875, 'learning_rate': 1.538337182448037e-05, 'epoch': 0.9237875288683602}
{'loss': 0.9186, 'grad_norm': 37.72159957885742, 'learning_rate': 1.4228637413394922e-05, 'epoch': 1.1547344110854503}
{'loss': 0.8375, 'grad_norm': 41.90950012207031, 'learning_rate': 1.307390300230947e-05, 'epoch': 1.3856812933025404}
{'loss': 0.8261, 'grad_norm': 15.457219123840332, 'learning_rate': 1.191916859122402e-05, 'epoch': 1.6166281755196303}
{'loss': 0.7441, 'grad_norm': 15.582343101501465, 'learning_rate': 1.076443418013857e-05, 'epoch': 1.8475750577367207}
{'loss': 0.7221, 'grad_norm': 22.568635940551758, 

In [None]:
import pandas as pd

def sample_per_class(df, label_col='label', samples_per_class=100, random_state=42, shuffle=True):
    """Draw up to ``samples_per_class`` rows from every class in ``df``.

    Args:
        df: Input DataFrame containing the column ``label_col``.
        label_col: Name of the column holding the class labels.
        samples_per_class: Maximum number of rows to keep per class. Classes
            with fewer rows contribute all of their rows.
        random_state: Seed used both for the per-class sampling and for the
            final shuffle, so repeated calls return the same rows.
        shuffle: If True, shuffle the concatenated result so that classes are
            interleaved instead of appearing in contiguous groups.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. If ``df`` has no rows, an
        empty DataFrame with the same columns is returned.

    Note:
        Rows whose label is NaN are never selected, because the equality
        filter used to pick each class does not match NaN.
    """
    sampled_dfs = []
    for label in df[label_col].unique():
        class_df = df[df[label_col] == label]
        # Cap at the class size: DataFrame.sample raises if n exceeds the
        # number of rows when sampling without replacement.
        take = min(samples_per_class, len(class_df))
        sampled_dfs.append(class_df.sample(n=take, random_state=random_state))

    # pd.concat raises "No objects to concatenate" on an empty list, which
    # happens when df has no rows; return an empty frame with the same schema.
    if not sampled_dfs:
        return df.iloc[0:0].reset_index(drop=True)

    result_df = pd.concat(sampled_dfs).reset_index(drop=True)
    if shuffle:
        result_df = result_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return result_df


In [None]:
import pandas as pd

def combine_df(df1, label_col1, df2, label_col2):
    """Stack two labelled text frames into one frame with a shared schema.

    From each input only the ``text`` column and its label column are kept;
    the label column is renamed to ``label``. Rows of ``df1`` come first,
    followed by rows of ``df2``, and the result gets a fresh 0..n-1 index.

    Args:
        df1: First DataFrame; must contain ``text`` and ``label_col1``.
        label_col1: Name of the label column in ``df1``.
        df2: Second DataFrame; must contain ``text`` and ``label_col2``.
        label_col2: Name of the label column in ``df2``.

    Returns:
        A new DataFrame with exactly the columns ``text`` and ``label``.
    """
    normalized_parts = []
    for frame, source_label in ((df1, label_col1), (df2, label_col2)):
        subset = frame[["text", source_label]]
        normalized_parts.append(subset.rename(columns={source_label: 'label'}))

    return pd.concat(normalized_parts, ignore_index=True)


In [None]:
def get_class_distribution(df, label_col='label', normalize=False):
    """Return how often each value of ``label_col`` occurs, sorted by label.

    Args:
        df: DataFrame containing the column ``label_col``.
        label_col: Name of the column whose values are counted.
        normalize: If False, return raw row counts; if True, return each
            label's share of the rows as a fraction.

    Returns:
        A Series indexed by label value, ordered by that index.
    """
    frequencies = df[label_col].value_counts(normalize=normalize)
    return frequencies.sort_index()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
from datasets import Dataset
import torch


def train_and_predict_all_models(
    df,
    target_col="label",
    transformers_models=(
        # Other checkpoints (e.g. "roberta-base", "bert-base-uncased") can be
        # passed explicitly by the caller.
        "xlm-roberta-base",
        "cardiffnlp/twitter-roberta-base-sentiment",
    ),
    test_size=0.2,
    max_length=128,
    use_cuda=torch.cuda.is_available(),
    seed=42,
    include_ml_models=False,
):
    """Fine-tune each transformer checkpoint on ``df`` and print a test report.

    The data is split once into train/test (stratified on the label and
    seeded), so every checkpoint is trained and evaluated on the same rows.
    For each checkpoint a sequence-classification head with one output per
    class is trained for 4 epochs and a ``classification_report`` on the
    held-out split is printed, using the original class names as row labels.

    Args:
        df: DataFrame with a ``text`` column and the column ``target_col``.
            It is not modified. Rows with a missing text or label are dropped.
        target_col: Name of the label column in ``df``.
        transformers_models: Iterable of Hugging Face checkpoint names (a
            single string is accepted as a one-element list).
        test_size: Fraction of rows held out for evaluation.
        max_length: Token length every text is padded/truncated to.
        use_cuda: Not read by this function; kept so existing callers that
            pass it keep working. NOTE(review): device selection is presumably
            left to ``Trainer`` — confirm.
        seed: Seed for the train/test splits, the random forest and
            ``TrainingArguments``, so reruns are comparable.
        include_ml_models: If True, also fit TF-IDF + classical sklearn
            models (logistic regression, naive Bayes, SVM, random forest)
            before the transformers. Defaults to False, which matches the
            previous behavior of running transformers only.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If no usable rows remain, or if stratification is not
            possible (for example a class with a single row).
    """
    if isinstance(transformers_models, str):
        # A bare string would otherwise be iterated character by character.
        transformers_models = [transformers_models]

    # Work on a reduced copy: only the two columns the models use, without
    # rows the tokenizer / label encoder cannot handle. The index is reset so
    # Dataset.from_pandas does not carry a stray index column along.
    data = df[["text", target_col]].dropna().reset_index(drop=True)
    n_dropped = len(df) - len(data)
    if n_dropped:
        print(f"Dropped {n_dropped} rows with missing text or label")
    if data.empty:
        raise ValueError("No rows with both a text and a label to train on")

    def run_ml_models(data):
        """Fit TF-IDF + classical classifiers and print one report per model."""
        print("\n----- Traditional ML Models -----\n")
        X = data["text"]
        y = data[target_col]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=seed
        )

        models = {
            "Logistic Regression": LogisticRegression(max_iter=1000),
            "Naive Bayes": MultinomialNB(),
            "SVM": SVC(),
            "Random Forest": RandomForestClassifier(random_state=seed)
        }

        for name, model in models.items():
            pipe = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', model)
            ])
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            print(f"\nModel: {name}")
            print(classification_report(y_test, y_pred, zero_division=0))

    def run_transformer_models(data):
        """Fine-tune every requested checkpoint and print its test report."""
        print("\n----- Transformer Models -----\n")

        # Dataset construction, label encoding and the split do not depend on
        # the checkpoint, so they are done once instead of once per model.
        hf_dataset = Dataset.from_pandas(data.rename(columns={target_col: "label"}))
        hf_dataset = hf_dataset.class_encode_column("label")
        label_feature = hf_dataset.features['label']
        num_labels = label_feature.num_classes
        class_names = label_feature.names

        # Stratify so rare classes are represented in the test split in
        # proportion to their frequency; seed so the split is reproducible.
        splits = hf_dataset.train_test_split(
            test_size=test_size, stratify_by_column="label", seed=seed
        )

        for model_name in transformers_models:
            print(f"\nTransformer: {model_name}\n")
            tokenizer = AutoTokenizer.from_pretrained(model_name)

            def tokenize_function(examples):
                return tokenizer(
                    examples['text'],
                    padding="max_length",
                    truncation=True,
                    max_length=max_length
                )

            tokenized_dataset = splits.map(tokenize_function, batched=True)
            tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

            # ignore_mismatched_sizes lets checkpoints that already ship a
            # classification head of a different size be reused: the head is
            # re-initialized with num_labels outputs.
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name, num_labels=num_labels, ignore_mismatched_sizes=True
            )

            training_args = TrainingArguments(
                output_dir=f"./results/{model_name.replace('/', '_')}",
                learning_rate=2e-5,
                per_device_train_batch_size=8,
                per_device_eval_batch_size=8,
                num_train_epochs=4,
                weight_decay=0.01,
                save_strategy="no",
                logging_dir="./logs",
                disable_tqdm=True,
                seed=seed,
            )
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_dataset['train'],
                eval_dataset=tokenized_dataset['test']
            )

            trainer.train()
            preds = trainer.predict(tokenized_dataset['test'])
            y_pred = preds.predictions.argmax(axis=-1)
            # label_ids are the labels of exactly the rows that were predicted.
            y_true = preds.label_ids

            # Passing labels + target_names prints class names instead of
            # integer ids and keeps every class in the report even if it was
            # never predicted; zero_division=0 reports 0.0 without a warning.
            print(classification_report(
                y_true,
                y_pred,
                labels=list(range(num_labels)),
                target_names=class_names,
                zero_division=0,
            ))

            # Release the fine-tuned model before loading the next checkpoint
            # so several large models can be trained in one session.
            del trainer, model
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # Run models
    if include_ml_models:
        run_ml_models(data)
    run_transformer_models(data)


In [None]:
# Load the sampled real data and the combined synthetic data from CSV.
# NOTE(review): both are hard-coded absolute paths under /content/drive —
# presumably a Colab session with Google Drive mounted is required; confirm.
real_df=pd.read_csv("/content/drive/MyDrive/Mini Project/Generated/sampled.csv")
synthetic_df=pd.read_csv("/content/drive/MyDrive/Mini Project/Generated/combined_synthetic_sentiment_data.csv")

#### b. Fine-grained sentiment

In [None]:
# Raw per-class row counts of the real data, sorted by label
print(get_class_distribution(real_df, label_col='predicted_fine_grained'))

# Same distribution as proportions (fractions of all rows, not percentages)
print(get_class_distribution(real_df, label_col='predicted_fine_grained', normalize=True))


predicted_fine_grained
anger        1139
fear         1430
hate         1402
joy          4010
love          879
neutral       528
sadness      1495
surprise    10761
Name: count, dtype: int64
predicted_fine_grained
anger       0.052624
fear        0.066069
hate        0.064775
joy         0.185271
love        0.040612
neutral     0.024395
sadness     0.069072
surprise    0.497182
Name: proportion, dtype: float64


In [None]:
# Raw per-class row counts of the synthetic data, sorted by label
print(get_class_distribution(synthetic_df, label_col='fine_grained_sentiment'))

# Same distribution as proportions (fractions of all rows, not percentages)
print(get_class_distribution(synthetic_df, label_col='fine_grained_sentiment', normalize=True))

fine_grained_sentiment
anger       1173
fear        1174
hate         176
joy         1233
love        1232
neutral     1400
sadness     1177
surprise    1235
Name: count, dtype: int64
fine_grained_sentiment
anger       0.133295
fear        0.133409
hate        0.020000
joy         0.140114
love        0.140000
neutral     0.159091
sadness     0.133750
surprise    0.140341
Name: proportion, dtype: float64


In [None]:
# Combine real and synthetic datasets: only `text` and the label column of
# each frame are kept, both label columns are renamed to a shared `label`
# column, and the synthetic rows are placed before the real ones.
combined_df_fine = combine_df(synthetic_df, 'fine_grained_sentiment', real_df, 'predicted_fine_grained')

In [None]:
# Train and evaluate the models on the combined real + synthetic data,
# using the unified `label` column as the classification target.
train_and_predict_all_models(combined_df_fine, target_col="label")



----- Transformer Models -----


Transformer: xlm-roberta-base



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Casting to class labels:   0%|          | 0/30444 [00:00<?, ? examples/s]

Map:   0%|          | 0/30444 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mavantithale0922[0m ([33mavantithale0922-sardar-vallabhbhai-national-institute-of[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'loss': 1.6836, 'grad_norm': 15.986210823059082, 'learning_rate': 1.9180623973727423e-05, 'epoch': 0.16420361247947454}
{'loss': 1.3708, 'grad_norm': 50.299686431884766, 'learning_rate': 1.8359605911330053e-05, 'epoch': 0.3284072249589491}
{'loss': 1.271, 'grad_norm': 23.387615203857422, 'learning_rate': 1.753858784893268e-05, 'epoch': 0.49261083743842365}
{'loss': 1.2214, 'grad_norm': 23.245243072509766, 'learning_rate': 1.6717569786535305e-05, 'epoch': 0.6568144499178982}
{'loss': 1.1463, 'grad_norm': 19.1372013092041, 'learning_rate': 1.5896551724137932e-05, 'epoch': 0.8210180623973727}
{'loss': 1.1419, 'grad_norm': 13.075311660766602, 'learning_rate': 1.507553366174056e-05, 'epoch': 0.9852216748768473}
{'loss': 1.0203, 'grad_norm': 26.183170318603516, 'learning_rate': 1.4254515599343186e-05, 'epoch': 1.1494252873563218}
{'loss': 1.0266, 'grad_norm': 24.735498428344727, 'learning_rate': 1.3433497536945815e-05, 'epoch': 1.3136288998357963}
{'loss': 1.0191, 'grad_norm': 24.7489223480

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Casting to class labels:   0%|          | 0/30444 [00:00<?, ? examples/s]

Map:   0%|          | 0/30444 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

{'loss': 1.3474, 'grad_norm': 11.851371765136719, 'learning_rate': 1.9180623973727423e-05, 'epoch': 0.16420361247947454}
{'loss': 1.1568, 'grad_norm': 21.53758430480957, 'learning_rate': 1.8359605911330053e-05, 'epoch': 0.3284072249589491}
{'loss': 1.0658, 'grad_norm': 18.294940948486328, 'learning_rate': 1.753858784893268e-05, 'epoch': 0.49261083743842365}
{'loss': 1.0383, 'grad_norm': 9.857195854187012, 'learning_rate': 1.6717569786535305e-05, 'epoch': 0.6568144499178982}
{'loss': 0.9982, 'grad_norm': 21.94193458557129, 'learning_rate': 1.5896551724137932e-05, 'epoch': 0.8210180623973727}
{'loss': 1.0125, 'grad_norm': 18.099140167236328, 'learning_rate': 1.507553366174056e-05, 'epoch': 0.9852216748768473}
{'loss': 0.8167, 'grad_norm': 26.90762710571289, 'learning_rate': 1.4254515599343186e-05, 'epoch': 1.1494252873563218}
{'loss': 0.7952, 'grad_norm': 19.189123153686523, 'learning_rate': 1.3433497536945815e-05, 'epoch': 1.3136288998357963}
{'loss': 0.7978, 'grad_norm': 21.57005310058