In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error,root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch 
import numpy as np
from sklearn.metrics import accuracy_score,precision_recall_fscore_support
from datasets import load_dataset, DatasetDict,Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
df = pd.read_csv('./cleaned_data_about_emotion.csv',sep=",")
df = df.dropna(how="any")
df

Unnamed: 0,sentiment,content
0,sadness,layin n bed headache ughhhh waitin
1,sadness,funeral ceremony gloomy friday
2,enthusiasm,want hang friend soon
3,neutral,want trade houston ticket
4,worry,ping prom bc bf like friend
...,...,...
39167,happiness,succesfully follow tayla
39169,love,happy mother day love
39170,love,happy mother day mommy woman man long momma day
39171,happiness,wassup beautiful follow peep new hit single de...


In [32]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['sentiment'] = encoder.fit_transform(df['sentiment'])

In [33]:
dataset = Dataset.from_pandas(df)

In [34]:
len(df['sentiment'].unique())

12

In [35]:
model = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForSequenceClassification.from_pretrained(model,num_labels=len(df['sentiment'].unique()))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
def tokenize_funciton(examples):
    tokens = tokenizer(
        examples['content'], 
        truncation=True,
        padding="max_length",
        max_length=128
    )
    tokens["labels"] = examples["sentiment"]
    return tokens

dataset = dataset.map(tokenize_funciton,batched=True)
dataset = dataset.train_test_split(test_size=0.2)

dataset = dataset.remove_columns(['content', 'sentiment', '__index_level_0__'])

Map: 100%|██████████| 38818/38818 [00:06<00:00, 5919.69 examples/s]


In [37]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

In [39]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    remove_unused_columns=True,
)

In [38]:
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 31054
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7764
    })
})
{'input_ids': [101, 18168, 2290, 2025, 2113, 2767, 2113, 8840, 2140, 2235, 2088, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [40]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
eval_results = trainer.evaluate()
print(eval_results)
