In [32]:
from datasets import load_dataset, Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [33]:
dataset = load_dataset("csv", data_files="Reddit_Data.csv")

In [37]:
import pandas as pd
df = pd.DataFrame(dataset['train'])
df_cleaned = df.dropna()

In [38]:
df.columns

Index(['clean_comment', 'category'], dtype='object')

In [43]:
label_map = {-1: 0, 0: 1, 1: 2}
df_cleaned["labels"] = df["category"].map(label_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["labels"] = df["category"].map(label_map)


In [45]:
df_cleaned = df_cleaned.drop(columns=["category"])

In [46]:
df_cleaned

Unnamed: 0,clean_comment,labels
0,family mormon have never tried explain them t...,2
1,buddhism has very much lot compatible with chr...,2
2,seriously don say thing first all they won get...,0
3,what you have learned yours and only yours wha...,1
4,for your own benefit you may want read living ...,2
...,...,...
37244,jesus,1
37245,kya bhai pure saal chutiya banaya modi aur jab...,2
37246,downvote karna tha par upvote hogaya,1
37247,haha nice,2


In [47]:
dataset = Dataset.from_pandas(df_cleaned)

In [48]:
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [49]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [50]:
def preprocess(batch):
    return tokenizer(batch["clean_comment"], truncation=True, padding="max_length", max_length=128)

In [51]:
dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/29719 [00:00<?, ? examples/s]

Map:   0%|          | 0/7430 [00:00<?, ? examples/s]

In [52]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

In [54]:
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"
)

In [55]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [56]:
trainer.train()

Step,Training Loss
500,0.7026
1000,0.408
1500,0.32
2000,0.2606
2500,0.2012
3000,0.1924
3500,0.1835
4000,0.1448
4500,0.1285
5000,0.1162


TrainOutput(global_step=7432, training_loss=0.20628245243856802, metrics={'train_runtime': 2679.2371, 'train_samples_per_second': 44.369, 'train_steps_per_second': 2.774, 'total_flos': 7819467661458432.0, 'train_loss': 0.20628245243856802, 'epoch': 4.0})

In [57]:
print(trainer.evaluate())

{'eval_loss': 0.2740499973297119, 'eval_accuracy': 0.9460296096904441, 'eval_f1': 0.9460887738086955, 'eval_runtime': 54.1237, 'eval_samples_per_second': 137.278, 'eval_steps_per_second': 8.591, 'epoch': 4.0}


In [58]:
model.save_pretrained("./reddit_sentiment_model")
tokenizer.save_pretrained("./reddit_sentiment_model")

('./reddit_sentiment_model/tokenizer_config.json',
 './reddit_sentiment_model/special_tokens_map.json',
 './reddit_sentiment_model/vocab.txt',
 './reddit_sentiment_model/added_tokens.json',
 './reddit_sentiment_model/tokenizer.json')

In [59]:
from google.colab import files
import shutil

# Zip the folder
shutil.make_archive("reddit_sentiment_model", 'zip', "./reddit_sentiment_model")

# Download
files.download("reddit_sentiment_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Testing

In [60]:
from transformers import BertTokenizerFast, BertForSequenceClassification

test_model = BertForSequenceClassification.from_pretrained("./reddit_sentiment_model")
test_tokenizer = BertTokenizerFast.from_pretrained("./reddit_sentiment_model")

In [61]:
from transformers import pipeline

In [62]:
sentiment_pipeline = pipeline("text-classification", model=test_model, tokenizer=test_tokenizer)

Device set to use cuda:0


In [63]:
texts = [
    "I love this community, everyone is so helpful!",
    "This post is okay, nothing special.",
    "Worst advice I’ve ever read on Reddit."
]

In [64]:
results = sentiment_pipeline(texts)

In [65]:
for text, res in zip(texts, results):
    print(f"{text} -> {res}")

I love this community, everyone is so helpful! -> {'label': 'LABEL_2', 'score': 0.999599277973175}
This post is okay, nothing special. -> {'label': 'LABEL_2', 'score': 0.9996894598007202}
Worst advice I’ve ever read on Reddit. -> {'label': 'LABEL_0', 'score': 0.9990071654319763}
