In [None]:

!pip install -q transformers datasets scikit-learn


In [None]:

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


In [None]:

SAMPLE_SIZE = 2000  # number of rows kept for this quick fine-tuning run
SEED = 42           # fixed so Restart & Run All draws the same subsample

# The CSV has no header row; parse only column 0 (label) and column 5 (text)
# instead of reading every column and discarding the rest afterwards.
df = pd.read_csv(
    '/mnt/data/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    usecols=[0, 5],
)
df.columns = ['label', 'text']

# Remap the raw label values {0, 4} to contiguous class ids {0, 1}.
df['label'] = df['label'].replace({0: 0, 4: 1})

# Seeded subsample so the training data is identical across runs.
df = df.sample(SAMPLE_SIZE, random_state=SEED).reset_index(drop=True)
df.head()


In [None]:

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Tokenize the ``text`` entries of a batch for the model.

    Every sequence is truncated and padded to exactly 128 tokens. Padding to a
    fixed ``max_length`` (rather than ``padding=True``, which pads only to the
    longest sequence of the batch passed in) guarantees that all rows produced
    by ``Dataset.map`` have the same length, so examples coming from different
    map batches can be stacked into one training batch.

    Args:
        batch: mapping with a ``'text'`` entry holding the strings to encode.

    Returns:
        The tokenizer output for ``batch['text']``.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

# Convert the DataFrame to a Dataset, tokenize it in batches, and rename the
# target column from 'label' to 'labels'.
dataset = (
    Dataset.from_pandas(df)
    .map(tokenize, batched=True)
    .rename_column('label', 'labels')
)
# Expose only the model inputs and the target, as torch tensors.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Hold out 10% for evaluation; the fixed seed makes the split reproducible.
train_test = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test['train']
eval_dataset = train_test['test']


In [None]:

# Two-class classification head on top of DistilBERT. Registering label names
# in the config makes downstream inference report readable labels instead of
# the generic LABEL_0 / LABEL_1.
# NOTE(review): assumes class id 0 (raw label 0) is negative and class id 1
# (raw label 4) is positive -- confirm against the dataset's documentation.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    id2label={0: 'NEGATIVE', 1: 'POSITIVE'},
    label2id={'NEGATIVE': 0, 'POSITIVE': 1},
)

def compute_metrics(p):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        p: object exposing ``predictions`` (per-class scores, reduced with an
            argmax over axis 1) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    scores = torch.tensor(p.predictions)
    predicted = scores.argmax(dim=1)  # highest-scoring class for each row
    reference = p.label_ids

    metrics = {}
    metrics['accuracy'] = accuracy_score(reference, predicted)
    metrics['f1'] = f1_score(reference, predicted)
    return metrics


In [None]:

# transformers renamed the `evaluation_strategy` argument to `eval_strategy`,
# and recent releases no longer accept the old spelling. Because the install is
# not pinned, pick whichever keyword the installed version defines so this cell
# works either way.
eval_strategy_kwarg = (
    'eval_strategy'
    if hasattr(TrainingArguments, 'eval_strategy')
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint on the same schedule (every epoch) so the best
    # checkpoint can be restored when training finishes.
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    **{eval_strategy_kwarg: 'epoch'},
)

# Collect everything the Trainer needs: the model, the run configuration,
# the train/eval splits, and the metric callback used at evaluation time.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': eval_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)


In [None]:

# Run fine-tuning; the returned training summary is shown as the cell output.
train_result = trainer.train()
train_result


In [None]:

# Write the model and its tokenizer to the same directory so the pair can be
# reloaded together from that one path.
export_dir = "./sentiment_model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:

from transformers import pipeline

# Reload the saved model and tokenizer from disk and run one sample sentence
# through the pipeline as a smoke test.
saved_model_dir = "./sentiment_model"
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=saved_model_dir,
    tokenizer=saved_model_dir,
)
sentiment_pipeline("I love how this works!")
