In [None]:
!pip install datasets transformers

In [None]:
#@title Setup & Config
import transformers
from transformers import RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import random
from transformers import set_seed

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42
set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Show the GPU attached to this runtime (shell command; output only).
!nvidia-smi

In [None]:
# Mount Google Drive at /content/gdrive (helper from the `google.colab` package).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Checkpoint name used for both tokenizer and model, and the per-device batch size
# passed to TrainingArguments further down.
model_checkpoint = "roberta-base"
batch_size = 16

In [None]:
from datasets import load_dataset, load_metric

In [None]:
import pandas as pd

In [None]:
# Load the raw dataset.
# NOTE(review): "/" is not a CSV file -- the real path appears to have been stripped; restore it before running.
df=pd.read_csv("/")

In [None]:
# Keep only the text and label columns.
# The former `cols = df.columns.tolist()` was overwritten on the next line and has been removed.
# `.copy()` makes `df` an independent frame, so the column assignment in the cleaning
# cell below does not write into a slice of the raw data (avoids SettingWithCopyWarning).
cols = ['Text','label']
df = df[cols].copy()

In [None]:
# Strip every run of characters that are not ASCII letters, digits or spaces.
# `regex=True` is required: from pandas 2.0 on, `Series.str.replace` treats the pattern
# as a literal string by default, which would leave the text unchanged.
df['Text'] = df['Text'].str.replace(r"[^a-zA-Z0-9 ]+", "", regex=True)

In [None]:
# Class balance check.
# The original inline note read "2.5k / 2.5k" -- presumably the per-class counts; verify against the output.
df['label'].value_counts()

In [None]:
df = df[df['Text'].notna()]
df = df[df['label'].notna()]

In [None]:
df

In [None]:
from sklearn.model_selection import train_test_split
RANDOM_SEED = 42

In [None]:
df_train, df_val = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_val, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
# NOTE(review): when cells run top to bottom, this frame is replaced by the `df_test` read in the next cell.
df_test = pd.read_csv('/content/gdrive/MyDrive/')

In [None]:
# Overrides the random split above: all of `df` becomes the training set, and the
# validation / test sets are read from Drive.
# NOTE(review): both paths are the bare Drive folder -- the file names appear to be missing.
df_train = df
df_val = pd.read_csv('/content/gdrive/MyDrive/')
df_test = pd.read_csv('/content/gdrive/MyDrive/')

In [None]:
df_val.shape

In [None]:
df_test.shape

In [None]:
df_train = df_train.astype({'label':'int'}) 
df_val = df_val.astype({'label':'int'})

In [None]:
df_train.to_csv('train.csv',index=False)

In [None]:
df_val.to_csv('val.csv',index=False)

In [None]:
from datasets import load_dataset
# Split name -> CSV file written by the export cells above.
data_files = {"train": "train.csv", "val": "val.csv"}

In [None]:
# Load both CSVs with the generic "csv" builder; later cells index the result by the
# split names used as keys in `data_files`.
dataset = load_dataset("csv", data_files=data_files)

In [None]:
dataset

In [None]:
# Accuracy metric object; `compute_metrics` and the test-set evaluation call its `.compute(...)`.
# NOTE(review): `load_metric` has been deprecated in favour of the separate `evaluate` package --
# confirm it still exists in the installed `datasets` version.
metric = load_metric('accuracy')

In [None]:
import numpy as np

fake_preds = np.random.randint(0, 2, size=(2,))
fake_labels = np.random.randint(0, 2, size=(2,))
metric.compute(predictions=fake_preds, references=fake_labels)

In [None]:
dataset['train']

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object supporting `len()`, indexing with a list of integer positions
            (the result is passed to `pd.DataFrame`) and a `.features` mapping of
            column name -> feature type.
            NOTE(review): presumably a `datasets.Dataset` split -- confirm.
        num_examples: Number of rows to show; must not exceed `len(dataset)`.

    Raises:
        AssertionError: If `num_examples` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # Sampling without replacement in a single call replaces the retry-until-unique loop.
    picks = random.sample(range(len(dataset)), num_examples)

    frame = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show class names instead of integer ids for ClassLabel columns.
            frame[column] = frame[column].transform(lambda i: typ.names[i])
    display(HTML(frame.to_html()))

In [None]:
# Eyeball a random sample of training rows (the helper's default sample size is 10).
show_random_elements(dataset["train"])

In [None]:
from transformers import AutoTokenizer
    
# Tokenizer matching `model_checkpoint`; `use_fast=True` asks for the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
task_to_keys = {
    "headlines": ("Text", None),
}

In [None]:
task = 'headlines'
sentence1_key, sentence2_key = task_to_keys[task]
if sentence2_key is None:
    print(f"Sentence: {dataset['train'][0][sentence1_key]}")
else:
    print(f"Sentence 1: {dataset['train'][0][sentence1_key]}")
    print(f"Sentence 2: {dataset['train'][0][sentence2_key]}")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples with truncation enabled.

    Reads the module-level `sentence1_key` / `sentence2_key`: the first column is
    always tokenized; the second is passed along as a text pair only when
    `sentence2_key` is not None.

    Args:
        examples: Mapping from column name to the values for that column.

    Returns:
        Whatever the module-level `tokenizer` returns for these inputs.
    """
    texts = (examples[sentence1_key],)
    if sentence2_key is not None:
        texts += (examples[sentence2_key],)
    return tokenizer(*texts, truncation=True)

In [None]:
# Tokenize every split; `batched=True` hands `preprocess_function` batches of rows instead of single rows.
encoded_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Two output classes for the classification head.
num_labels = 2
# Use the `device` selected in the setup cell instead of a hard-coded "cuda", so the
# notebook can still be run on a CPU-only runtime.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to(device)

In [None]:
# Metric used to select the best checkpoint: task-specific for "stsb" / "cola",
# accuracy for every other task (including this notebook's "headlines").
metric_name = {"stsb": "pearson", "cola": "matthews_correlation"}.get(task, "accuracy")
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=7,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Previously hard-coded to 'accuracy' while `metric_name` was computed but never used;
    # for the "headlines" task the value is unchanged.
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

In [None]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into predictions and score them with the module-level `metric`.

    Args:
        eval_pred: Pair `(predictions, labels)`; `predictions` is indexed as a 2-D array.

    Returns:
        The result of `metric.compute(predictions=..., references=labels)`.
    """
    raw_outputs, references = eval_pred
    # "stsb" uses the first output column as-is; every other task takes the arg-max over axis 1.
    predictions = raw_outputs[:, 0] if task == "stsb" else np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=predictions, references=references)

In [None]:
# Name of the `encoded_dataset` split handed to the Trainer for evaluation.
validation_key = 'val'
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune with the settings in `args`.
trainer.train()

In [None]:
model

In [None]:
# Metrics on the `eval_dataset` given to the Trainer.
trainer.evaluate()

In [None]:
from transformers import pipeline

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/")

In [None]:
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer,device=0)

In [None]:
# Single-sentence smoke test of the pipeline.
op = classifier('The sky is falling')

In [None]:
op

In [None]:
# Gold label distribution of the test set.
df_test['label'].value_counts()

In [None]:
from tqdm import tqdm

In [None]:
# Classify every test text and store the predicted class id next to the gold label.
# - Predictions are collected in a list and assigned once (no per-row `.loc` writes,
#   no per-row print flooding the cell output).
# - `truncation=True` stops over-long texts from exceeding the model's maximum input length.
# - Iterating over the column (which has a length) lets tqdm show a real progress bar.
predicted_labels = []
for tweet_text in tqdm(df_test['Text'], desc='Predicting'):
    op = classifier(tweet_text, truncation=True)
    # 'LABEL_0' maps to class 0; any other reported label maps to class 1.
    predicted_labels.append(0 if op[0]['label'] == 'LABEL_0' else 1)
df_test['predicted_label'] = predicted_labels

In [None]:
df_test['label'].value_counts()

In [None]:
df_val['predicted_label'].value_counts()

In [None]:
# Save the trainer's model under the local path 'here'.
trainer.save_model('here')

In [None]:
# Score the pipeline predictions against the gold test labels with the accuracy metric.
metric.compute(predictions=df_test['predicted_label'].tolist(), references=df_test['label'].tolist())

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class report for the test predictions; class 0 is shown as 'literal', class 1 as 'metaphorical'.
target_names = ['literal', 'metaphorical']
y_true, y_pred = (df_test[column].tolist() for column in ('label', 'predicted_label'))
print(classification_report(y_true, y_pred, target_names=target_names))

In [None]:
df

In [None]:
# NOTE(review): duplicate of the previous cell.
df

In [None]:
# Save the trainer's model to Drive.
# NOTE(review): the target is the bare Drive folder -- a sub-directory name appears to be missing.
trainer.save_model('/content/gdrive/MyDrive/')

In [None]:
import torch
# Restore weights from a saved state dict.
# `map_location=device` lets a checkpoint written on a GPU load on a CPU-only runtime, and
# the model is moved to that same `device` instead of a hard-coded "cuda".
# NOTE(review): the path is the bare Drive folder while `torch.load` expects a file -- restore the file name.
state_dict = torch.load('/content/gdrive/MyDrive/', map_location=device)
model.load_state_dict(state_dict)
model = model.to(device)

In [None]:
df_val

In [None]:
dataset['val'][0]

In [None]:
df_val.info()

In [None]:
# Write the test frame to a local file named 'R1' without the index column.
df_test.to_csv('R1',index=False)

In [None]:
# Rebuild `df` from three CSV files stacked row-wise; this rebinds the `df` used earlier in the notebook.
# NOTE(review): all three paths are the same bare Drive folder -- the file names appear to be missing.
df = pd.read_csv('/content/gdrive/MyDrive/')
df1 = pd.read_csv('/content/gdrive/MyDrive/')
df2 = pd.read_csv('/content/gdrive/MyDrive/')
df = pd.concat([df,df1,df2])

In [None]:
df4 = pd.read_csv('/content/gdrive/MyDrive/')

In [None]:
df4

In [None]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df4, test_size=0.1, random_state=42)

In [None]:
df = pd.concat([df,df_train])

In [None]:
df.reset_index(inplace=True)
df.drop(['index'],axis=1,inplace=True)

In [None]:
df

In [None]:
# NOTE(review): duplicate of the previous cell.
df

In [None]:
model

In [None]:
df_test

In [None]:
# NOTE(review): duplicate of the previous cell.
df_test

In [None]:
# Rebinds `df_test` once more, replacing the hold-out split created from `df4`.
# NOTE(review): the path is the bare Drive folder -- the file name appears to be missing.
df_test = pd.read_csv('/content/gdrive/MyDrive/')

In [None]:
# Stack two more CSV files row-wise into `df2` (rebinding the `df2` read earlier).
# NOTE(review): both paths are the same bare Drive folder -- the file names appear to be missing.
df2 = pd.read_csv('/content/gdrive/MyDrive/')
df3 = pd.read_csv('/content/gdrive/MyDrive/')
df2= pd.concat([df2,df3])

In [None]:
df2

In [None]:
# `drop_duplicates` returns a new frame; assign it back, otherwise the duplicate texts
# are still present when `df2` is concatenated into `df` further down.
df2 = df2.drop_duplicates(subset=['Text'], keep='first')
df2

In [None]:
df.drop(['dataset'],axis=1,inplace=True)

In [None]:
df2.drop(['sample_type'],axis=1,inplace=True)

In [None]:
df = pd.concat([df,df2])

In [None]:
df.reset_index(inplace=True)
df.drop(['index'],axis=1,inplace=True)

In [None]:
df