In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
style.use("fivethirtyeight")

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the competition training split into a DataFrame.
data = pd.read_csv("../input/nlp-getting-started/train.csv")

In [None]:
# Preview the first rows.
data.head()

In [None]:
# (rows, columns) of the training frame.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Percentage of missing values per column.
100 * data.isna().sum() / data.shape[0]

In [None]:
# Absolute number of missing values per column.
data.isna().sum()

In [None]:
# Eyeball a slice of the `keyword` column.
data.keyword[55:232]

In [None]:
# Replace missing values with explicit placeholder tokens so that the string
# concatenation in `transform_to_one_column` never meets a NaN.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# the `data[...]` selection: that form is chained assignment, and under pandas
# Copy-on-Write it updates a temporary object and leaves `data` untouched.
data["keyword"] = data["keyword"].fillna("oov")
data["location"] = data["location"].fillna("unknown")

In [None]:
# Re-inspect the frame after the missing values were handled.
data.head()

In [None]:
# Remove the `id` column in place.
# NOTE: re-running this cell raises KeyError because the column is already gone.
data.drop("id", axis=1, inplace=True)

In [None]:
# Features are every column except `target`; labels are the `target` column.
x, y = data.drop("target", axis=1), data["target"]

In [None]:
def transform_to_one_column(r):
    """Merge a row's keyword, location and text into one space-separated string.

    `r` is indexed with the keys 'keyword', 'location' and 'text'; every value
    must be a string.
    """
    parts = (r['keyword'], r["location"], r["text"])
    return ' '.join(parts)

In [None]:
# Collapse each row of features into a single string via `transform_to_one_column`.
x = x.apply(transform_to_one_column, axis="columns")

In [None]:
import spacy
# NOTE(review): `nlp` is not referenced again in the visible notebook.
nlp = spacy.load("en_core_web_sm")
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
# NOTE(review): `stop_words` is not referenced again in the visible notebook.
stop_words = set(stopwords.words('english'))
import string
# Punctuation characters that `process` strips from every document.
punct = string.punctuation

In [None]:
def process(s):
    """Normalise a raw document before it is handed to the tokenizer.

    Steps: remove every character listed in `punct`, lower-case, tokenize with
    NLTK's `word_tokenize`, lemmatize each token, and re-join the tokens with
    single spaces.

    Args:
        s: Raw document text.

    Returns:
        The cleaned, lemmatized, space-joined text.
    """
    # A single translate pass replaces one `str.replace` call per punctuation char.
    s = s.translate(str.maketrans('', '', punct))
    tokens = word_tokenize(s.lower())
    # WordNetLemmatizer.lemmatize works on ONE word. The previous code passed
    # the whole re-joined sentence, which came back unchanged, so no
    # lemmatization actually happened.
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

In [None]:
# Clean every document with `process`.
# NOTE: this rebinds `x`, so re-running the cell processes already-cleaned text.
x = x.map(process)

In [None]:
# Bar chart of how many samples fall into each target class.
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(x=y, palette="cool", ax=ax)
ax.set_title("\nDistribution of the Target Feature\n\n", fontsize=35)
plt.show()

In [None]:
def wordcount(r):
    """Return how many whitespace-delimited tokens the string `r` contains."""
    tokens = r.split()
    return len(tokens)

In [None]:
# Count plot of documents grouped by their number of words.
fig, ax = plt.subplots(figsize=(20, 8))
words_per_doc = x.apply(wordcount)
sns.countplot(x=words_per_doc, color="mediumspringgreen", ax=ax)
ax.set_title("\nn° Words per Document\n\n", fontsize=35)
plt.show()

In [None]:
# Put the cleaned text next to its label. The text Series is presumably
# unnamed, so it arrives as column label 0 and is renamed to "text".
mydata = pd.concat([x,y], axis=1).rename({0:"text"}, axis=1)

In [None]:
mydata.head()

In [None]:
# Persist the frame so it can be reloaded from CSV as a Hugging Face dataset.
mydata.to_csv("mydata.csv", index=False)

In [None]:
#Huggingface
!pip install datasets

In [None]:
from datasets import load_dataset, Dataset

In [None]:
# Still the pandas DataFrame at this point.
mydata

In [None]:
# Reload the saved CSV through `datasets` and take its "train" split.
# NOTE: this rebinds `mydata` from a pandas DataFrame to a `datasets` object.
mydata = load_dataset("csv", data_files="./mydata.csv")["train"]

In [None]:
mydata

In [None]:
# Hold out 10% of the rows for validation. The fixed seed keeps the split, and
# therefore every reported metric, reproducible across kernel restarts.
mydata = mydata.train_test_split(test_size=0.1, seed=42)

In [None]:
# Now holds the result of `train_test_split`.
mydata

In [None]:
from transformers import AutoTokenizer

In [None]:
# Tokenizer for the "roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
# Maximum sequence length reported by the tokenizer.
tokenizer.model_max_length

In [None]:
def tokenize(e):
    """Run the module-level `tokenizer` over the "text" field of `e` with truncation enabled."""
    texts = e["text"]
    return tokenizer(texts, truncation=True)

In [None]:
# Run `tokenize` over the dataset in batches with the Hugging Face tokenizer.
md = mydata.map(tokenize, batched= True)

In [None]:
md

In [None]:
md.column_names

In [None]:
# Pull the two splits out by name.
traindata = md["train"]
testdata = md["test"]

In [None]:
# For both splits: drop the raw text, rename the label column to "labels",
# and switch the output format to torch.
traindata, testdata = (
    split.remove_columns(["text"]).rename_column("target", "labels").with_format("torch")
    for split in (traindata, testdata)
)

In [None]:
traindata #after dropping ["text"]

In [None]:
# Peek at the first ten examples.
traindata.select(range(10))

In [None]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

In [None]:
# Collator built from the tokenizer; used as `collate_fn` by every loader.
datacol = DataCollatorWithPadding(tokenizer)
# Batches of 32; only the training loader shuffles.
trainloader = DataLoader(traindata, batch_size=32, shuffle=True, collate_fn=datacol)
testloader = DataLoader(testdata, batch_size=32, collate_fn=datacol)

In [None]:
traindata

In [None]:
#Model part
!pip install accelerate

In [None]:
from accelerate import Accelerator   #for distributed settings training

In [None]:
# Supplies the device and wraps the model, optimizer and loaders further down.
accelerator = Accelerator()

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
checkpoint="roberta-base"
# Sequence-classification model loaded from the checkpoint with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation,
# so the optimizer now comes from `torch.optim`. `eps` and `weight_decay` are
# pinned to the defaults of the old transformers class (the torch defaults are
# 1e-8 and 0.01), which keeps the optimisation behaviour as before.
from torch.optim import AdamW
optim = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)   #5e-5, 6e-6 , 3e-4, 1e-06
#why AdamW over Adam
#https://towardsdatascience.com/why-adamw-matters-736223f31b5d

In [None]:
from tqdm.auto import tqdm
import torch

In [None]:
# Device chosen by the accelerator; the model is moved onto it.
device = accelerator.device
model=model.to(device)
print(device)

In [None]:
# NOTE(review): `datasets.load_metric` is deprecated upstream in favour of the
# separate `evaluate` package — confirm the installed `datasets` still ships it.
from datasets import load_metric
# Both metric objects are shared by the training loop and `validate`.
f1 = load_metric("f1")
acc = load_metric("accuracy")

In [None]:
from copy import deepcopy

In [None]:
# Hand the model, optimizer and loaders to the accelerator.
# The prepared optimizer is bound to `optimizer`; `optim` still names the
# original, un-prepared object.
model, optimizer, trainloader = accelerator.prepare(model, optim, trainloader)
testloader = accelerator.prepare(testloader)

In [None]:
def validate(model):
    """Score `model` on `testloader` and print accuracy and F1.

    Relies on the module-level `testloader`, `accelerator`, `acc` and `f1`
    objects. The model's train/eval mode is left untouched, so the caller
    should call `model.eval()` first.

    Args:
        model: Model whose forward output exposes `.logits`.

    Returns:
        The validation accuracy reported by the `acc` metric.
    """
    # Evaluation needs no gradients; without no_grad every forward pass keeps
    # its autograd graph alive and wastes memory.
    with torch.no_grad():
        for batch in testloader:
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            # Gather once across processes and feed both metrics the same tensors.
            all_predictions = accelerator.gather(predictions)
            all_labels = accelerator.gather(batch["labels"])
            f1.add_batch(predictions=all_predictions, references=all_labels)
            acc.add_batch(predictions=all_predictions, references=all_labels)
    acc_res = acc.compute()["accuracy"]
    print(f"Validation Accuracy: {acc_res:.2f}")
    f_res = f1.compute()["f1"]
    print(f"Validation F1-score: {f_res:.2f}")
    return acc_res

In [None]:
# Fine-tuning loop: one pass over `trainloader` per epoch, followed by
# validation; the weights of the best-validating epoch are kept in
# `best_model_state`.

# Derive the step count from the loader instead of hard-coding it, so the
# average loss and the progress bar stay right if the data or batch size change.
nsteps = len(trainloader)
nepoch = 10
# Start below any reachable accuracy so the first epoch always stores a
# checkpoint and `best_model_state` is guaranteed to exist after the loop.
best_val_acc = float("-inf")
for epoch in range(nepoch):
    model.train()
    print(f"epoch n°{epoch+1}:")
    epoch_loss = 0.0
    for batch in tqdm(trainloader, total=nsteps):
        outputs = model(**batch)
        loss = outputs.loss
        # Accumulate a plain float: summing the loss tensor itself would keep
        # every step's autograd graph reachable for the whole epoch.
        epoch_loss += loss.item()
        accelerator.backward(loss)
        # Step the optimizer returned by `accelerator.prepare`, not the raw
        # `optim`, so the accelerator's wrapper logic is applied.
        optimizer.step()
        optimizer.zero_grad()
        predictions = torch.argmax(outputs.logits, dim=-1)
        f1.add_batch(predictions=predictions, references=batch["labels"])
        acc.add_batch(predictions=predictions, references=batch["labels"])
    av_epoch_loss = epoch_loss / nsteps
    print(f"Training Loss: {av_epoch_loss: .2f}")
    acc_res = acc.compute()["accuracy"]
    print(f"Training Accuracy: {acc_res:.2f}")
    f_res = f1.compute()["f1"]
    print(f"Training F1-score: {f_res:.2f}")
    model.eval()
    val_acc = validate(model)
    if val_acc > best_val_acc:
        print("Achieved best validation accuracy so far. Saving model.")
        best_val_acc = val_acc
        # Deep copy: `state_dict()` returns references to the live parameters,
        # which later epochs keep modifying.
        best_model_state = deepcopy(model.state_dict())
    print("\n\n")

In [None]:
# Restore the weights saved at the epoch with the best validation accuracy.
model.load_state_dict(best_model_state)

In [None]:
# Build the submission inputs with the same steps used for the training data:
# placeholder fill, column merge, text cleaning, tokenization.
sub_data = pd.read_csv("../input/nlp-getting-started/test.csv")
# Assign the filled columns back rather than calling fillna(inplace=True) on a
# column selection: under pandas Copy-on-Write that would leave the NaNs in
# place and make the string concatenation in `transform_to_one_column` fail.
sub_data["keyword"] = sub_data["keyword"].fillna("oov")
sub_data["location"] = sub_data["location"].fillna("unknown")
sub_data = sub_data.drop(columns=["id"])
sub_data = sub_data.apply(transform_to_one_column, axis="columns")
sub_data = sub_data.map(process).to_frame().rename({0:"text"}, axis=1)
sub_data = Dataset.from_pandas(sub_data)
sub_data = sub_data.map(tokenize, batched= True)
sub_data = sub_data.remove_columns(["text"]).with_format("torch")

In [None]:
# Predict a label for every row of the submission set.
sub_loader = DataLoader(sub_data, batch_size=32, collate_fn=datacol)
model.eval()  # make sure dropout is off regardless of what ran before this cell
batch_preds = []
with torch.no_grad():  # inference only: skip building autograd graphs
    for batch in sub_loader:
        # Use the accelerator's device instead of hard-coding CUDA, so the cell
        # also runs on CPU-only kernels.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        batch_preds.append(torch.argmax(outputs.logits, dim=-1).cpu())
# Collect per-batch results and concatenate once. This keeps the integer dtype;
# the old code concatenated int64 predictions onto an empty float tensor.
if batch_preds:
    all_preds = torch.cat(batch_preds)
else:
    all_preds = torch.empty(0, dtype=torch.long)
preds = pd.Series(all_preds.numpy()).astype("int64")

In [None]:
#submission columns should be id, target
# Named `test_ids` rather than `id` so the built-in `id()` is not shadowed for
# the rest of the kernel session.
test_ids = pd.read_csv("../input/nlp-getting-started/test.csv")["id"]
submission = pd.DataFrame({"id": test_ids, "target": preds})
submission.to_csv("submission.csv", index= False)

In [None]:
#checking the submission file for same format
import pandas as pd
# Competition-provided sample file, displayed to compare columns with `submission`.
sample_sub = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
sample_sub.head()
