In [None]:
import os
import warnings
import torch
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

# --- Run-wide constants ---
seed = 42
model_name = "distilbert-base-uncased"

# --- Input file locations ---
TRAIN_FILE = "../input/nlp-getting-started/train.csv"
TEST_FILE = "../input/nlp-getting-started/test.csv"
SUBMISSION_FILE = "../input/nlp-getting-started/sample_submission.csv"

# --- Library options ---
sns.set_theme()
pd.set_option("display.max_colwidth", None)  # None = do not truncate cell contents
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Read both frames from the paths declared in the configuration cell instead
# of bare file names resolved against the current working directory.
train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)

# "labels" is the column name the later cells select when formatting the
# dataset for training.
train_df = train_df.rename(columns={"target": "labels"})
num_labels = train_df["labels"].nunique()  # number of distinct classes
train_df.sample(5, random_state=seed)

In [None]:
# Report how many rows each frame holds.
print(f"Number of training samples: {train_df.shape[0]}")
print(f"Number of test samples: {test_df.shape[0]}")

In [None]:
# Per-column count of missing values in the training frame.
train_df.isna().sum()

In [None]:
# Remove the three columns below from both frames, modifying each in place.
for df in (train_df, test_df):
    df.drop(columns=["id", "keyword", "location"], inplace=True)

In [None]:
import re
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
# Silence MarkupResemblesLocatorWarning when it is raised from the bs4 module.
warnings.filterwarnings(
    action="ignore",
    category=MarkupResemblesLocatorWarning,
    module="bs4",
)

def clean_text(text):
    """Normalise one raw text into lowercase, single-spaced letters.

    Steps, in order: strip HTML markup, drop http/https links, replace every
    run of non-letter characters (digits, punctuation, whitespace) with a
    single space, then trim and lowercase.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; words are separated by exactly one space.
    """
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove html tags
    # Raw strings keep the regex escapes from being read as (invalid) Python
    # string escapes.
    text = re.sub(r"https?://\S+", " ", text)  # Remove links
    # Collapsing *runs* of non-letters in one pass avoids re-introducing
    # repeated spaces, which happened when whitespace was squeezed first and
    # each punctuation mark was then turned into its own space.
    text = re.sub(r"[^a-zA-Z]+", " ", text)

    return text.strip().lower()


# Clean the "text" column of both frames with clean_text.
for df in (train_df, test_df):
    df["text"] = df["text"].apply(clean_text)

In [None]:
# Show five seeded random rows of the training frame.
train_df.sample(n=5, random_state=seed)

In [None]:
# Words per text. str.split() with no separator splits on any run of
# whitespace and drops empty tokens, so repeated spaces are not counted as
# extra "words" (which split(" ") would do).
word_counts = train_df["text"].str.split().str.len()

plt.title("Text length")
plt.xlabel("n words")
plt.ylabel("Count")
plt.hist(word_counts)
plt.show()

In [None]:
# Shuffle every row (frac=1) with a fixed seed, then renumber the index.
train_df = (
    train_df
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

In [None]:
from datasets import Dataset

# Wrap both pandas frames as `datasets.Dataset` objects.
train_ds, test_ds = (Dataset.from_pandas(frame) for frame in (train_df, test_df))


In [None]:
# Hold out 10% of the rows with a fixed seed. Later cells read the two parts
# back under the "train" and "test" keys.
train_ds = train_ds.train_test_split(seed=seed, test_size=0.1)

In [None]:
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
from transformers import AutoModel, AutoTokenizer


# Load the tokenizer and the base model for `model_name`, then move the model
# onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch.

    Args:
        batch: Mapping whose ``"text"`` key holds the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts when
        called with padding and truncation enabled.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded


# batch_size=None hands each dataset to `tokenize` as a single batch, so
# padding=True pads all of its rows to one common length.
train_encoded, test_encoded = (
    ds.map(tokenize, batched=True, batch_size=None)
    for ds in (train_ds, test_ds)
)

In [None]:
# Return the listed columns as torch tensors. The test data has no labels.
train_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)
test_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

In [None]:
def extract_embedding(batch):
    """Compute one embedding vector per row of a tokenized batch.

    Only the batch entries whose key is listed in
    ``tokenizer.model_input_names`` are moved to ``device`` and passed to
    ``model``; other entries are ignored. Gradients are disabled.

    Args:
        batch: Mapping of column name to tensor.

    Returns:
        ``{"embedding": array}`` where ``array`` is the NumPy copy of the
        model's ``last_hidden_state`` at sequence position 0 for each row.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        hidden_states = model(**model_inputs).last_hidden_state

    first_token = hidden_states[:, 0]  # position 0 of every sequence
    return {"embedding": first_token.cpu().numpy()}

  
# Feed the encoder fixed-size chunks rather than an entire split in a single
# forward pass (batch_size=None), which can exhaust device memory. The rows
# were already padded to one common length per split during tokenization, so
# chunking does not change what the model sees for any row.
EMBEDDING_BATCH_SIZE = 128
train_embedding = train_encoded.map(
    extract_embedding,
    batched=True,
    batch_size=EMBEDDING_BATCH_SIZE,
)

In [None]:
import umap
from sklearn.preprocessing import MinMaxScaler


# Rescale every embedding dimension to [0, 1] before the 2-D projection.
train_scaled = MinMaxScaler().fit_transform(train_embedding["train"]["embedding"].numpy())

# UMAP's layout is stochastic; passing the notebook seed makes the projection
# (and therefore the scatter plot) identical on every run.
reductor = umap.UMAP(
    n_neighbors=5,
    n_components=2,
    min_dist=0.3,
    random_state=seed,
).fit(train_scaled)

train_reduced = pd.DataFrame({"X": reductor.embedding_[:, 0],
                              "Y": reductor.embedding_[:, 1],
                              "labels": train_embedding["train"]["labels"]})

In [None]:
# Scatter the 2-D projection, coloured by label.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x="X",
    y="Y",
    hue="labels",
    data=train_reduced,
)
plt.show()

In [None]:
# Drop the names used only for the projection plot.
del train_scaled
del train_reduced

In [None]:
# Features are the extracted embeddings; targets are the labels. The "train"
# part is used for fitting and the "test" part for validation.
X_train, y_train = (train_embedding["train"][column] for column in ("embedding", "labels"))
X_valid, y_valid = (train_embedding["test"][column] for column in ("embedding", "labels"))

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression fitted on the extracted embeddings.
# fit() returns the estimator itself, so construction and fitting are chained.
logistic_regression = LogisticRegression(
    max_iter=2000,
    random_state=seed,
).fit(X_train, y_train)

print(f"Logistic regression score: {logistic_regression.score(X_valid, y_valid):.4f}")

In [None]:
# Drop the baseline's feature/target names.
del X_train
del X_valid
del y_train
del y_valid

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


batch_size = 32
# Log about once per epoch. Clamp to 1 so a training split smaller than one
# batch cannot produce logging_steps == 0.
logging_steps = max(1, len(train_encoded["train"]) // batch_size)
training_args = TrainingArguments(output_dir="distilbert_disaster",
                                  report_to="tensorboard",
                                  evaluation_strategy="epoch",
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  logging_steps=logging_steps,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  save_strategy="no",
                                  disable_tqdm=False,
                                  push_to_hub=False,
                                  seed=seed,  # tie the trainer's RNG to the notebook-wide seed
                                  log_level="error")

# Seed PyTorch *before* building the model: weights that are not in the
# checkpoint (e.g. the classification head) are randomly initialised at load
# time, which happens before the Trainer gets a chance to seed anything.
torch.manual_seed(seed)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

In [None]:
# Accuracy is taken from scikit-learn instead of `datasets.load_metric`, which
# is deprecated (and removed in datasets 3.x) and fetches a metric script.
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(pred):
    """Compute validation accuracy and F1 from a ``(logits, labels)`` pair.

    Args:
        pred: Object that unpacks into ``(logits, labels)``; the predicted
            class is the argmax of ``logits`` over the last axis.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": float(accuracy_score(labels, preds)),
        "f1": f1_score(labels, preds),
    }

In [None]:
from transformers import Trainer

# Assemble the Trainer from the classifier, its arguments, the two splits of
# the encoded training data, the tokenizer and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded["train"],
    eval_dataset=train_encoded["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted class for every validation row: argmax over the output scores.
valid_preds = trainer.predict(train_encoded["test"])
valid_preds = np.argmax(valid_preds.predictions, axis=-1)

# confusion_matrix(y_true, y_pred): rows index the true class, columns the
# predicted class.
cnf_matrix = confusion_matrix(train_encoded["test"]["labels"], valid_preds)

# The heatmap draws matrix rows along the y-axis and columns along the x-axis,
# so predictions belong on x and true classes on y.
sns.heatmap(cnf_matrix, annot=True, fmt="d")
plt.xlabel("Pred class")
plt.ylabel("True class")
plt.show()

In [None]:
# Positions of up to ten validation rows whose prediction differs from the label.
misclasified = np.flatnonzero(
    valid_preds != train_encoded["test"]["labels"].numpy()
)[:10]
true_labels = train_encoded["test"].select(misclasified)

# Show text, true label and predicted label side by side.
pd.DataFrame(
    {
        "text": true_labels["text"],
        "labels": true_labels["labels"].numpy(),
        "preds": valid_preds[misclasified],
    }
)

In [None]:
# Predict the test set and keep the highest-scoring class per row.
preds = trainer.predict(test_encoded)
preds = preds.predictions.argmax(axis=-1)

# Fill the sample submission's target with the predictions and write it out
# without the index column.
submission = pd.read_csv(SUBMISSION_FILE)
submission.target = preds
submission.to_csv("submission.csv", index=False)