<a href="https://colab.research.google.com/github/ErfanRasti/MachineLearningProjects/blob/main/NaturalLanguageProcessing/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <font color='green'>NLP Project</font>

<div class="alert alert-block alert-warning">
<b>
Amirkabir University of Technology<br>
Fall 2022<br>
Course: Machine Learning<br>
Instructor: Dr. Seyedin<br>
<b><font color='green'>Authors:</font></b><br>
Name: Amir Azad<br />
Student Number: 9823004</div>
Name: Erfan Rasti<br />
Student Number: 9823034</div>


***

# <font color='purple'>Importing Required Packages</font>

In [42]:
# Third-party packages this notebook relies on. Uncomment these lines to install
# them when running in a fresh environment (for example a new Colab runtime).
# ! pip install transformers
# ! pip install datasets
# ! pip install sentence-transformers
# ! pip install evaluate


In [43]:
# from transformers import AutoTokenizer
import pandas as pd
import torch
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn import metrics
from sentence_transformers import SentenceTransformer
from transformers import TrainingArguments, Trainer
from transformers import DistilBertForSequenceClassification
from transformers import DistilBertTokenizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

# <font color='purple'>Loading the dataset from Hugging Face</font>

In [45]:
# Load the 'hate' configuration of the 'tweet_eval' dataset through the
# Hugging Face `datasets` library. The result is bound to `dataset` and is
# reassigned by the preprocessing and tokenization cells further down.
from datasets import load_dataset
dataset = load_dataset('tweet_eval', 'hate')



  0%|          | 0/3 [00:00<?, ?it/s]

In [46]:
# Show the available splits together with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2970
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})


In [47]:
# Peek at one raw training example and one raw test example (text + label)
# before any preprocessing is applied.
print(dataset['train'][0])
print(dataset['test'][2])

{'text': '@user nice new signage. Are you not concerned by Beatlemania -style hysterical crowds crongregating on you…', 'label': 0}
{'text': '@user @user Those People Invaded Us!!! They DO NOT BELING HERE & HAVE NO RIGHTS! Its #AmericaFIRST! Open Your House To Them If Your That IGNORANT! & Yes Im A #Christian Too! #NODACA!', 'label': 1}


# <font color='purple'>Preprocessing Data</font>


In [48]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Stop-word sets already read from the NLTK corpus, keyed by language, so the
# corpus is read once per language instead of once per word.
_STOP_WORD_CACHE = {}


def _load_stop_words(language='english'):
    """Return the NLTK stop words for `language` as a frozenset, caching the result."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def preprocess_sentence(sentence, stop_words=None):
    """Clean a single sentence for tokenization.

    The sentence goes through these steps, in order:
      1. every character in `string.punctuation` is removed;
      2. every run of digits is removed;
      3. the text is split on whitespace and words found in `stop_words`
         are dropped (the comparison is case-sensitive);
      4. the remaining words are joined with single spaces.

    Args:
        sentence: The raw text to clean.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used; it is read once and cached.

    Returns:
        The cleaned sentence; an empty string if nothing is left.
    """
    if stop_words is None:
        stop_words = _load_stop_words()

    # remove punctuation
    sentence = sentence.translate(_PUNCTUATION_TABLE)
    # remove numbers
    sentence = re.sub(r'\d+', '', sentence)
    # remove stopwords; split()/join also trims and collapses whitespace, so no
    # separate strip() is needed
    return ' '.join(word for word in sentence.split()
                    if word not in stop_words)


In [49]:
def preprocess_dataset(dataset):
    """Return `dataset` with the 'text' field of every example cleaned.

    Each example is passed through `preprocess_sentence` via the dataset's
    `map` method; the result of `map` is returned unchanged.
    """
    def _clean_example(example):
        # Only the 'text' column is produced here; `map` is given nothing else.
        return {'text': preprocess_sentence(example['text'])}

    return dataset.map(_clean_example)


In [None]:
# Clean the text of every split. Note that `dataset` is reassigned here, so
# from this cell on it refers to the preprocessed data, not the raw download.
dataset = preprocess_dataset(dataset)
# Print the same two examples shown earlier to see the effect of the cleaning.
print(dataset['train'][0])
print(dataset['test'][2])

  0%|          | 0/9000 [00:00<?, ?ex/s]

In [None]:
# Either a BERT-base or a DistilBERT checkpoint can be used as the model; this
# notebook uses the cased DistilBERT tokenizer, the same checkpoint name that
# the fine-tuning section loads the model from.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

def tokenize_dataset(dataset):
    """Tokenize the 'text' column of `dataset` with the notebook's `tokenizer`.

    The tokenizer is applied in batched mode with truncation enabled and
    `padding='max_length'`. No explicit `max_length` is given, so the padded
    length is whatever the tokenizer defaults to (presumably the model's
    maximum input length -- confirm against the tokenizer documentation).

    Returns:
        The result of `dataset.map`.
    """
    def _encode_batch(batch):
        return tokenizer(batch['text'], truncation=True, padding='max_length')

    encoded = dataset.map(_encode_batch, batched=True)
    return encoded

In [None]:
# Tokenize every split. `dataset` is reassigned again: from here on it holds
# the tokenizer's output produced from the preprocessed text.
dataset = tokenize_dataset(dataset)
# Print one encoded example from each split as a sanity check.
print(dataset['train'][0])
print(dataset['validation'][0])
print(dataset['test'][2])

# <font color='purple'>Fine-tune a model</font>

You should complete this section

In [None]:
# Pretrained cased DistilBERT with a sequence-classification head for two
# labels. `dropout=0.3` is passed to `from_pretrained` together with the label
# count (presumably overriding the checkpoint's configured dropout -- confirm).
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels=2,
    dropout=0.3
    )

# Training configuration:
#   - checkpoints and logs are written under "test_trainer";
#   - evaluation and checkpoint saving both happen once per epoch;
#   - 5 epochs, learning rate 5e-6, weight decay 0.01, 4 examples per device
#     per training step, PyTorch's AdamW optimizer;
#   - `load_best_model_at_end=True`: no selection metric is set here, so the
#     library's default criterion decides which checkpoint counts as "best".
training_args = TrainingArguments(output_dir="test_trainer",
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  learning_rate=5e-6,
                                  num_train_epochs=5,
                                  weight_decay=0.01,
                                  per_device_train_batch_size=4,
                                  load_best_model_at_end=True,
                                  optim="adamw_torch")


In [None]:
import numpy as np
import evaluate


# Averaging mode for precision / recall / F1. It must be supplied when a metric
# is *computed*: a keyword given to `evaluate.load` is not applied to the
# scoring call, so the metrics would silently fall back to their default
# (binary) averaging and disagree with the weighted scores reported on the
# test set.
METRIC_AVERAGE = 'weighted'

# (metric, extra keyword arguments for its `compute` call) pairs. The metrics
# are evaluated one by one so that each receives only the arguments it accepts.
metrics_to_compute = [
    (evaluate.load("accuracy"), {}),
    (evaluate.load("precision"), {'average': METRIC_AVERAGE}),
    (evaluate.load("recall"), {'average': METRIC_AVERAGE}),
    (evaluate.load("f1"), {'average': METRIC_AVERAGE}),
]


def compute_metrics(eval_pred):
    """Compute validation metrics for the `Trainer`.

    Args:
        eval_pred: A `(logits, labels)` pair. The predicted class of each
            example is the argmax of its logits along the last axis.

    Returns:
        A single dict merging the results of every metric in
        `metrics_to_compute` (accuracy, precision, recall and F1; the last
        three use `METRIC_AVERAGE` averaging).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    results = {}
    for metric, compute_kwargs in metrics_to_compute:
        results.update(metric.compute(predictions=predictions,
                                      references=labels,
                                      **compute_kwargs))
    return results




In [None]:
# Bundle the model, the training configuration, the tokenized train split and
# the tokenized validation split; `compute_metrics` is called on the
# validation predictions at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics,
)

# Run fine-tuning with the settings in `training_args`.
trainer.train()

# <font color='purple'>Evaluation</font>

You should complete this section

In [None]:
# Run the fine-tuned model on the test split. `predictions` holds the raw
# logits; the predicted class is the argmax over the class axis.
test_logits = trainer.predict(dataset['test']).predictions
test_pred = test_logits.argmax(axis=1)
test_true = dataset['test']['label']

# Probability assigned to class 1 ("hate"). ROC analysis needs a continuous
# score per example: with hard 0/1 predictions the curve collapses to a single
# operating point and the AUC only describes that one threshold.
test_score = torch.softmax(torch.as_tensor(test_logits), dim=-1)[:, 1].numpy()

cm = confusion_matrix(test_true, test_pred)
roc_auc = roc_auc_score(test_true, test_score)

print('Accuracy: ', accuracy_score(test_true, test_pred))
print('Precision: ', precision_score(test_true, test_pred, average='weighted'))
print('Recall: ', recall_score(test_true, test_pred, average='weighted'))
print('F1: ', f1_score(test_true, test_pred, average='weighted'))
print('ROC AUC: ', roc_auc)

# ROC curve over all thresholds of the class-1 probability.
fpr, tpr, thresholds = roc_curve(test_true, test_score)
roc_plot = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
roc_plot.plot()
plt.title('ROC curve (test split)')
plt.grid()
plt.show()

# Confusion matrix of the hard predictions; label 0 = not hate, label 1 = hate.
cm_plot = ConfusionMatrixDisplay(
    cm, display_labels=['not hate', 'hate'])
cm_plot.plot(cmap='Blues')
plt.title('Confusion matrix (test split)')
plt.show()


# <font color='purple'>References</font>

1.   https://huggingface.co/metrics
2.   https://huggingface.co/docs/transformers/training
