<a href="https://colab.research.google.com/github/DeeeTeeee/Sentiment-Classification-FineTunning/blob/main/Fine_tuning_Hugging_face_trainer_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis with Hugging Face

## Application of Hugging Face Text Classification Model Fine-tuning

In [None]:
# Install the required libraries (uncomment the lines below on a fresh runtime)
# !pip install datasets
# !pip install sentencepiece
# !pip install transformers datasets
# !pip install transformers[torch]
# !pip install accelerate
# !pip install "accelerate>=0.20.1"  # keep the quotes: an unquoted `>` is a shell redirect
# !pip install huggingface_hub
# !pip3 install -q transformers datasets

In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
import plotly
import string
import re
import matplotlib.pyplot as plt
from google.colab import files
import nltk
from nltk.corpus import stopwords
from datasets import load_dataset
from sklearn.model_selection import train_test_split

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig, AdamW
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import mean_squared_error
from huggingface_hub import notebook_login



In [None]:
# Authenticate with the Hugging Face Hub; TrainingArguments further down sets
# push_to_hub=True and trainer.push_to_hub() is called after training.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Mount Google Drive at /content/drive; the Train/Test CSVs are read from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the raw train/test CSVs from the mounted Drive folder
df_train = pd.read_csv('/content/drive/MyDrive/Natural Language Processing/zindi_challenge/data/Train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Natural Language Processing/zindi_challenge/data/Test.csv')

# Discard every row that has at least one missing value
df_train = df_train.dropna(axis=0, how='any')
df_test = df_test.dropna(axis=0, how='any')


In [None]:
# Lift the column-width limit (None = unlimited) so long tweets are displayed in full
pd.set_option('display.max_colwidth', None)

Because this project is basically about fine-tuning, we are not concentrating on the EDA part. However, we still need to check and understand the dataset,
so we will use the `CRISP-DM framework`.
# Cleaning and little EDA

In [None]:
# Peek at 4 random rows of the training frame
df_train.sample(4)

In [None]:
# Peek at 4 random rows of the test frame
df_test.sample(4)

# Data cleaning

Here I:\
Remove unnecessary columns\
Clean the 'safe_text' column\
Remove emojis and other special characters\
Remove punctuation


In [None]:
# Discard the 'tweet_id' column from both frames
df_train = df_train.drop(columns=['tweet_id'])
df_test = df_test.drop(columns=['tweet_id'])

In [None]:
# Clean the 'safe_text' column. Train and test go through exactly the same
# three steps, in the same order.
#
# Every .str.replace call states `regex=` explicitly: pandas >= 2.0 defaults
# to regex=False, and under that default a compiled pattern is rejected
# with a ValueError.

# Matches any character that is neither a word character, whitespace, nor one
# of the symbols listed inside the class (this is what strips emojis).
emojis = re.compile(r'[^\w\s@#$%^*()<>/|}{~:]')

# 1. Remove the literal <url> placeholder tag
df_train['safe_text'] = df_train['safe_text'].str.replace('<url>', '', regex=False)
df_test['safe_text'] = df_test['safe_text'].str.replace('<url>', '', regex=False)

# 2. Remove emojis and other special characters
df_train["safe_text"] = df_train["safe_text"].str.replace(emojis, '', regex=True)
df_test["safe_text"] = df_test["safe_text"].str.replace(emojis, '', regex=True)

# 3. Remove ASCII punctuation (this also drops the symbols kept in step 2)
punctuation = string.punctuation
df_train["safe_text"] = df_train["safe_text"].str.translate(str.maketrans('', '', punctuation))
df_test["safe_text"] = df_test["safe_text"].str.translate(str.maketrans('', '', punctuation))

In [None]:
# Spot-check 4 random training rows after cleaning
df_train.sample(4)

In [None]:
# First 4 test rows after cleaning
df_test.head(4)

To make the fine-tuning smoother, let us make the case of the text uniform. I will make it `lowercase`.

In [None]:
# Turn the safe_text column into lowercase in both frames.
# NOTE(review): the checkpoint loaded later is 'xlnet-base-cased', a cased
# model; confirm that discarding case information here is intended.
df_train["safe_text"] = df_train["safe_text"].str.lower()
df_test["safe_text"] = df_test["safe_text"].str.lower()

### EDA

In [None]:
# Bar chart of how many training rows carry each label
label_counts = df_train['label'].value_counts()

fig, ax = plt.subplots()
ax.bar(label_counts.index, label_counts.values)
ax.set_xlabel('Label')
ax.set_ylabel('Count')
ax.set_title('Distribution of Labels')
plt.show()

In [None]:
# Distinct label values in the training frame (transform_labels later maps them to class ids)
df_train['label'].unique()

Distribution of Label Agreement:

In [None]:
# Histogram of the annotator 'agreement' values in the training frame
fig, ax = plt.subplots()
ax.hist(df_train['agreement'])
ax.set_xlabel('Agreement')
ax.set_ylabel('Count')
ax.set_title('Distribution of Label Agreement')
plt.show()

In [None]:
from collections import Counter

# Build one corpus string from every cleaned tweet (the word-cloud cell reuses `text`)
text = ' '.join(df_train['safe_text'])

# Whitespace-tokenise the corpus and tally the tokens
words = text.split()
word_counts = Counter()
word_counts.update(words)

# Show the ten most frequent tokens together with their counts
print(word_counts.most_common(10))


In [None]:
# Character count of every cleaned tweet
text_lengths = df_train['safe_text'].map(len)

# Histogram of those character counts
fig, ax = plt.subplots()
ax.hist(text_lengths)
ax.set_xlabel('Text Length')
ax.set_ylabel('Count')
ax.set_title('Distribution of Text Lengths')
plt.show()

In [None]:
from wordcloud import WordCloud

# Build a word cloud from the concatenated corpus string `text`
wordcloud = WordCloud().generate(text)

# Render the cloud as an image with the axes hidden
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()


In [None]:
# Correlation between the sentiment label and the annotator agreement
label_col = df_train['label']
agreement_col = df_train['agreement']
correlation = label_col.corr(agreement_col)

# Report the coefficient
print(f"Correlation: {correlation}")


In [None]:
# First 5 rows of the cleaned, lower-cased training frame
df_train.head(5)

In [None]:
# First 5 rows of the cleaned, lower-cased test frame
df_test.head(5)

## Removing Stop Words

In [None]:
# Download the stop words (only required for the first time)
nltk.download('stopwords')

# English stop-word set used by the filter below
stop_words = set(stopwords.words('english'))


def remove_stop_words(sentence):
    """Return `sentence` with every word found in `stop_words` dropped.

    The sentence is split on whitespace, each word is compared
    case-insensitively against `stop_words`, and the surviving words are
    re-joined with single spaces.
    """
    return ' '.join(word for word in sentence.split() if word.lower() not in stop_words)


# Apply the same filter to BOTH frames so the train and test text saved in the
# next cell have been through identical preprocessing.
# NOTE(review): stop-word lists usually contain negations such as "not";
# confirm that removing them does not hurt the sentiment signal.
df_train['safe_text'] = df_train['safe_text'].apply(remove_stop_words)
df_test['safe_text'] = df_test['safe_text'].apply(remove_stop_words)


In [None]:
# Show the stop words that were removed: space-separated so the individual
# words stay readable, and sorted so the output is the same on every run
# (a set has no fixed iteration order).
' '.join(sorted(stop_words))

In [None]:
# First 4 training rows after stop-word removal
df_train.head(4)

#### Save the DataFrames to CSV files:

In [None]:
# Persist both cleaned frames as CSV files in the working directory (row index omitted)
for csv_name, frame in (('df_train.csv', df_train), ('df_test.csv', df_test)):
    frame.to_csv(csv_name, index=False)

Download the CSV files to your local machine:

## Import the Cleaned data:

In [None]:
# Disable the Weights & Biases (W&B) integration for this run
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Reload the cleaned training data written earlier as 'df_train.csv'.
# The same relative path as the save cell is used, so this cell does not
# depend on the working directory being /content.
df = pd.read_csv('df_train.csv')

# Drop rows with any missing value; a tweet whose text became empty during
# cleaning is read back from the CSV as NaN and is removed here.
df = df.dropna()

In [None]:
# Split the cleaned data 80/20 into train and eval subsets, stratified on
# 'label' and seeded (random_state=42) so the split is reproducible.
# NOTE: the name `eval` shadows Python's built-in eval(); it is kept because
# the following cells refer to it.
train, eval = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

In [None]:
# Display the first 5 rows of the training split
train.head(5)

In [None]:
# Column dtypes and non-null counts of the training split
train.info()

In [None]:
# Display the first rows of the evaluation split
eval.head()

In [None]:
# Label values present in the evaluation split
eval.label.unique()

In [None]:
# Print the (rows, columns) shape of each split to confirm the 80/20 division
print(f"Shape of the train dataframe: {train.shape}")
print(f"Shape of the eval dataframe: {eval.shape}")

In [None]:
import os
import pandas as pd

# Make sure the target directory exists. exist_ok=True makes the call a no-op
# when the directory is already there and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs("../data", exist_ok=True)

# Save the train/eval splits so that load_dataset can read them back
train.to_csv("../data/train_subset.csv", index=False)
eval.to_csv("../data/eval_subset.csv", index=False)


In [None]:
# Load the train/eval CSV splits into one dataset object keyed 'train' / 'eval'.
# The files were written by pandas `to_csv`, whose default encoding is UTF-8,
# so they are decoded as UTF-8 here. Decoding them as ISO-8859-1 would turn
# every non-ASCII character into mojibake before tokenization.
dataset = load_dataset(
    'csv',
    data_files={
        'train': '../data/train_subset.csv',
        'eval': '../data/eval_subset.csv',
    },
    encoding="utf-8",
)

In [None]:

#!pip install transformers

# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in `text` with generic placeholders.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'; every other token is kept unchanged. The tokens are
    then re-joined with single spaces.
    """
    def _mask(token):
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# "roberta-base"
# "xlnet-base-cased"
# "bert-base-uncased"

# Load the tokenizer for the checkpoint that is fine-tuned below.
# The model cell also loads "xlnet-base-cased"; keep the two names in sync.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')

In [None]:
# Function to transform labels
def transform_labels(label):
    """Map a raw sentiment label to the class id expected by the model.

    `label` is one dataset example (a mapping with a 'label' entry).
    -1 (Negative) -> 0, 0 (Neutral) -> 1, 1 (Positive) -> 2; any other value
    falls back to 0. The result is returned as ``{'labels': <class id>}``.
    """
    raw = label['label']
    # (raw value, class id) pairs, checked in order
    for known, class_id in ((-1, 0), (0, 1), (1, 2)):
        if raw == known:
            return {'labels': class_id}
    return {'labels': 0}

# Function to tokenize data
def tokenize_data(example):
    """Tokenize the 'safe_text' field with the module-level `tokenizer`.

    Parameters
    ----------
    example : mapping
        The example (or batch of examples, when used with
        ``dataset.map(..., batched=True)``) whose 'safe_text' entry is tokenized.

    Returns
    -------
    The encoding returned by `tokenizer`; `dataset.map` adds its fields as
    new columns.
    """
    # truncation=True caps every sequence at the model's maximum length.
    # Without it an over-long text yields more tokens than the model accepts
    # and the forward pass fails.
    return tokenizer(example['safe_text'], padding='max_length', truncation=True)

# Columns that the model does not consume once tokens and class ids exist
remove_columns = ['label', 'safe_text', 'agreement']

# 1) tokenize the tweets in batches, 2) convert the raw labels to class ids
#    and drop the columns listed above
dataset = (
    dataset
    .map(tokenize_data, batched=True)
    .map(transform_labels, remove_columns=remove_columns)
)

In [None]:
# Display the processed dataset object
dataset

In [None]:
from transformers import TrainingArguments
import transformers

# Hyper-parameters of the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints are written, and whether the result goes to the Hub
    output_dir="mytest_trainer_base-cased",
    push_to_hub=True,
    # Schedule: number of passes over the training set and batch size per device
    num_train_epochs=10,
    per_device_train_batch_size=16,
    # Evaluate, checkpoint and log once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # NOTE(review): logging_steps presumably only matters with
    # logging_strategy="steps"; confirm whether it is still needed
    logging_steps=100,
    # After training, reload the checkpoint that scored best during evaluation
    load_best_model_at_end=True,
)


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint for sequence classification.
# num_labels=3 matches the class ids 0/1/2 produced by transform_labels.
model = AutoModelForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=3)

In [None]:
# Train and Evaluation Datasets: shuffle each split with a fixed seed
# (append .select(range(n)) to the train line to train on a subset only)
SHUFFLE_SEED = 25
train_dataset = dataset['train'].shuffle(seed=SHUFFLE_SEED)
eval_dataset = dataset['eval'].shuffle(seed=SHUFFLE_SEED)

## Alternative: carve the eval set out of the train split instead, e.g.
# shuffled = dataset['train'].shuffle(seed=10)
# train_dataset = shuffled.select(range(40000))
# eval_dataset = shuffled.select(range(40000, 41000))

In [None]:
def compute_metrics(eval_pred):
    """Compute the root-mean-squared error between predicted and true class ids.

    Parameters
    ----------
    eval_pred : tuple
        ``(logits, labels)`` pair. ``logits`` holds one row of class scores
        per example; if it is itself a tuple of model outputs, its first
        element is taken as the class scores. ``labels`` holds the true
        class ids.

    Returns
    -------
    dict
        ``{"rmse": <float>}``, a plain Python float.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra tensors next to the logits
        logits = logits[0]

    # Predicted class = index of the highest score in each row
    predictions = np.argmax(logits, axis=-1)

    # RMSE is computed directly with NumPy: the `squared=False` keyword of
    # sklearn's mean_squared_error is deprecated and absent from recent
    # releases, which would make every evaluation step fail. Casting to float
    # avoids wrap-around when subtracting unsigned/integer arrays.
    errors = np.asarray(predictions, dtype=float) - np.asarray(labels, dtype=float)
    return {"rmse": float(np.sqrt(np.mean(np.square(errors))))}

In [None]:
# Model Training Setup
from transformers import Trainer

# Wire the model, hyper-parameters, data splits, tokenizer and metric function
# together; every argument is passed by keyword for readability.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Launch the learning process: fine-tune the model with the settings in training_args
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rmse
1,0.7459,0.782984,0.753658
2,0.6459,0.634595,0.658027
3,0.5262,0.656488,0.658787


Epoch,Training Loss,Validation Loss,Rmse
1,0.7459,0.782984,0.753658
2,0.6459,0.634595,0.658027
3,0.5262,0.656488,0.658787
4,0.4144,0.693382,0.653835


TrainOutput(global_step=2000, training_loss=0.5831099319458007, metrics={'train_runtime': 3253.5331, 'train_samples_per_second': 9.834, 'train_steps_per_second': 0.615, 'total_flos': 8418576913625088.0, 'train_loss': 0.5831099319458007, 'epoch': 4.0})

In [None]:
# Upload the trained model and its training artefacts to the Hugging Face Hub
trainer.push_to_hub()

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file runs/Jul13_04-02-07_60ba339b8491/events.out.tfevents.1689220934.60ba339b8491.3211.4:   0%|        …

To https://huggingface.co/DeeeTeeee01/mytest_trainer_roberta-base
   6566dfb..bc07886  main -> main

   6566dfb..bc07886  main -> main

To https://huggingface.co/DeeeTeeee01/mytest_trainer_roberta-base
   bc07886..49273df  main -> main

   bc07886..49273df  main -> main



'https://huggingface.co/DeeeTeeee01/mytest_trainer_roberta-base/commit/bc07886c580e95ffabde0ea3d822fa71a5852bb3'

In [None]:
# Launch the final evaluation on eval_dataset (reports the loss and the rmse metric)
trainer.evaluate()

{'eval_loss': 0.6345946192741394,
 'eval_rmse': 0.6580273550544841,
 'eval_runtime': 58.5712,
 'eval_samples_per_second': 34.146,
 'eval_steps_per_second': 4.268,
 'epoch': 4.0}

Some checkpoints of the model are automatically saved locally in `mytest_trainer_base-cased/` (the `output_dir` set in `TrainingArguments`) during the training.

In [None]:
#!pip show transformers
#!pip install transformers==2.4.0

In [None]:
# Reload the fine-tuned checkpoint from the Trainer's output directory.
# AutoModelForSequenceClassification is used instead of the bare AutoModel so
# that the trained classification head is restored. AutoModel discards the
# classifier weights and returns a headless encoder that cannot predict labels.
model = AutoModelForSequenceClassification.from_pretrained("mytest_trainer_base-cased")

Some weights of the model checkpoint at mytest_trainer_roberta-base were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at mytest_trainer_roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
