In [None]:
import kagglehub

# Fetch the latest version of the dataset from Kaggle; `path` is where the files landed
dataset_handle = "jp797498e/twitter-entity-sentiment-analysis"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

In [None]:
# Both CSVs are read with header=None, so columns are positional until renamed
train_csv = path + "/twitter_training.csv"
val_csv = path + "/twitter_validation.csv"

train_df = pd.read_csv(train_csv, header=None)
val_df = pd.read_csv(val_csv, header=None)

In [None]:
# Give both frames the same four column names
column_names = ['ID', 'entity', 'sentiment', 'tweet_content']
train_df.columns = list(column_names)
val_df.columns = list(column_names)

In [None]:
# Discard every row that has a missing value in any column, in both splits
train_df, val_df = train_df.dropna(), val_df.dropna()

In [None]:
# Bar chart of how many training rows fall under each sentiment value
fig, ax = plt.subplots()
sns.countplot(data=train_df, x='sentiment', ax=ax)
ax.set_title('Sentiment Count')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Only the sentiment label and the tweet text are used from here on
unused_columns = ['ID', 'entity']
train_df = train_df.drop(columns=unused_columns)
val_df = val_df.drop(columns=unused_columns)

In [None]:
# Summarise the training frame (column dtypes and non-null counts) after the column drop
train_df.info()

In [None]:
# Learn the label -> integer mapping from the training split only,
# then apply that same mapping to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['sentiment'])
train_df['sentiment'] = label_encoder.transform(train_df['sentiment'])
val_df['sentiment'] = label_encoder.transform(val_df['sentiment'])

In [None]:
def clean_txt(text):
  """Normalise a raw tweet into lowercase ASCII letters and whitespace.

  Steps, in order: lowercase the text, strip URLs (``http...``/``www...``),
  strip ``@mentions`` and the ``#`` sign (the hashtag word itself is kept),
  then drop every character that is not an ASCII letter or whitespace.
  Whitespace is left as-is, so removed tokens can leave runs of spaces.

  Args:
    text: The tweet text. ``None`` and float NaN are treated as an empty
      tweet; any other non-string value is converted with ``str()`` first.

  Returns:
    The cleaned string (possibly empty).
  """
  if not isinstance(text, str):
    # Missing cells surface as None or float NaN; NaN is the only float != itself.
    if text is None or (isinstance(text, float) and text != text):
      return ''
    text = str(text)
  text = text.lower()
  text = re.sub(r'http\S+|www\S+|https\S+', '', text)
  text = re.sub(r'\@\w+|\#', '', text)
  # This also removes digits and punctuation, so no separate digit pass is needed.
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  return text

# Run the cleaner over every tweet and keep the result in a new column
train_df['clean_content'] = train_df['tweet_content'].map(clean_txt)
val_df['clean_content'] = val_df['tweet_content'].map(clean_txt)

In [None]:
# The raw text is no longer needed once clean_content exists
train_df = train_df.drop(columns=['tweet_content'])
val_df = val_df.drop(columns=['tweet_content'])

In [None]:
# Preview the first training rows: encoded sentiment plus cleaned text
train_df.head()

In [None]:
# Preview the first validation rows: encoded sentiment plus cleaned text
val_df.head()

In [None]:
# Convert the frames to Hugging Face Datasets.
# preserve_index=False: once dropna() has removed rows the pandas index is no longer a
# plain RangeIndex, and from_pandas would otherwise carry it along as an extra
# `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
  """Tokenize a batch of rows for use with ``Dataset.map(..., batched=True)``.

  Args:
    examples: A batch from the dataset; only ``examples['clean_content']``
      is read.

  Returns:
    The tokenizer output for the batch, padded/truncated to 128 tokens.
    Plain Python lists are returned (no ``return_tensors``): ``Dataset.map``
    stores results in Arrow anyway, and conversion to torch tensors is done
    later through the dataset's output format.
  """
  return tokenizer(
    examples['clean_content'],
    padding="max_length",
    truncation=True,
    max_length=128,
  )

In [None]:
# Tokenize both splits in batches; each name is rebound to its tokenized dataset
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

In [None]:
# Display the tokenized training dataset to inspect the result of the map step
train_dataset

In [None]:
# Drop the text column now that it has been tokenized, and rename the target
# column from 'sentiment' to 'labels'.
train_dataset = (
    train_dataset
    .remove_columns(['clean_content'])
    .rename_column('sentiment', 'labels')
)
val_dataset = (
    val_dataset
    .remove_columns(['clean_content'])
    .rename_column('sentiment', 'labels')
)

# Have both datasets return torch tensors when indexed
val_dataset.set_format('torch')
train_dataset.set_format('torch')

In [None]:
# Derive the classification head size and label names from the fitted encoder rather
# than hard-coding them, so the model always matches the labels actually present and
# the saved config carries readable class names.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = len(id2label),
    id2label = id2label,
    label2id = label2id,
)

training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 3,
    weight_decay = 0.01,
    load_best_model_at_end = False,
    # Refers to the "accuracy" key of the metrics dict; it only exists if the Trainer
    # is given a compute_metrics function that returns it.
    metric_for_best_model = "accuracy"
)


In [None]:
def compute_metrics(eval_pred):
  """Compute accuracy from the (logits, labels) pair the Trainer passes in.

  Args:
    eval_pred: Evaluation output that unpacks into ``(logits, labels)``.

  Returns:
    A dict with a single ``"accuracy"`` entry.
  """
  logits, labels = eval_pred
  predicted = np.argmax(logits, axis=-1)
  return {"accuracy": accuracy_score(labels, predicted)}


# compute_metrics is supplied so that evaluation reports accuracy, the metric named
# by metric_for_best_model in the training arguments, and not only the loss.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics,
)

trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer (val_dataset) and print the metrics
final_metrics = trainer.evaluate()
print(final_metrics)

In [None]:
# Score the validation split; predictions.predictions holds the per-class logits
predictions = trainer.predict(val_dataset)

y_pred = np.argmax(predictions.predictions, axis=-1)
# Use the label array returned together with the predictions instead of re-reading
# the torch-formatted dataset column: it is a numpy array aligned row-for-row
# with the logits above.
y_true = predictions.label_ids

acc = accuracy_score(y_true, y_pred)
print("Accuracy:", acc)

In [None]:
# Google Colab only: mount Google Drive at /content/drive so the following cell
# can save the model and tokenizer under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Write the fine-tuned model and the tokenizer to Google Drive (two separate folders)
model_dir = '/content/drive/MyDrive/Twitter-Sentiment-Analysis-BERT'
tokenizer_dir = '/content/drive/MyDrive/Twitter-Sentiment-Analysis-BERT-Tokenizer'

trainer.save_model(model_dir)
tokenizer.save_pretrained(tokenizer_dir)