In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

### Load Dataset

In [None]:
# Download data
import requests
request = requests.get("https://drive.google.com/uc?export=download&id=1wHt8PsMLsfX5yNSqrt2fSTcb8LEiclcf")
with open("data.zip", "wb") as file:
    file.write(request.content)

# Unzip data
import zipfile
with zipfile.ZipFile('data.zip') as zip:
    zip.extractall('data')

In [None]:
# Load data and set labels
data_complaint = pd.read_csv('data/complaint1700.csv')
data_complaint['label'] = 0
data_non_complaint = pd.read_csv('data/noncomplaint1700.csv')
data_non_complaint['label'] = 1

# Concatenate complaining and non-complaining data
data = pd.concat([data_complaint, data_non_complaint], axis=0).reset_index(drop=True)

data.head(5)

Unnamed: 0,id,airline,tweet,label
0,80938,United,@united I'm having issues. Yesterday I rebooke...,0
1,10959,United,@united kinda feel like the $6.99 you charge f...,0
2,130813,SouthWest,"Livid in Vegas, delayed, again&amp; again&amp;...",0
3,146589,United,@united the most annoying man on earth is on m...,0
4,117579,United,"@united The last 2 weeks I've flown wit u, you...",0


In [None]:
data.drop(['airline'], inplace=True, axis=1)

In [None]:
test_data = pd.read_csv('data/test_data.csv')

# Keep important columns
test_data = test_data[['id', 'tweet']]

# Display 5 samples from the test data
test_data

Unnamed: 0,id,tweet
0,33,@SouthwestAir get your damn act together. Don'...
1,58,@AmericanAir horrible at responding to emails....
2,135,@AmericanAir hey where is your crew? Flight aa...
3,159,Ok come on we are late let's goooo @united
4,182,@AmericanAir since you are now affiliated with...
...,...,...
4550,173504,Another day another lie from @SouthwestAir. It...
4551,173526,On @united flight from hell. Diverted from EWR...
4552,173573,@united I will never use your airline after th...
4553,173599,"Flight back to SFO delayed over an hour, no up..."


In [None]:
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
from sklearn.model_selection import train_test_split

X = data.tweet.values
y = data.label.values

In [None]:
X = [text_preprocessing(tweet) for tweet in X]
X[:10]

["I'm having issues. Yesterday I rebooked for 24 hours after I was supposed to fly, now I can't log on & check in. Can you help?",
 "kinda feel like the $6.99 you charge for in flight Wi-Fi is ridiculous. AND it sucks, slow, or doesn't work. #anythingtomakeabuck",
 'Livid in Vegas, delayed, again& again&again, decided to cancel a flight and combine two, then waited on crew, now pilots.',
 'the most annoying man on earth is on my flight. what can you do to help me?',
 "The last 2 weeks I've flown wit u, you have given me 4 reasons to convince me it was a bad decision. Time 2 go back 2 @SouthwestAir",
 '#AmericanAirlines Flight 1179 Miami to DC. Gate closes at least 19 minutes before the flight. Agents show up and lie to me.',
 'It appears employees have a fear of autism.',
 "Kids have no idea we are heading to WDW tue for an extended vacation. Can't wait to see their face boarding #AKV #DisneySide",
 'Four hour delay on my flight from dallas to houston. My longest delay i ever had for f

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=2020)

### Preparing Data for Training

In [None]:
!pip install transformers datasets torch scikit-learn



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)




In [None]:
from datasets import Dataset
import pandas as pd

train_dataset = Dataset.from_pandas(pd.DataFrame({"text": X_train, "labels": y_train}))
val_dataset = Dataset.from_pandas(pd.DataFrame({"text": X_val, "labels": y_val}))

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)


Map:   0%|          | 0/2380 [00:00<?, ? examples/s]

Map:   0%|          | 0/1020 [00:00<?, ? examples/s]

In [None]:
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 3 sentiment labels

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!mkdir /content/drive/MyDrive/bert_checkpoints/
!ls /content/drive/MyDrive/bert_checkpoints/ -alh

mkdir: cannot create directory ‘/content/drive/MyDrive/bert_checkpoints/’: File exists
total 0


In [None]:
# Define the Google Drive directory where you want to save the results
output_dir = '/content/drive/MyDrive/bert_checkpoints/'

training_args = TrainingArguments(
    output_dir=output_dir,               # Use Google Drive for saving model checkpoints
    evaluation_strategy="epoch",         # Evaluate at the end of every epoch
    save_strategy="epoch",               # Save checkpoints at the end of every epoch
    save_total_limit=3,                  # Limit to last 3 checkpoints
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)



In [None]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [69]:
# Train model
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [65]:
# Evaluate model
trainer.evaluate()

{'eval_loss': 0.4213866591453552,
 'eval_runtime': 443.8374,
 'eval_samples_per_second': 2.298,
 'eval_steps_per_second': 0.288,
 'epoch': 2.0}

In [None]:
# Define Google Drive directory for saving model
final_model_dir = '/content/drive/MyDrive/bert_final_model/'

# After training, save the final model
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


### Testing

In [67]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Define the Google Drive directory where the final model is saved
model_dir = '/content/drive/MyDrive/bert_final_model/'

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

# Set model to evaluation mode
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [68]:
# Define a test string (e.g., a tweet)
test_string = "I love this product, it's amazing!"

# Tokenize the input string
inputs = tokenizer(test_string, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)

# Get predicted class (sentiment)
predictions = torch.argmax(outputs.logits, dim=-1)

# Map prediction to sentiment (assuming 0=Negative, 1=Neutral, 2=Positive)
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}

# Output the sentiment
predicted_sentiment = label_map[predictions.item()]
print(f"Sentiment: {predicted_sentiment}")


Sentiment: Neutral


In [71]:
val_inputs = tokenizer(X_val, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Step 8: Get model predictions
with torch.no_grad():
    val_outputs = model(**val_inputs)

# Step 9: Get predicted classes (sentiments)
val_predictions = torch.argmax(val_outputs.logits, dim=-1)

NameError: name 'accuracy_score' is not defined

In [72]:
from sklearn.metrics import accuracy_score
# Step 10: Calculate accuracy
accuracy = accuracy_score(y_val, val_predictions.numpy())

# Step 11: Output the accuracy
print(f"Validation Accuracy: {accuracy:.2f}")

Validation Accuracy: 0.81
