In [47]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.nn.utils.rnn import pad_sequence

In [7]:
# Step 1: Read the CSV file.
# A forward slash is used as the path separator so the relative path resolves
# on Windows, macOS and Linux alike (a backslash separator only works on Windows).
# The encoding is passed explicitly; presumably the file is not valid UTF-8 — verify.
df = pd.read_csv('Resources/train.csv', encoding='ISO-8859-1')

In [8]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB


In [9]:
# Display the frame; pandas abbreviates the rendering to the first and last rows.
df

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,night,31-45,Ghana,31072940,227540.0,137
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,morning,46-60,Greece,10423054,128900.0,81
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,noon,60-70,Grenada,112523,340.0,331
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,night,70-100,Guatemala,17915568,107160.0,167


## Cleaning

In [23]:
# Map the sentiment strings onto integer class ids.
# fit() returns the encoder itself, so it is created and fitted in one step,
# then used to transform the very same column.
encoder = LabelEncoder().fit(df['sentiment'])
df['sentiment'] = encoder.transform(df['sentiment'])

# Resulting encoding (LabelEncoder sorts the class names):
#   negative -> 0
#   neutral  -> 1
#   positive -> 2

In [24]:
# Clean up missing values and make sure both text columns hold only strings.

# Replace missing values with empty strings first, so that the string cast
# below cannot turn them into the literal text 'nan'.
df['selected_text'] = df['selected_text'].fillna('')
df['text'] = df['text'].fillna('')

# Cast every entry to str. Both columns get the cast: previously 'text' was
# only passed through fillna('') a second time and was never converted.
df['selected_text'] = df['selected_text'].astype(str)
df['text'] = df['text'].astype(str)

In [25]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
# NOTE(review): the vocabulary must match the checkpoint the classifier is
# loaded from — confirm both use the same name.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [26]:
# Sanity check: count the tokens the tokenizer produces for a long tweet
# (presumably the longest one in the data — verify), to help pick max_length.
sample_tweet = "Ahhh, I slept through the game. I`m gonna try my best to watch tomorrow though. I hope we play Army..."
tokens = tokenizer.tokenize(sample_tweet)
print(len(tokens))

29


In [27]:
def _encode_tweet(tweet):
    """Return the token ids for one tweet, truncated to at most 64 tokens.

    The ids come from ``tokenizer.encode``; the stored outputs show each list
    starting with id 101 and ending with id 102.
    """
    return tokenizer.encode(tweet, truncation=True, max_length=64)


# Tokenize the tweets: one list of token ids per row of 'selected_text'.
# NOTE(review): this encodes 'selected_text', not the full 'text' column —
# confirm that is the intended model input.
df['tokens'] = df['selected_text'].apply(_encode_tweet)

In [28]:
# Display the frame again to inspect the encoded 'sentiment' column and the
# newly added 'tokens' column.
df

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),tokens
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",1,morning,0-20,Afghanistan,38928346,652860.0,60,"[101, 1045, 1036, 1040, 2031, 5838, 1010, 2065..."
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,0,noon,21-30,Albania,2877797,27400.0,105,"[101, 17111, 2080, 6517, 102]"
2,088c60f138,my boss is bullying me...,bullying me,0,night,31-45,Algeria,43851044,2381740.0,18,"[101, 18917, 2033, 102]"
3,9642c003ef,what interview! leave me alone,leave me alone,0,morning,46-60,Andorra,77265,470.0,164,"[101, 2681, 2033, 2894, 102]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",0,noon,60-70,Angola,32866272,1246700.0,26,"[101, 4124, 1997, 1008, 1008, 1008, 1008, 1010..."
...,...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,0,night,31-45,Ghana,31072940,227540.0,137,"[101, 1040, 2439, 102]"
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",0,morning,46-60,Greece,10423054,128900.0,81,"[101, 1010, 2123, 1036, 1056, 2486, 102]"
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,2,noon,60-70,Grenada,112523,340.0,331,"[101, 8038, 2100, 2204, 2005, 2119, 1997, 2017..."
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,2,night,70-100,Guatemala,17915568,107160.0,167,"[101, 2021, 2009, 2001, 4276, 2009, 1008, 1008..."


In [29]:
# Features are the per-tweet token-id lists; the target is the encoded sentiment.
X, y = df['tokens'], df['sentiment']

In [30]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Load the pre-trained BERT encoder with a new 3-way classification head
# (one output per sentiment class: negative / neutral / positive).
#
# The checkpoint must match both the model class and the tokenizer.
# 'distilbert-base-uncased' is a DistilBERT checkpoint, so it does not belong
# in BertForSequenceClassification, and the tweets were tokenized with the
# 'bert-base-uncased' vocabulary. Hence 'bert-base-uncased' is loaded here.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Base uncased BERT checkpoint (same name as the tokenizer)
    num_labels = 3, # Number of output labels: 3 sentiment classes
    output_attentions = False, # Whether the model returns attentions weights
    output_hidden_states = False, # Whether the model returns all hidden-states
)

# TODO: move the model to the GPU once CUDA is set up. The batches must then
# be moved to the same device as the model.
# model.cuda()

# Set up the optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
# Turn every token-id list of the training split into its own 1-D tensor.
X_train_tensors = list(map(torch.tensor, X_train.tolist()))

# Stack the variable-length tensors into one (num_tweets, longest_length)
# tensor. pad_sequence appends its default padding value, 0, to the shorter
# sequences until they are as long as the longest tweet in the training split.
train_inputs = pad_sequence(X_train_tensors, batch_first=True)

In [49]:
# Convert the training labels into a torch tensor.
# .tolist() hands torch plain Python ints; presumably this is what makes the
# tensor int64 regardless of the pandas/numpy integer width — verify.
train_labels = torch.tensor(y_train.tolist())

In [50]:
# Pair every padded input row with its label.
train_data = TensorDataset(train_inputs, train_labels)

# Draw the training examples in random order.
train_sampler = RandomSampler(train_data)

# Batch iterator over the training set, 32 examples per batch.
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=32,
    sampler=train_sampler,
)

In [51]:
# Number of full passes over the training data.
epochs = 4

# The schedule spans (epochs) x (batches per epoch) steps in total.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule over total_steps with zero warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Average training loss of each epoch, collected for plotting.
loss_values = list()

In [52]:
# Fine-tune the classifier for `epochs` passes over the training data and
# record the average loss of every epoch in `loss_values`.

# Run on whatever device the model currently lives on, so the loop keeps
# working unchanged if the model is moved to a GPU.
device = next(model.parameters()).device

# For each epoch...
for epoch in range(0, epochs):
    # The big training part!!!
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        # The inputs were padded with 0, which is also BERT's [PAD] token id.
        # Mask those positions (1 = real token, 0 = padding) so the model does
        # not attend to the padding; without a mask every position is attended.
        b_attention_mask = (b_input_ids != 0).long()

        # Clear gradients left over from the previous step.
        model.zero_grad()
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_attention_mask,
            labels=b_labels,
        )
        # With labels supplied, the first output is the classification loss.
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Advance the learning-rate schedule once per batch.
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

KeyboardInterrupt: 

In [None]:
# Save the fine-tuned model once training has finished.
# TODO: create the model directory first, then uncomment the line below.
# model.save_pretrained('model_directory')