In [6]:
import numpy as np
import pandas as pd
import torch
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import string
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score

In [1]:
# Load dataset
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    data = {'text': [], 'label': []}
    for line in lines:
        parts = line.strip().split('\t')
        data['text'].append(parts[0])
        data['label'].append(int(parts[1]))
    return pd.DataFrame(data)

In [2]:
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

In [7]:
# Load data from .txt file
file_path = 'imdb_labelled.txt'
df = load_data(file_path)

In [8]:
# Apply preprocessing to text data
df['text'] = df['text'].apply(preprocess_text)


In [9]:
df.head()

Unnamed: 0,text,label
0,a very very very slowmoving aimless movie abou...,0
1,not sure who was more lost the flat character...,0
2,attempting artiness with black white and clev...,0
3,very little music or anything to speak of,0
4,the best scene in the movie was when gerardo i...,1


In [10]:
# Split dataset into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [11]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Tokenize text data and convert to input tensors
def tokenize_texts(texts):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=64,  # Assuming max sequence length of 64
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return input_ids, attention_masks

In [13]:
train_input_ids, train_attention_masks = tokenize_texts(train_texts)
test_input_ids, test_attention_masks = tokenize_texts(test_texts)

In [14]:
# Convert labels to tensors
train_labels = torch.tensor(train_labels.values)
test_labels = torch.tensor(test_labels.values)

In [15]:
# Fine-tune BERT model for binary classification
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3
batch_size = 32

for epoch in range(epochs):
    for i in range(0, len(train_labels), batch_size):
        optimizer.zero_grad()
        outputs = model(input_ids=train_input_ids[i:i+batch_size],
                        attention_mask=train_attention_masks[i:i+batch_size],
                        labels=train_labels[i:i+batch_size])
        loss = outputs.loss
        loss.backward()
        optimizer.step()




In [17]:
# Evaluate the model
def evaluate_model(model, input_ids, attention_masks, labels):
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_masks)
        logits = outputs.logits
        predictions = np.argmax(logits.detach().numpy(), axis=1)
        acc = accuracy_score(labels, predictions)
        precision = precision_score(labels, predictions)
        recall = recall_score(labels, predictions)
        f1 = f1_score(labels, predictions)
    return acc, precision, recall, f1

acc, precision, recall, f1 = evaluate_model(model, test_input_ids, test_attention_masks, test_labels)


In [18]:
print("Accuracy:", acc)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.94
Precision: 0.9217391304347826
Recall: 0.9724770642201835
F1 Score: 0.9464285714285715


In [19]:
# Function to classify individual text inputs
def classify_text(input_text):
    # Preprocess input text
    preprocessed_text = preprocess_text(input_text)
    # Tokenize input text and convert to input tensors
    input_ids, attention_mask = tokenize_texts([preprocessed_text])
    # Classify input text using the trained model
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_label = np.argmax(logits.detach().numpy(), axis=1)[0]
    if predicted_label == 1:
        return "Positive"
    else:
        return "Negative"

# Test the classify_text function with an example input
example_input = "The movie was great!"
classification_result = classify_text(example_input)
print("Classification Result:", classification_result)

Classification Result: Positive
