In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem. project_path (defined in the
# following cell) points inside this mount, so this has to run before any
# dataset is read or any tokenized batch is written.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Root folder of the project on the mounted Google Drive. The CSV files are read
# from its "Dataset" subfolder and the tokenized batches are written below it.
project_path = '/content/drive/MyDrive/NLP Projects/Sentiment analysis'
# Number of examples stored in each saved batch file; used as the default
# batch_size of batch_tokenize_and_save_with_labels.
data_batch_count = 20000

In [10]:
import pandas as pd

# Column names are passed explicitly; with `names=` and no `header=` argument
# pandas treats the first line of each CSV as data rather than as a header.
column_names = ['score', 'title', 'text']

train_csv = f'{project_path}/Dataset/train.csv'
test_csv = f'{project_path}/Dataset/test.csv'

# Only the leading rows of each file are loaded (1,000,000 train / 100,000 test).
train_dataset = pd.read_csv(train_csv, names=column_names, nrows=1000000)
test_dataset = pd.read_csv(test_csv, names=column_names, nrows=100000)

In [11]:
# Shift the class ids down by one so that the labels start at 0.
# The untouched labels are preserved in a 'raw_score' column and 'score' is
# always recomputed from it, which makes this cell idempotent: the original
# `score = score - 1` subtracted again on every re-run and silently produced
# labels of -1/0.
for dataset in (train_dataset, test_dataset):
    if 'raw_score' not in dataset.columns:
        dataset['raw_score'] = dataset['score']
    dataset['score'] = dataset['raw_score'] - 1

In [12]:
# Print the first training example one field at a time.
first_example = train_dataset.iloc[0]
for field in ('score', 'title', 'text'):
    print(first_example[field])

# Number of training rows per label.
print(train_dataset['score'].value_counts())

1
Stuning even for the non-gamer
This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
score
1    505678
0    494322
Name: count, dtype: int64


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Hold out 5% of the training rows for validation. The fixed random_state makes
# the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_dataset['text'],
    train_dataset['score'],
    test_size=0.05,
    shuffle=True,
    random_state=42,
)

# Shuffle test texts and labels together so they stay aligned.
test_texts, test_labels = shuffle(
    test_dataset['text'],
    test_dataset['score'],
    random_state=42,
)

In [14]:
# One summary line per split: number of texts, then number of labels.
split_summaries = [
    f"train dataset count {len(train_texts)} - {len(train_labels)}",
    f"validation dataset count {len(val_texts)} - {len(val_labels)}",
    f"test dataset count {len(test_texts)} - {len(test_labels)}",
]
for summary_line in split_summaries:
    print(summary_line)

train dataset count 950000 - 950000
validation dataset count 50000 - 50000
test dataset count 100000 - 100000


In [15]:
# Import required libraries
import re
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK 'punkt' resource (NLTK only logs a message when the package is
# already up to date).
# NOTE(review): word_tokenize is imported above but is not called in any of the
# visible cells, so this download may be unnecessary — confirm before removing.
nltk.download('punkt')

def clean_text(text):
    """Normalize one review text before tokenization.

    Steps, in order: lowercase the text, delete runs of digits, replace every
    non-word character with a space, then collapse whitespace runs and strip
    the ends.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as missing; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, or ``''`` when the input is missing.
    """
    # Missing cells reach Series.apply() as None/NaN; calling .lower() on them
    # would raise AttributeError and abort cleaning of the whole Series.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert text to lowercase
    text = text.lower()

    text = re.sub(r'\d+', '', text)  # Remove numbers
    # \W keeps underscores, because "_" counts as a word character.
    text = re.sub(r'\W', ' ', text)  # Replace special characters with spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    return text

# Run clean_text over every split; each Series is rebound to its cleaned version.
train_texts, val_texts, test_texts = (
    split_texts.apply(clean_text)
    for split_texts in (train_texts, val_texts, test_texts)
)

print("Text cleaning completed for both datasets.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Text cleaning completed for both datasets.


In [16]:
import torch
from transformers import BertTokenizer

# Load the pre-trained 'bert-base-uncased' tokenizer once at module level;
# batch_tokenize_and_save_with_labels below reads this global for every batch.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization and saving function with labels
def batch_tokenize_and_save_with_labels(texts, labels, file_prefix, batch_size=data_batch_count):
    """Tokenize texts in fixed-size batches and save each batch with its labels.

    Every batch is written with ``torch.save`` to
    ``{project_path}/Dataset/processed_{file_prefix}_data/{file_prefix}_batch_{k}.pt``
    (``k`` starts at 1) as a dict with the keys ``input_ids``,
    ``attention_mask`` and ``labels``. The output folder is created when it
    does not exist yet.

    Args:
        texts: Sequence of strings to tokenize (callers pass lists).
        labels: Sequence of integer labels aligned with ``texts``.
        file_prefix: Split name used in the output folder and file names.
        batch_size: Number of examples per saved file; must be positive.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if
            ``batch_size`` is not positive.
    """
    import os  # local import: the surrounding notebook cell does not import os

    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )
    if batch_size <= 0:
        # A negative step would make range() empty and silently save nothing.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # torch.save does not create missing parent folders, so make sure the
    # destination exists before the first batch is written.
    output_dir = f"{project_path}/Dataset/processed_{file_prefix}_data"
    os.makedirs(output_dir, exist_ok=True)

    print(f'Start processing {file_prefix} data')
    for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
        batch_texts = texts[start:start + batch_size]
        batch_labels = labels[start:start + batch_size]

        # Tokenize the text batch. padding="max_length" with truncation=True
        # gives every example the same length, so the tensors can be stacked.
        tokenized_batch = tokenizer(batch_texts, padding="max_length", truncation=True, return_tensors="pt")

        # Keep the tokenized texts together with their labels
        batch_data = {
            'input_ids': tokenized_batch['input_ids'],
            'attention_mask': tokenized_batch['attention_mask'],
            'labels': torch.tensor(batch_labels, dtype=torch.long)
        }

        # Save the batch
        torch.save(batch_data, f"{output_dir}/{file_prefix}_batch_{batch_number}.pt")

    print(f'Finished processing {file_prefix} data')

# Tokenize and persist each split under its own file prefix.
for split_name, split_texts, split_labels in (
    ("train", train_texts, train_labels),
    ("validation", val_texts, val_labels),
    ("test", test_texts, test_labels),
):
    batch_tokenize_and_save_with_labels(
        split_texts.tolist(),
        split_labels.tolist(),
        file_prefix=split_name,
    )



Start processing train data
Finished processing train data
Start processing validation data
Finished processing validation data
Start processing test data
Finished processing test data
