In [118]:
from datasets import load_dataset
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
import joblib
from sklearn.metrics import accuracy_score

In [2]:
ds = load_dataset("SocialGrep/the-reddit-dataset-dataset", "comments",split="train")

Found cached dataset the-reddit-dataset-dataset (C:/Users/bibek/.cache/huggingface/datasets/SocialGrep___the-reddit-dataset-dataset/comments/1.0.0/e1425cc8beebba4388da97f897d2f9275f462e1fb78b1bc4f89b5f4276ee2475)


In [3]:
df = ds.to_pandas()[["body","sentiment"]]

In [4]:
df

Unnamed: 0,body,sentiment
0,Spatial problem: Suitability of new locations ...,0.0772
1,Have you tried toying around with GDELT or Ali...,0.0000
2,Damn random internet person of whom I know not...,-0.3851
3,Ah nice one. Best of luck with the baby. If yo...,0.9136
4,I was about to write and say this shouldn't be...,0.0762
...,...,...
54843,See http://code.reddit.com/wiki/API and http:/...,
54844,Careful of the licence on this one.,0.1531
54845,Also a great example of exposing an API with v...,0.8545
54846,"From the overview:\n""We have collected packet ...",0.5789


In [32]:
def clean_text(text):
    # Remove URLs
    
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\n\n.+?\n\n','', text, flags=re.MULTILINE)
    # Remove special characters (keeping only letters, numbers, and basic punctuation)
    text = re.sub(r'[^A-Za-z0-9\s,.\'"]+', ' ', text)
    text = re.sub(r'\n',' ', text)
    text = re.sub(r'n\'t',' not', text)
    text = re.sub(r'\'m', ' am', text)
    text = re.sub(r'\'s',' is', text)
    text = re.sub(r'[".,\']+', '', text)
    text = " ".join(text.split())
    text = text.lower()
    return text

In [33]:
def assign_sentiment_class(value):
    sentiment_class = 0
    if(value>0):
        sentiment_class = 1
    elif(value<0):
        sentiment_class = -1
    return sentiment_class

In [34]:
df['sentiment_class'] = df['sentiment'].apply(assign_sentiment_class)

In [35]:
df["clean_text"] = df["body"].apply(clean_text)

In [36]:
row = 133
df["clean_text"][row]

'did u ever find this also in a bind for a school project lol i assumed this would be easy to find'

In [37]:
df["body"][row]

'did u ever find this? also in a bind for a school project lol i assumed this would be easy to find'

In [38]:
df['sentiment_class'][row]

1

In [39]:
df.to_csv("processed_text.csv")

In [40]:
# Function to check if a row is noisy
def is_noisy(text, min_word_count=3):
    # Check if text is mostly numeric
    if re.fullmatch(r'\d+', text):
        return True
    # Check if the text has fewer than the minimum word count
    if len(text.split()) < min_word_count:
        return True
    return False

# Filter out noisy rows
df_cleaned = df[~df['clean_text'].apply(is_noisy)]

In [41]:
df_cleaned.shape,df.shape

((47658, 4), (54848, 4))

In [42]:
df_cleaned = df_cleaned[["clean_text","sentiment_class"]]

In [43]:
df_cleaned

Unnamed: 0,clean_text,sentiment_class
0,spatial problem suitability of new locations f...,1
1,have you tried toying around with gdelt or ali...,0
2,damn random internet person of whom i know not...,-1
3,ah nice one best of luck with the baby if you ...,1
4,i was about to write and say this should not b...,1
...,...,...
54841,full list here,0
54842,this was posted in another thread,0
54844,careful of the licence on this one,1
54845,also a great example of exposing an api with v...,1


In [44]:
# Split data
train_df, val_df = train_test_split(df_cleaned, test_size=0.3, random_state=42,shuffle=True)

In [45]:
train_df = train_df[:1000]
val_df = val_df[:300]

In [46]:
train_df

Unnamed: 0,clean_text,sentiment_class
39808,hey u u olegispe i am removing post because it...,1
37199,maybe this dataset england specific though,0
34452,great since the images of each model can be ea...,1
9049,a laptop or laptop computer is a small portabl...,1
19170,there is a dataset from the faa i think you ca...,0
...,...,...
7207,thank you very much can i dm you for some help...,1
47541,what is the format of the file i was able to d...,0
34586,mental health datasets are notoriously hard to...,1
11766,well even if that thought is true then you d e...,1


In [47]:
# Define a custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

In [87]:
# Create dataset and dataloader
batch_size = 16  # You can adjust the batch size based on your available memory
dataset_train = TextDataset(train_df['clean_text'].tolist())
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=False)
dataset_val = TextDataset(val_df['clean_text'].tolist())
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

In [88]:
# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [89]:
# train_embeddings = []
# for text in train_df["clean_text"]:
#     inputs = tokenizer(text,padding=True, truncation=True, return_tensors="pt")
#     # Get the hidden states from the last layer of BERT
#     with torch.no_grad():
#         outputs = model(**inputs)
#         cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
#         train_embeddings.extend(cls_embeddings)


In [90]:
# Extract embeddings in batches
train_embeddings = []
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    for batch in dataloader_train:
        # Tokenize the batch of sentences at once
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        outputs = model(**inputs)
        # Get the [CLS] token embeddings for each sentence in the batch
        cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
        train_embeddings.extend(cls_embeddings)

In [91]:
# val_embeddings = []
# for text in val_df["clean_text"]:
#     inputs = tokenizer(text,padding=True, truncation=True, return_tensors="pt")
#     # Get the hidden states from the last layer of BERT
#     with torch.no_grad():
#         outputs = model(**inputs)
#         cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
#         val_embeddings.extend(cls_embeddings)

In [97]:
# Extract embeddings in batches
val_embeddings = []
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    for batch in dataloader_val:
        # Tokenize the batch of sentences at once
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        outputs = model(**inputs)
        # Get the [CLS] token embeddings for each sentence in the batch
        cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
        val_embeddings.extend(cls_embeddings)


In [65]:
# Train logistic regression model
logreg = LogisticRegression()
logreg.fit(train_embeddings, train_df["sentiment_class"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [66]:

pred_labels = logreg.predict(val_embeddings)
print("Validation Accuracy:", accuracy_score(val_df["sentiment_class"], pred_labels))


Validation Accuracy: 0.71


In [67]:
from sklearn.svm import SVC

In [68]:
# Create an SVC object
svm_model = SVC()

# Fit the model to the training data
svm_model.fit(train_embeddings, train_df["sentiment_class"])

In [69]:
pred_labels = svm_model.predict(val_embeddings)
print("Validation Accuracy:", accuracy_score(val_df["sentiment_class"], pred_labels))


Validation Accuracy: 0.7133333333333334


In [105]:
import torch
import torch.nn as nn
import torch.optim as optim

class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SentimentClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [106]:
input_dim = 768  # From your embedding shape
hidden_dim = 128  # Adjust as needed
output_dim = 3  # Assuming 3 sentiment classes (-1, 0, 1)

model = SentimentClassifier(input_dim, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [114]:
X_train = torch.tensor(train_embeddings, dtype=torch.float32)
y_train = torch.tensor(train_df["sentiment_class"].values + 1, dtype=torch.long)  # Assuming you have sentiment labels

In [115]:
X_train.shape, y_train.shape

(torch.Size([1000, 768]), torch.Size([1000]))

In [117]:
num_epochs = 10  # Adjust as needed
batch_size = 16  # Adjust as needed

for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(X_train), batch_size):
        batch_X = X_train[i:i+batch_size]
        batch_y = y_train[i:i+batch_size]
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

Epoch 1/10, Loss: 0.4063225984573364
Epoch 2/10, Loss: 0.36372947692871094
Epoch 3/10, Loss: 0.33103638887405396
Epoch 4/10, Loss: 0.30589941143989563
Epoch 5/10, Loss: 0.2771449089050293
Epoch 6/10, Loss: 0.24609558284282684
Epoch 7/10, Loss: 0.2369849979877472
Epoch 8/10, Loss: 0.18568888306617737
Epoch 9/10, Loss: 0.1628016233444214
Epoch 10/10, Loss: 0.12317706644535065


In [120]:

# Convert `val_embeddings` to a PyTorch tensor
X_val = torch.tensor(val_embeddings, dtype=torch.float32)

# Shift labels to be non-negative (0, 1, 2)
y_val = torch.tensor(val_df["sentiment_class"].values + 1, dtype=torch.long)

# Set the model to evaluation mode
model.eval()

# Make predictions
with torch.no_grad():
    outputs = model(X_val)
    _, predicted = torch.max(outputs, 1)  # Get predicted class indices

# Calculate accuracy
accuracy = accuracy_score(y_val.cpu().numpy(), predicted.cpu().numpy())
print(f"Validation Accuracy: {accuracy}")

Validation Accuracy: 0.69


In [121]:
# Save the model
torch.save(model.state_dict(), "sentiment_model.pth")