In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

In [2]:
df = pd.read_csv('reddit_posts.csv')

df['text'] = df['text'].str.lower()

unique_classes = df['target'].unique()
class_weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=df['target'])
class_weight_dict = dict(zip(unique_classes, class_weights))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF vectorizer and Logistic Regression classifier
model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(class_weight=class_weight_dict)
)

# Train the model
model.fit(X_train, y_train)

# Predict the sentiment on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.36      0.20      0.26        20
           2       0.54      0.60      0.57        47
           4       0.51      0.58      0.54        33

    accuracy                           0.51       100
   macro avg       0.47      0.46      0.46       100
weighted avg       0.50      0.51      0.50       100



In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Create a pipeline with TF-IDF vectorizer and SVM classifier
pipeline = make_pipeline(
    TfidfVectorizer(),
    SVC(class_weight='balanced')
)

# Define the parameter grid to search over
param_grid = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'svc__C': [0.1, 1, 10, 100]
}

# Setup the grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1)

# Train the model
grid_search.fit(X_train, y_train)

# Best model after grid search
best_model = grid_search.best_estimator_

# Predict the sentiment on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred, zero_division=0))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           2       0.54      0.91      0.68        47
           4       0.65      0.39      0.49        33

    accuracy                           0.56       100
   macro avg       0.40      0.44      0.39       100
weighted avg       0.47      0.56      0.48       100



In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('reddit_posts.csv')

# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove special chars
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]  # Lemmatize and remove stopwords
    text = ' '.join(words)
    return text

# Apply preprocessing
df['text'] = df['text'].apply(preprocess_text)

# Class weighting
unique_classes = df['target'].unique()
class_weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=df['target'])
class_weight_dict = dict(zip(unique_classes, class_weights))

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

# Vectorization and Model Pipeline
def build_and_evaluate_model(model, vectorizer=TfidfVectorizer()):
    pipeline = make_pipeline(vectorizer, model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"Classification report for {model.__class__.__name__} with {vectorizer.__class__.__name__}:\n", classification_report(y_test, y_pred))

# Logistic Regression
build_and_evaluate_model(LogisticRegression(class_weight=class_weight_dict))

# Naive Bayes
build_and_evaluate_model(MultinomialNB(), CountVectorizer())

# Support Vector Machine
build_and_evaluate_model(SVC(class_weight=class_weight_dict))

# Random Forest
build_and_evaluate_model(RandomForestClassifier(class_weight=class_weight_dict))

# Gradient Boosting
build_and_evaluate_model(GradientBoostingClassifier())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alche\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alche\AppData\Roaming\nltk_data...


Classification report for LogisticRegression with TfidfVectorizer:
               precision    recall  f1-score   support

           0       0.30      0.15      0.20        20
           2       0.49      0.57      0.53        47
           4       0.46      0.48      0.47        33

    accuracy                           0.46       100
   macro avg       0.42      0.40      0.40       100
weighted avg       0.44      0.46      0.44       100

Classification report for MultinomialNB with CountVectorizer:
               precision    recall  f1-score   support

           0       0.33      0.10      0.15        20
           2       0.55      0.68      0.61        47
           4       0.50      0.55      0.52        33

    accuracy                           0.52       100
   macro avg       0.46      0.44      0.43       100
weighted avg       0.49      0.52      0.49       100

Classification report for SVC with TfidfVectorizer:
               precision    recall  f1-score   support


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for RandomForestClassifier with TfidfVectorizer:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           2       0.54      0.98      0.70        47
           4       0.73      0.33      0.46        33

    accuracy                           0.57       100
   macro avg       0.42      0.44      0.39       100
weighted avg       0.50      0.57      0.48       100

Classification report for GradientBoostingClassifier with TfidfVectorizer:
               precision    recall  f1-score   support

           0       0.40      0.20      0.27        20
           2       0.63      0.91      0.75        47
           4       0.64      0.42      0.51        33

    accuracy                           0.61       100
   macro avg       0.56      0.51      0.51       100
weighted avg       0.59      0.61      0.57       100



In [10]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Load and preprocess dataset
df = pd.read_csv('reddit_posts.csv')
df['text'] = df['text'].apply(preprocess_text)  # Assuming preprocess_text is defined

# Encode target labels
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['target'])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

# Define a PyTorch dataset
class TextDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

# Parameters
MAX_LEN = 128
BATCH_SIZE = 16

# Tokenizer for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create data loaders
train_dataset = TextDataset(X_train.to_numpy(), y_train, tokenizer, MAX_LEN)
test_dataset = TextDataset(X_test.to_numpy(), y_test, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Training and evaluation functions go here
# For example:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define loss function and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# Training loop for BERT
def train_model(model, data_loader, loss_fn, optimizer, device, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets
        )
        
        loss = outputs.loss
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    return correct_predictions.double() / n_examples, np.mean(losses)

# Define the evaluation loop
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )
            
            loss = outputs.loss
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

# Implement the training and evaluation loops
# For example, call train_model and eval_model in a training loop for several epochs

# After training, you can save the model, make predictions, or evaluate it on the test set


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import numpy as np
import torch
from tqdm import tqdm

# Number of epochs
EPOCHS = 3

# Training and evaluation
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_model(
        model=model,
        data_loader=train_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        n_examples=len(train_dataset)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model=model,
        data_loader=test_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(test_dataset)
    )

    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

# Save the trained model
torch.save(model.state_dict(), 'bert_sentiment_analysis.pth')

# Predictions (you can also load the model before making predictions if needed)
model = model.eval()  # Set model to evaluation mode

# Example: Predicting on the first batch from the test set
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        print(f"Predictions: {preds}")
        break  # Remove break to predict on the whole test set

# Evaluate the model on the entire test set
test_acc, test_loss = eval_model(
    model=model,
    data_loader=test_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(test_dataset)
)

print(f'Test loss {test_loss} accuracy {test_acc}')


Epoch 1/3
----------


KeyError: 180