In [95]:
import pandas as pd
import json




In [111]:
# Step 1: Load the CSV Data
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    return df

# Step 2: Extract Text and Labels
def extract_text_and_label(sentiment):
    """Extract text and label from sentiment column in JSON format."""
    if isinstance(sentiment, str):
        try:
            annotations = json.loads(sentiment.replace("'", "\""))
            if annotations and annotations[0].get("labels"):
                text = annotations[0]["text"]
                label = annotations[0]["labels"][0]
                return text, label
        except json.JSONDecodeError:
            pass
    return None, None

def preprocess_data(df):
    """Apply text and label extraction and remove rows with missing labels."""
    df[["extracted_text", "label"]] = df["sentiment"].apply(lambda x: pd.Series(extract_text_and_label(x)))
    df = df.dropna(subset=["extracted_text", "label"])
    # Map labels to binary values for subjectivity classification (optional)
    df["label"] = df["label"].map({"Neutral": 0, "Positive": 1, "Negative": 1})
    return df[["extracted_text", "label"]]


In [112]:
len(df)



184

In [62]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [113]:
from sklearn.model_selection import train_test_split
def split_data(df):
    """Split the data into training and testing sets."""
    X = df["extracted_text"].tolist()  # Convert to list to avoid indexing issues
    y = df["label"].tolist()
    
    # Perform train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test


In [64]:
# Step 4: Tokenize Data
def tokenize_data(X_train, X_test):
    """Tokenize the text data for BERT."""
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
    test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)
    return train_encodings, test_encodings


In [65]:
# Step 5: Convert to Dataset Class
class SubjectivityDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [66]:
# Step 6: Define Training Arguments
def get_training_args():
    return TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )

In [67]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [117]:
# Step 8: Train the Model
def train_model(X_train, y_train, X_test, y_test):
    train_encodings, test_encodings = tokenize_data(X_train, X_test)
    train_dataset = SubjectivityDataset(train_encodings, list(y_train))
    test_dataset = SubjectivityDataset(test_encodings, list(y_test))

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    trainer = Trainer(
        model=model,
        args=get_training_args(),
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    trainer.model.save_pretrained("./results")
    tokenizer.save_pretrained("./results")
    return trainer

In [69]:
# Step 9: Evaluate the Model
def evaluate_model(trainer):
    eval_result = trainer.evaluate()
    print("Evaluation Results:", eval_result)

In [114]:
def run_pipeline(file_path):
    # Load and preprocess data
    df = load_data(file_path)
    processed_df = preprocess_data(df)

    # Filter only necessary columns
    processed_df = processed_df[["extracted_text", "label"]]

    # Split data
    X_train, X_test, y_train, y_test = split_data(processed_df)

    # Train and evaluate the model
    trainer = train_model(X_train, y_train, X_test, y_test)
    evaluate_model(trainer)


In [118]:
run_pipeline('chunk_0_labelled.csv')  # Replace with your actual file path


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].map({"Neutral": 0, "Positive": 1, "Negative": 1})
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.26629120111465454, 'eval_accuracy': 0.9655172413793104, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.3139, 'eval_samples_per_second': 92.392, 'eval_steps_per_second': 6.372, 'epoch': 1.0}


  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.18545514345169067, 'eval_accuracy': 0.9655172413793104, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.2933, 'eval_samples_per_second': 98.881, 'eval_steps_per_second': 6.819, 'epoch': 2.0}


  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.16889667510986328, 'eval_accuracy': 0.9655172413793104, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.3587, 'eval_samples_per_second': 80.847, 'eval_steps_per_second': 5.576, 'epoch': 3.0}
{'train_runtime': 21.9745, 'train_samples_per_second': 15.563, 'train_steps_per_second': 1.092, 'train_loss': 0.3509397506713867, 'epoch': 3.0}


  0%|          | 0/2 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.16889667510986328, 'eval_accuracy': 0.9655172413793104, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.3516, 'eval_samples_per_second': 82.482, 'eval_steps_per_second': 5.688, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [103]:
file_path='chunk_0_labelled.csv'
df = load_data(file_path)
processed_df = preprocess_data(df)

# # Split data
# X_train, X_test, y_train, y_test = split_data(processed_df)
processed_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].map({"Neutral": 0, "Positive": 1, "Negative": 1})


Unnamed: 0,extracted_text,label
1,Too often the presenter speaks and the others ...,1
2,test successful. way to go!!!,1
3,"Randy,\n\nCan you send me a schedule of the sa...",0
5,"Greg,\n\nHow about either next Tuesday or Thur...",0
6,Please cc the following distribution list with...,0
7,any morning between 10 and 11:30,0
10,"Mr. Buckner,\n\nFor delivered gas behind San D...",0
13,I have been involved in most of the meetings a...,0
14,Here are the names of the west desk members by...,0
15,35 million is fine,0


In [123]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the trained model and tokenizer from the output directory
model = BertForSequenceClassification.from_pretrained("./results")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define prediction function
def predict_sentiment(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Make sure the model is in evaluation mode
    model.eval()
    
    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the predicted label
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()

    # Map the predicted class to a label
    label_map = {0: "Neutral", 1: "Positive", 2: "Negative"}   # Adjust based on your labels
    predicted_label = label_map[predicted_class_id]
    
    return predicted_label

# Test the function with a sample text
sample_text = "this is a waste of time!!"
predicted_label = predict_sentiment(sample_text)
print(f"Predicted Sentiment for the sample text: {predicted_label}")


Predicted Sentiment for the sample text: Neutral
