In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive.
# google.colab is imported here, so this cell is meant to run inside Colab.
from google.colab import drive
import os

drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load dataset
# Build the CSV path from the Drive project folder and read it as UTF-8.
base_path = '/content/drive/MyDrive/Sentiment_analysis'
data_path = os.path.join(base_path, 'Facebook.csv')
df = pd.read_csv(data_path, encoding='utf-8')

In [None]:
# Report the dataset dimensions as (rows, columns)
print("Dataset Shape:", df.shape)

In [None]:
# Preview the first 10 rows to inspect the columns and sample values
df.head(10)

In [None]:
# Summarise the columns (dtypes and non-null counts)
df.info()

In [None]:
# Check for missing values
print("\nMissing values per column:\n", df.isnull().sum())

# Only 'content' and 'score' feed the later cells, so drop a row only when one
# of those is missing. A blanket dropna() would also discard usable reviews
# whose unrelated columns happen to be empty. Reassigning instead of using
# inplace=True keeps the step explicit and chain-friendly.
df = df.dropna(subset=['content', 'score'])

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources used by preprocess_text: the stopword lists and the
# 'punkt_tab' tokenizer data (presumably what word_tokenize needs — confirm for
# the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:
# Define a text preprocessing function

# Build the stopword lookup once. Evaluating stopwords.words('english') inside
# the comprehension repeated the lookup for every token, and a set gives O(1)
# membership tests instead of scanning a list.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one review string before feature extraction.

    Steps, in order: strip punctuation, lowercase, tokenize with NLTK's
    word_tokenize, and drop English stopwords.

    Args:
        text: Raw review text (str).

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize
    kept = [word for word in tokens if word not in ENGLISH_STOPWORDS]  # Remove stopwords
    return ' '.join(kept)

In [None]:
# Apply preprocessing to every review; results go in a new 'cleaned_content' column
df['cleaned_content'] = df['content'].apply(preprocess_text)

# **Label Encoding**

In [None]:
# Define labels based on the 'score' column
def label_sentiment(score):
    """Translate a numeric review score into a sentiment class name.

    A score of exactly 3 is 'Neutral', a score of 4 or more is 'Positive',
    and every other value is 'Negative'.
    """
    if score == 3:
        return 'Neutral'
    return 'Positive' if score >= 4 else 'Negative'

In [None]:
# Derive the categorical sentiment label for each row from its numeric score
df['sentiment'] = df['score'].apply(label_sentiment)

In [None]:
# Preview the frame now that 'cleaned_content' and 'sentiment' have been added
df.head()

In [None]:
# Show the 50th one-star review (position 49) after cleaning.
# Raises IndexError when fewer than 50 one-star reviews exist.
one_star_reviews = df[df['score'] == 1]
review = one_star_reviews.iloc[49]['cleaned_content']
print(review)

# **Visualising Data**

In [None]:
# Count how many reviews received each score
df['score'].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Basic EDA: Distribution of scores
plt.figure(figsize=(8, 6))
# seaborn >= 0.13 deprecates passing `palette` without `hue`. Mapping the x
# variable to hue with the legend suppressed keeps the per-bar colouring
# without the FutureWarning.
sns.countplot(data=df, x='score', hue='score', palette='viridis', legend=False)
plt.title("Distribution of Sentiment Scores")
plt.xlabel("Score")
plt.ylabel("Frequency")
plt.show()

In [None]:

# Percentage share of each score: occurrences of each rating divided by the
# total number of rows, expressed in percent and rounded to two decimals.
score_counts = df['score'].value_counts()
score_percentages = round(score_counts / df.shape[0] * 100, 2)

print(f"Score value count - percentage distribution: \n{score_percentages}")

In [None]:
fig = plt.figure(figsize=(7,7))

colors = ('blue', 'green','orange','yellow','red')

# Wedge outline styling
wp = {'linewidth':1, "edgecolor":'black'}

# Fraction of rows carrying each score
tags = df['score'].value_counts()/df.shape[0]

# One offset per wedge. A fixed 5-tuple makes the pie call fail whenever the
# data holds fewer or more than five distinct scores.
explode = (0.1,) * len(tags)

tags.plot(kind='pie', autopct="%1.1f%%", shadow=True, colors=colors, startangle=90, wedgeprops=wp, explode=explode, label='Percentage wise distrubution of Scores')

from io import  BytesIO

# Render the figure as PNG into an in-memory buffer
graph = BytesIO()

fig.savefig(graph, format="png")

In [None]:
# Percentage share of each sentiment class: occurrences of each class divided
# by the total number of rows, expressed in percent and rounded to two decimals.
sentiment_counts = df['sentiment'].value_counts()
sentiment_percentages = round(sentiment_counts / df.shape[0] * 100, 2)

print(f"Sentiments value count - percentage distribution: \n{sentiment_percentages}")

In [None]:
# Plot sentiment distribution
# seaborn >= 0.13 deprecates passing `palette` without `hue`. Mapping the x
# variable to hue with the legend suppressed keeps the per-bar colouring
# without the FutureWarning.
sns.countplot(data=df, x='sentiment', hue='sentiment', palette='viridis', legend=False)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

In [None]:
from wordcloud import WordCloud

# Draw one word cloud per sentiment class, built from that class's cleaned text
for sentiment in df['sentiment'].unique():
    sentiment_rows = df[df['sentiment'] == sentiment]
    text = ' '.join(sentiment_rows['cleaned_content'])
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    wordcloud = cloud_builder.generate(text)

    wc_fig, wc_ax = plt.subplots(figsize=(7, 5))
    wc_ax.imshow(wordcloud, interpolation='bilinear')
    wc_ax.set_title(f'Word Cloud for {sentiment} Sentiment')
    wc_ax.axis('off')
    plt.show()

# **Feature Extraction**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to TF-IDF features (vocabulary capped at 5000 terms)
tfidf = TfidfVectorizer(max_features=5000)
# Keep the matrix sparse. Densifying with .toarray() allocates rows x 5000
# floats, which can exhaust memory on a large review set. train_test_split and
# the LogisticRegression / MultinomialNB / SVC estimators trained on it all
# accept sparse input.
X = tfidf.fit_transform(df['cleaned_content'])
y = df['sentiment']

# **Train/Test Split**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split data into training and testing sets (80/20).
# stratify=y keeps the sentiment class proportions equal in both parts, as the
# later embedding and fine-tuning splits already do.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# **Train Multiple Machine Learning Models**

## **1. Logistic Regression**


In [None]:
from sklearn.linear_model import LogisticRegression

# Raise the iteration cap so the solver has room to converge on the
# high-dimensional TF-IDF features; this matches the max_iter=1000 used for the
# embedding-based classifier later in the notebook.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Logistic Regression: score the held-out set, then show per-class metrics
# followed by the confusion matrix
lr_preds = lr_model.predict(X_test)
lr_report = classification_report(y_test, lr_preds, zero_division=0)
cm_lr = confusion_matrix(y_test, lr_preds)

print("Logistic Regression Classification Report:")
print(lr_report)
print("Confusion Matrix for Logistic Regression:")
print(cm_lr)

In [None]:
# Logistic Regression: overall accuracy on the held-out set
lr_preds = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_preds)
print("Logistic Regression")
print("Accuracy:", lr_accuracy)

## **2. Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training split
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Naive Bayes: predict once and reuse the result for both reports
nb_preds = nb_model.predict(X_test)
print("\nNaive Bayes Classification Report:")
# zero_division=0 matches the Logistic Regression and SVM reports and avoids
# an undefined-metric warning when a class receives no predictions.
print(classification_report(y_test, nb_preds, zero_division=0))

# Calculate the confusion matrix for Naive Bayes
cm_nb = confusion_matrix(y_test, nb_preds)

# Print the confusion matrix
print("Confusion Matrix for Naive Bayes:")
print(cm_nb)

In [None]:
# Naive Bayes: overall accuracy on the held-out set
nb_preds = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_preds)
print("\nNaive Bayes")
print("Accuracy:", nb_accuracy)

## **3. SVM**

In [None]:
from sklearn.svm import SVC

# Fit a support-vector classifier with scikit-learn's default settings
svm_model = SVC()
svm_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# SVM: score the held-out set, then show per-class metrics followed by the
# confusion matrix
svm_preds = svm_model.predict(X_test)
svm_report = classification_report(y_test, svm_preds, zero_division=0)
cm_svm = confusion_matrix(y_test, svm_preds)

print("\nSVM Classification Report:")
print(svm_report)

print("Confusion Matrix for SVM:")
print(cm_svm)

In [None]:
# SVM: overall accuracy on the held-out set
svm_preds = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_preds)
print("\nSVM")
print("Accuracy:", svm_accuracy)

## **4. Using BERT as a feature extractor + training an sklearn classifier**

In [None]:
!pip install -q transformers sentence-transformers

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [None]:
# Load the sentence-embedding checkpoint and its matching tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # small & fast sentence embedding model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # evaluation mode: the model is only used to produce embeddings here
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
def embed_texts(texts, batch_size=32):
    """Encode texts into fixed-size vectors by mean-pooling token embeddings.

    Relies on the module-level `tokenizer`, `model` and `device`.

    Args:
        texts: Strings to embed. Materialised into a list first, so any
            iterable of str is accepted.
        batch_size: Number of texts tokenized and forwarded per step.

    Returns:
        np.ndarray with one row per input text. For empty input, an empty
        array of shape (0, hidden_size) is returned.
    """
    texts = list(texts)
    if not texts:
        # np.vstack([]) raises ValueError, so hand back a well-formed empty matrix.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        enc = {k:v.to(device) for k,v in enc.items()}
        with torch.no_grad():
            out = model(**enc, return_dict=True)
            token_embeddings = out.last_hidden_state  # (batch, seq_len, hidden)
            # Mean pooling over real tokens only: padded positions have mask 0
            # and therefore contribute nothing to the sum.
            attention_mask = enc['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * attention_mask).sum(1)
            # Guard the divisor so a row without any attended token cannot
            # produce NaN or inf.
            counts = attention_mask.sum(1).clamp(min=1e-9)
            pooled = summed / counts
            embeddings.append(pooled.cpu().numpy())
    return np.vstack(embeddings)

In [None]:
# Prepare texts and labels
texts = df['cleaned_content'].tolist()
labels = df['sentiment'].values
# NOTE: from here on X (and X_train / X_test / y_train / y_test below) hold the
# embedding features and replace the TF-IDF variables of the same names.
X = embed_texts(texts)
# Stratified 85/15 split on the embedding features
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.15, stratify=labels, random_state=42)

In [None]:
# Train classifier (example: Logistic Regression) on the embedding features
# and report its performance on the held-out split
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)
embedding_report = classification_report(y_test, y_pred)
embedding_accuracy = accuracy_score(y_test, y_pred)
print(embedding_report)
print("Accuracy:", embedding_accuracy)

## **5. Fine-tune BERT (Hugging Face Trainer)**

In [None]:
!pip install -q evaluate

In [None]:
# === Imports ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, ClassLabel
import evaluate
import os

# Load the accuracy metric from the evaluate library
# NOTE(review): `metric` is not referenced by the cells visible here —
# compute_metrics uses `accuracy_metric` instead. Confirm this load is needed.
metric = evaluate.load("accuracy")

In [None]:
# === Load data ===
# assume df has columns: 'cleaned_content' and 'sentiment'
print(df.shape)
# Drop rows without cleaned text and renumber the index from 0
df = df.dropna(subset=['cleaned_content']).reset_index(drop=True)

In [None]:
# Map sentiment labels to integer class ids (ids follow the sorted label order).
# The mapping is built regardless of dtype: gating it on `dtype == object`
# would leave df['label'] undefined for string or categorical dtypes, and the
# stratified split that follows needs that column.
label2id = {lab: i for i, lab in enumerate(sorted(df['sentiment'].unique()))}
id2label = {v:k for k,v in label2id.items()}
df['label'] = df['sentiment'].map(label2id)

In [None]:
# Split: hold out 15% for testing, then take 10% of the remainder for
# validation; both splits are stratified on the label
train_val_df, test_df = train_test_split(df, test_size=0.15, stratify=df['label'], random_state=42)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.1, stratify=train_val_df['label'], random_state=42
)

In [None]:
# Convert each split to a Hugging Face Dataset, keeping only text and label
text_label_columns = ['cleaned_content','label']
train_ds, val_ds, test_ds = [
    Dataset.from_pandas(split_df[text_label_columns])
    for split_df in (train_df, val_df, test_df)
]

In [None]:
# === Model & Tokenizer ===
# NOTE: model_name and tokenizer replace the embedding-model variables of the
# same names defined earlier in the notebook.
model_name = "bert-base-uncased"   # or "distilbert-base-uncased" for faster/smaller
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Token length every example is padded or truncated to in preprocess()
max_length = 128

In [None]:
def preprocess(batch):
    """Tokenize a batch's 'cleaned_content', truncating and padding to max_length."""
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=max_length)
    return tokenizer(batch['cleaned_content'], **tokenizer_options)

In [None]:
# Tokenize every split in batched mode
train_ds, val_ds, test_ds = [
    split_ds.map(preprocess, batched=True)
    for split_ds in (train_ds, val_ds, test_ds)
]

In [None]:
# Expose only the model inputs and the label, as PyTorch tensors, on each split
cols = ['input_ids','attention_mask','label']
for split_ds in (train_ds, val_ds, test_ds):
    split_ds.set_format(type='torch', columns=cols)

In [None]:
# === Create model ===
num_labels = len(df['label'].unique())
# Store the human-readable class names in the model config so a saved
# checkpoint carries its own id -> label mapping. The kwargs are passed only
# when both mappings exist, because the label-encoding cell may leave them None.
label_kwargs = {'id2label': id2label, 'label2id': label2id} if id2label and label2id else {}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels, **label_kwargs
)

In [None]:
# === Metrics ===
# Metric objects used by compute_metrics below
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

In [None]:
def compute_metrics(eval_pred):
    """Return accuracy and F1 for a (logits, labels) evaluation pair.

    Predictions are the argmax over the last logits axis. F1 uses 'binary'
    averaging when num_labels is 2 and 'weighted' averaging otherwise.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy_result = accuracy_metric.compute(predictions=preds, references=labels)
    f1_average = 'binary' if num_labels == 2 else 'weighted'
    f1_result = f1_metric.compute(predictions=preds, references=labels, average=f1_average)
    return {"accuracy": accuracy_result['accuracy'], "f1": f1_result['f1']}

In [None]:
# === Training args ===
# Evaluation and checkpointing both happen once per epoch; with
# load_best_model_at_end the checkpoint selected by the "f1" metric (the key
# returned by compute_metrics) is restored when training finishes.
output_dir = "./bert_finetuned"
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,   # reduce if GPU OOM; set to 8 or 4 as needed
    per_device_eval_batch_size=32,
    num_train_epochs=3,               # 2-4 good starting point
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    fp16=torch.cuda.is_available(),   # only if GPU and supported
    logging_steps=100,
    report_to='none' # Disable reporting to services like wandb
)

In [None]:
# Disable wandb logging
# NOTE(review): training_args already passes report_to='none'; this
# environment variable looks like an additional safeguard.
import os
os.environ["WANDB_DISABLED"] = "true"

# === Trainer ===
# Trains on train_ds and evaluates on val_ds using compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# === Train ===
trainer.train()

In [None]:
# === Evaluate on test set ===
# Run the trained model on the held-out test split and print the metric dict
metrics = trainer.evaluate(eval_dataset=test_ds)
print("Test metrics:", metrics)

In [None]:
# === Save model & tokenizer ===
# Write both into output_dir so they can be reloaded together from one folder
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("Saved to", output_dir)

# **RoBERTa model**

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# === Load Data ===
# assume df has columns: 'cleaned_content' and 'sentiment'
df = df.dropna(subset=['cleaned_content']).reset_index(drop=True)

# === Encode sentiment labels to numeric ===
# Ids follow the sorted label order. The mapping is built regardless of dtype:
# gating it on `dtype == object` would leave df['label'] undefined for string
# or categorical dtypes, and the stratified split that follows needs it.
label2id = {lab: i for i, lab in enumerate(sorted(df['sentiment'].unique()))}
id2label = {v: k for k, v in label2id.items()}
df['label'] = df['sentiment'].map(label2id)


In [None]:
# === Split into train, val, test ===
# 15% held out for testing, then 10% of the remainder for validation;
# both splits are stratified on the label.
train_df, test_df = train_test_split(df, test_size=0.15, stratify=df['label'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=42)

# === Convert to Hugging Face Datasets ===
train_ds = Dataset.from_pandas(train_df[['cleaned_content', 'label']])
val_ds   = Dataset.from_pandas(val_df[['cleaned_content', 'label']])
test_ds  = Dataset.from_pandas(test_df[['cleaned_content', 'label']])

# === Load tokenizer and model ===
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Store the human-readable class names in the model config so the saved
# checkpoint carries its own id -> label mapping instead of relying on a
# hand-written dictionary at inference time. The kwargs are passed only when
# both mappings exist, because the label-encoding cell may leave them None.
label_kwargs = {'id2label': id2label, 'label2id': label2id} if id2label and label2id else {}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(df['label'].unique()), **label_kwargs
)

In [None]:
# === Tokenize ===
def tokenize(batch):
    """Tokenize a batch's 'cleaned_content', padding within the batch and truncating at 128 tokens."""
    batch_texts = batch['cleaned_content']
    return tokenizer(batch_texts, padding=True, truncation=True, max_length=128)

# For every split: tokenize in batched mode, drop the raw text column, and
# expose the remaining columns as PyTorch tensors
prepared_splits = []
for split_ds in (train_ds, val_ds, test_ds):
    split_ds = split_ds.map(tokenize, batched=True)
    split_ds = split_ds.remove_columns(['cleaned_content'])
    split_ds.set_format('torch')
    prepared_splits.append(split_ds)

train_ds, val_ds, test_ds = prepared_splits

In [None]:
# === Define metrics ===
def compute_metrics(p):
    """Return accuracy and weighted F1 for an evaluation prediction object.

    Reads `p.predictions` (class scores, argmax taken over axis 1) and
    `p.label_ids` (reference class ids).
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    accuracy_value = accuracy_score(p.label_ids, predicted_ids)
    weighted_f1 = f1_score(p.label_ids, predicted_ids, average='weighted')
    return {"accuracy": accuracy_value, "f1": weighted_f1}

In [None]:
# === Training arguments ===
# Disable wandb before TrainingArguments is built, so this section also works
# when run on its own without the BERT section having set the variable first.
# NOTE(review): the reporting integrations appear to be resolved when the
# arguments object is created — confirm against the installed transformers version.
import os
os.environ["WANDB_DISABLED"] = "true"

# Evaluation and checkpointing both happen once per epoch.
training_args = TrainingArguments(
    output_dir="./results_roberta_fb",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=100
)

In [None]:
# === Trainer ===
# Disable wandb logging
import os
os.environ["WANDB_DISABLED"] = "true"

# Trains on train_ds and evaluates on val_ds using compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# === Train model ===
trainer.train()

# === Evaluate on test set ===
# Run the trained model on the held-out test split and print the metric dict
results = trainer.evaluate(test_ds)
print("Test metrics:", results)

In [None]:
# === Save model & tokenizer ===
# Written to the folder that the RoBERTa prediction cell later loads from
output_dir = "./facebook_roberta_model"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("Saved to", output_dir)

# **Predict New Sentiments**

In [None]:
# Sample new texts
new_texts = ['I absolutely love this!', 'This is the worst experience ever.']

# Run the same cleaning used for training, then vectorize with the fitted TF-IDF
new_texts_cleaned = list(map(preprocess_text, new_texts))
new_features = tfidf.transform(new_texts_cleaned).toarray()

# Predict using Logistic Regression and show the predicted classes
predictions = lr_model.predict(new_features)
print(predictions)

# **Predict New Sentiments Using Fine-Tuned RoBERTa**

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# === Load the saved fine-tuned model ===
model_path = "./facebook_roberta_model"   # same as your saved folder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# === Sample new Facebook texts ===
# The emoji and the curly apostrophe are written as escape sequences. The
# literals had been corrupted into mojibake (e.g. U+1F495 appearing as three
# accented letters), so the model would have scored garbage characters.
# Escapes stay intact whatever encoding the notebook file is saved in.
new_texts = [
    "I absolutely love this product! \U0001F495",   # two hearts
    "This is the worst experience ever\U0001F610.",   # neutral face
    "It\u2019s okay \U0001F60C, not too good, not too bad \U0001F611.",   # relieved / expressionless
    "Amazing service and friendly staff \U0001F917!",   # hugging face
    "Absolutely terrible experience, very frustating",
    "Completely disappointed with the update \U0001F612."   # unamused face
]

# === Tokenize the new texts ===
# max_length=128 matches the limit used by tokenize() during fine-tuning and
# gives truncation=True an explicit bound.
inputs = tokenizer(new_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# === Make predictions ===
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    # Softmax turns logits into per-class probabilities; argmax picks the class
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_labels = torch.argmax(predictions, dim=1)

# === Map label IDs back to names ===
# These should match your label order used during training
# (the training cells assign ids in sorted label order).
id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}   # adjust if needed

# === Print results ===
for text, label_id, probs in zip(new_texts, predicted_labels, predictions):
    print(f"Text: {text}")
    print(f"Predicted Sentiment: {id2label[label_id.item()]}")
    print(f"Confidence Scores: {probs.tolist()}\n")
