In [58]:
pip install transformers datasets torch scikit-learn pandas tqdm


Note: you may need to restart the kernel to use updated packages.


In [59]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [6]:
# Read the raw training CSV; column names are supplied here because the file is
# read with header=None. Update the path below if the dataset lives elsewhere.
df = pd.read_csv(
    "C:\\Users\\Harsh Tripathi\\Documents\\Whatsapp Project 8 sem\\Dataset\\twitter_training.csv",
    delimiter=",",
    header=None,
    names=["ID", "Topic", "Sentiment", "Text"],
)

# Keep every row whose sentiment is not 'Irrelevant', then renumber the index from 0
is_relevant = df["Sentiment"] != "Irrelevant"
df = df[is_relevant].reset_index(drop=True)

# Swap the string labels for integer codes (Positive=2, Neutral=1, Negative=0)
label_encoder = LabelEncoder()
df["Sentiment"] = label_encoder.fit_transform(df["Sentiment"])

# Preview the first rows
df.head()

Unnamed: 0,ID,Topic,Sentiment,Text
0,2401,Borderlands,2,im getting on borderlands and i will murder yo...
1,2401,Borderlands,2,I am coming to the borders and I will kill you...
2,2401,Borderlands,2,im getting on borderlands and i will kill you ...
3,2401,Borderlands,2,im coming on borderlands and i will murder you...
4,2401,Borderlands,2,im getting on borderlands 2 and i will murder ...


In [35]:
# Ensure all text values are strings.
# Missing values are replaced with "" BEFORE the cast: casting NaN directly produces the
# literal text "nan", which would then be tokenized and trained on like a real tweet.
# Blank strings are swapped for a placeholder later, inside SentimentDataset.__getitem__.
df["Text"] = df["Text"].fillna("").astype(str)

# Check if any non-string values exist
print(df["Text"].apply(lambda x: isinstance(x, str)).value_counts())

# Print a sample text to confirm it's clean
print("Sample Text:", df["Text"].iloc[0])
print(type(df["Text"].iloc[0]))


Text
True    828
Name: count, dtype: int64
Sample Text: BBC News - Amazon boss Jeff Bezos rejects claims company acted like a 'drug dealer' bbc.co.uk/news/av/busine…
<class 'str'>


In [36]:
# Fetch the tokenizer published under the "distilbert-base-uncased" checkpoint name
pretrained_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)


In [37]:
class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per item for sentiment classification.

    Args:
        texts: Indexable sequence of raw texts; entries may be missing (None/NaN) or blank.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` whose result
            exposes ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Value forwarded to the tokenizer's ``max_length`` argument.
    """

    # Text handed to the tokenizer in place of a missing or blank entry.
    PLACEHOLDER_TEXT = "empty text"

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer tensors with
            their leading dimension squeezed away) and ``"label"`` (a ``torch.long`` tensor).
        """
        raw_text = self.texts[idx]
        label = self.labels[idx]

        # The missing-value test has to look at the RAW value: once None/NaN has gone
        # through str() it is the ordinary string "None"/"nan" and can no longer be detected.
        is_missing = raw_text is None or (
            pd.api.types.is_scalar(raw_text) and bool(pd.isna(raw_text))
        )
        if is_missing:
            text = self.PLACEHOLDER_TEXT
        else:
            text = str(raw_text)  # Ensure text is a string
            if text.strip() == "":  # Blank / whitespace-only text
                text = self.PLACEHOLDER_TEXT

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            # squeeze(0) drops the leading dimension of the tokenizer output
            # (assumed to be a batch dimension of size 1 for a single text)
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }
    


In [41]:
# Pull the text and label columns out of the frame as plain Python lists
texts, labels = df["Text"].tolist(), df["Sentiment"].tolist()

# Wrap them in the tokenizing dataset and a shuffled mini-batch loader
dataset = SentimentDataset(texts, labels, tokenizer)
train_loader = DataLoader(dataset, shuffle=True, batch_size=16)

# Number of examples in the dataset
len(dataset)


828

In [43]:
class SentimentClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Size of the output dimension of the linear head.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and classify from the hidden state at sequence position 0."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]  # CLS token output
        return self.fc(self.dropout(first_token_state))


In [44]:
# Pick the compute device: CUDA when PyTorch reports it as available, otherwise the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Build the 3-class classifier, then move it onto the chosen device
model = SentimentClassifier(num_classes=3)
model = model.to(device)

# Cross-entropy loss, optimized with AdamW over all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)


In [45]:
import torch
# Report what PyTorch can see. The device-name query is guarded because
# torch.cuda.get_device_name(0) raises when no CUDA device is usable, whereas the
# rest of the notebook is written to fall back to the CPU in that case.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # True when a CUDA device is usable
print(torch.cuda.device_count())  # Number of GPUs available
if cuda_available:
    print(torch.cuda.get_device_name(0))  # GPU name
else:
    print("No CUDA device detected; running on CPU")
print(torch.version.cuda)  # CUDA version PyTorch is using


True
1
NVIDIA GeForce GTX 1050
12.1


In [None]:
epochs = 3  # You can change this
model.train()  # put the model in training mode for the whole run

for epoch in range(epochs):
    total_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    # Batches are counted explicitly with enumerate: tqdm refreshes its own `n`
    # attribute only when it redraws the bar, so `progress_bar.n + 1` is not a
    # reliable per-iteration denominator for the running mean loss.
    for step, batch in enumerate(progress_bar, start=1):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named batch_labels so the module-level `labels` list is not overwritten
        batch_labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Mean loss over the batches processed so far in this epoch
        progress_bar.set_postfix(loss=total_loss / step)


In [14]:
import torch

# Write the model's state_dict to disk
weights_path = "distilbert_sentiment_model.pth"
torch.save(model.state_dict(), weights_path)
print("Model training complete and saved!")

# The tokenizer files go to their own directory
tokenizer.save_pretrained("./distilbert_sentiment_model")


Model training complete and saved!


('./distilbert_sentiment_model\\tokenizer_config.json',
 './distilbert_sentiment_model\\special_tokens_map.json',
 './distilbert_sentiment_model\\vocab.txt',
 './distilbert_sentiment_model\\added_tokens.json')

In [None]:
import torch
from transformers import DistilBertTokenizer

# Define the number of classes (adjust this based on your dataset)
num_classes = 3  # Example: Positive, Negative, Neutral

# Recreate the model architecture with the correct number of classes
model = SentimentClassifier(num_classes=num_classes)

# map_location="cpu" lets a checkpoint saved from a GPU-resident model be read on a
# machine without CUDA; the freshly built model above has not been moved to a GPU either.
state_dict = torch.load("distilbert_sentiment_model.pth", map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)

model.eval()  # Set to evaluation mode

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("./distilbert_sentiment_model")

print("Model and tokenizer loaded successfully!")



In [48]:
import torch
from transformers import DistilBertTokenizer

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("./distilbert_sentiment_model")

# Load the trained model
model = SentimentClassifier(num_classes=3)  # Ensure correct num_classes
# map_location="cpu" lets a checkpoint saved from a GPU-resident model be read on a
# machine without CUDA; the freshly built model above has not been moved to a GPU either.
state_dict = torch.load("distilbert_sentiment_model.pth", map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)
model.eval()  # Set to evaluation mode (last expression, so the module repr is displayed)


SentimentClassifier(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Lin

In [61]:
import torch
import pandas as pd
from transformers import DistilBertModel, DistilBertTokenizer
import torch.nn as nn
from torch.nn.functional import softmax

# Define SentimentClassifier model
# NOTE: this redefinition replaces the SentimentClassifier declared earlier in the
# notebook. That earlier class names its linear head `fc` and applies dropout before it;
# this one names the head `classifier` and has no dropout layer.
class SentimentClassifier(nn.Module):
    """DistilBERT encoder with a linear head (``classifier``) used for inference.

    Args:
        num_classes: Size of the output dimension of the linear head.
    """

    def __init__(self, num_classes=3):
        super(SentimentClassifier, self).__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the hidden state at sequence position 0."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(outputs.last_hidden_state[:, 0, :])

# Define class mapping
sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("./distilbert_sentiment_model")

# Load model
model = SentimentClassifier(num_classes=3)

# Load state_dict and rename keys if needed
state_dict = torch.load("distilbert_sentiment_model.pth", map_location=torch.device("cpu"), weights_only=True)

# Rename the head's keys from "fc.*" to "classifier.*" if needed. Only keys that START
# with "fc." are rewritten, so an encoder key that merely contains the letters "fc"
# can never be altered by accident.
new_state_dict = {}
for key, value in state_dict.items():
    if key.startswith("fc."):
        key = "classifier." + key[len("fc."):]
    new_state_dict[key] = value

# strict=True: raise on any missing or unexpected key. With strict=False a mismatch is
# ignored silently, which can leave the head randomly initialized and yield
# chance-level predictions without any error.
model.load_state_dict(new_state_dict, strict=True)

model.eval()  # Set to evaluation mode

# Load validation data. The file is read once, as comma-separated with no header row;
# the column names are supplied explicitly.
df = pd.read_csv("C:\\Users\\Harsh Tripathi\\Documents\\Whatsapp Project 8 sem\\Dataset\\twitter_validation.csv", sep=",", header=None, names=["ID", "Category", "Sentiment", "Text"])



In [70]:
# Plain-text preview of the first rows of the frame
preview = df.head()
print(preview)

     ID   Category Sentiment  \
0   352     Amazon   Neutral   
1  8312  Microsoft  Negative   
2  4371      CS-GO  Negative   
3  4433     Google   Neutral   
4  6273       FIFA  Negative   

                                                Text Predicted Sentiment  
0  BBC News - Amazon boss Jeff Bezos rejects clai...             Neutral  
1  @Microsoft Why do I pay for WORD when it funct...            Negative  
2  CSGO matchmaking is so full of closet hacking,...            Negative  
3  Now the President is slapping Americans in the...             Neutral  
4  Hi @EAHelp I’ve had Madeleine McCann in my cel...            Negative  


In [69]:
# Remove "Irrelevant" rows.
# .copy() yields an independent frame, so adding the prediction column below does not
# write into a slice of the original frame (pandas' SettingWithCopyWarning case).
df = df[df["Sentiment"].isin(["Positive", "Negative", "Neutral"])].copy()

# Run inference on whichever device the model's parameters currently live on
model_device = next(model.parameters()).device
model.eval()

# Predict sentiment
predictions = []
with torch.no_grad():
    for text in df["Text"]:
        # Same guard as SentimentDataset: missing or blank text becomes a placeholder
        # instead of crashing the tokenizer on a non-string value.
        if pd.isna(text) or str(text).strip() == "":
            text = "empty text"
        else:
            text = str(text)

        # max_length=128 matches the default SentimentDataset used when building
        # the training set, so inference truncates text the same way.
        inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")

        logits = model(
            input_ids=inputs["input_ids"].to(model_device),
            attention_mask=inputs["attention_mask"].to(model_device),
        )
        # argmax over the raw logits; softmax is monotonic, so it would pick the same class
        predicted_class = torch.argmax(logits, dim=1).item()

        predictions.append(sentiment_mapping[predicted_class])

# Save results
df["Predicted Sentiment"] = predictions
df.to_csv("predicted_results.csv", index=False)

print("Prediction complete! Results saved to predicted_results.csv")

Prediction complete! Results saved to predicted_results.csv


In [63]:
import pandas as pd

# Read back the saved predictions
df = pd.read_csv("predicted_results.csv")

# Flag each row where the predicted label equals the recorded label
df["Correct"] = df["Sentiment"].eq(df["Predicted Sentiment"])

# Accuracy is the share of flagged rows, expressed as a percentage
accuracy = df["Correct"].mean() * 100
print(f"Model Accuracy: {accuracy:.2f}%")

# List the rows where prediction and label disagree
incorrect_predictions = df[~df["Correct"]]
print("Incorrect Predictions:")
print(incorrect_predictions[["Text", "Sentiment", "Predicted Sentiment"]])


Model Accuracy: 97.71%
Incorrect Predictions:
                                                  Text Sentiment  \
43                I’m addicted to call of duty mobile😅  Negative   
60   Seems like #Playstation has the marketing deal...   Neutral   
78   GM Fam!!! hope you are all up and being great ...   Neutral   
80   Leaked memo excoriates #Facebook’s ‘slapdash a...  Negative   
131  I didn't have massive success in #IndieApril b...   Neutral   
181  Plague of Corruption is #1 on Amazon and # 3 o...  Negative   
269  #WorldCupAtHome: Five African matches you woul...  Positive   
301  Shipped first GPU-enabled production code thro...   Neutral   
322                                           Mori😻😻😻😻   Neutral   
369  Wine drunk playing the new Borderlands 😩\n\nGo...   Neutral   
382  This is about as far as I can go with it for t...   Neutral   
434  Absolutely love my amazing sister who has made...   Neutral   
450  My main reason for wanting the #PS5 is because...   Neutral   
46

In [67]:
# Number of rows assigned to each predicted class
predicted_counts = df["Predicted Sentiment"].value_counts()
print(predicted_counts)


Predicted Sentiment
Positive    289
Neutral     278
Negative    261
Name: count, dtype: int64


In [68]:
import pandas as pd

# Read the saved predictions back from disk
df = pd.read_csv("predicted_results.csv")

# Discard rows where either the label or the prediction is missing
df = df.dropna(subset=["Sentiment", "Predicted Sentiment"])

# Percentage of rows whose prediction equals the label
is_match = df["Sentiment"] == df["Predicted Sentiment"]
accuracy = is_match.mean() * 100
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 97.71%


In [66]:
# Rows where the prediction differs from the recorded label
is_mismatch = df["Sentiment"] != df["Predicted Sentiment"]
incorrect_df = df[is_mismatch]
print(f"Number of incorrect predictions: {len(incorrect_df)}")
error_columns = ["Text", "Sentiment", "Predicted Sentiment"]
print(incorrect_df[error_columns].head(10))  # Show first 10 errors


Number of incorrect predictions: 19
                                                  Text Sentiment  \
43                I’m addicted to call of duty mobile😅  Negative   
60   Seems like #Playstation has the marketing deal...   Neutral   
78   GM Fam!!! hope you are all up and being great ...   Neutral   
80   Leaked memo excoriates #Facebook’s ‘slapdash a...  Negative   
131  I didn't have massive success in #IndieApril b...   Neutral   
181  Plague of Corruption is #1 on Amazon and # 3 o...  Negative   
269  #WorldCupAtHome: Five African matches you woul...  Positive   
301  Shipped first GPU-enabled production code thro...   Neutral   
322                                           Mori😻😻😻😻   Neutral   
369  Wine drunk playing the new Borderlands 😩\n\nGo...   Neutral   

    Predicted Sentiment  
43             Positive  
60             Positive  
78             Positive  
80              Neutral  
131            Positive  
181             Neutral  
269             Neutral  
301    

In [65]:
# Class distribution of the predictions, followed by that of the recorded labels
for column in ("Predicted Sentiment", "Sentiment"):
    print(df[column].value_counts())


Predicted Sentiment
Positive    289
Neutral     278
Negative    261
Name: count, dtype: int64
Sentiment
Neutral     285
Positive    277
Negative    266
Name: count, dtype: int64


In [56]:
pip install scikit-learn





In [31]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Load the predicted results
df = pd.read_csv("predicted_results.csv")

# Ensure there are no NaN values
df = df.dropna(subset=["Sentiment", "Predicted Sentiment"])

# Label name -> class id (ensure your labels match exactly).
# Deliberately NOT stored in `sentiment_mapping`: the prediction cell uses that name for
# the opposite direction (class id -> label name), and overwriting it with this inverse
# mapping makes that cell raise KeyError if it is run again afterwards.
class_names = ["Negative", "Neutral", "Positive"]
label_to_id = {name: class_id for class_id, name in enumerate(class_names)}
class_ids = list(label_to_id.values())

# Convert Sentiment labels to numeric values
df["Sentiment"] = df["Sentiment"].map(label_to_id)
df["Predicted Sentiment"] = df["Predicted Sentiment"].map(label_to_id)

# Compute Accuracy
accuracy = accuracy_score(df["Sentiment"], df["Predicted Sentiment"])
print(f"Model Accuracy: {accuracy:.2%}")

# Compute Precision, Recall, and F1-score.
# labels= pins the row order to class_names even when a class is absent from the data;
# zero_division=0 reports 0 for undefined metrics without emitting a warning per class.
print("\nClassification Report:")
print(
    classification_report(
        df["Sentiment"],
        df["Predicted Sentiment"],
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
)


Model Accuracy: 35.14%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       266
     Neutral       0.59      0.12      0.19       285
    Positive       0.33      0.93      0.49       277

    accuracy                           0.35       828
   macro avg       0.31      0.35      0.23       828
weighted avg       0.31      0.35      0.23       828



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
