<a href="https://colab.research.google.com/github/Chava-Sai/Multi-Task-Model/blob/main/Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentimental Analysis

## SA using Native Bayes

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer

# Load your dataset
# Assuming you have a CSV file named 'dataset.csv'
data = pd.read_csv('train.csv')


data.drop(data.columns[[ 2,3,4,5,6]], axis=1, inplace=True)

data.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [None]:
data.dropna(subset=['text'], inplace=True)




# Remove special characters, punctuation, and numbers
data['cleaned_text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# Convert text to lowercase
data['cleaned_text'] = data['cleaned_text'].str.lower()

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Remove stopwords
stop_words = set(stopwords.words('english'))
data['cleaned_text'] = data['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply stemming
# stemmer = PorterStemmer()
# data['cleaned_text'] = data['cleaned_text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Apply lemmatization
data['cleaned_text'] = data['cleaned_text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Split data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed

# Transform text data into TF-IDF features
X_train = vectorizer.fit_transform(train_data['cleaned_text'])
X_test = vectorizer.transform(test_data['cleaned_text'])

# Prepare target labels
y_train = train_data['sentiment']
y_test = test_data['sentiment']

# Initialize and train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict sentiments on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.64
Classification Report:
               precision    recall  f1-score   support

    negative       0.73      0.46      0.57      1572
     neutral       0.56      0.78      0.65      2236
    positive       0.74      0.61      0.67      1688

    accuracy                           0.64      5496
   macro avg       0.68      0.62      0.63      5496
weighted avg       0.66      0.64      0.63      5496



## SA using Bag of Words Vectorization-based Models


In [None]:
import pandas as pd
data = pd.read_csv('train.csv')
data.drop(data.columns[[2,3,4,5,6]], axis=1, inplace=True)
data.dropna(subset=['text'], inplace=True)
#Pre-Prcoessing and Bag of Word Vectorization using Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(data['text'])
#Splitting the data into trainig and testing
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, data['sentiment'], test_size=0.25, random_state=5)
#Training the model
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
#Caluclating the accuracy score of the model
from sklearn import metrics
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print("Accuracuy Score: ",accuracy_score)



Accuracuy Score:  0.632896652110626


## SA using LSTM-based Models

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense

# Load your dataset and preprocess it
data = pd.read_csv('train.csv')
data.drop(data.columns[[2, 3, 4, 5, 6]], axis=1, inplace=True)
data.dropna(subset=['text'], inplace=True)

# Preprocessing components
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    cleaned_text = cleaned_text.lower()
    cleaned_text = ' '.join([stemmer.stem(word) for word in cleaned_text.split()])
    cleaned_text = ' '.join([lemmatizer.lemmatize(word) for word in cleaned_text.split()])
    cleaned_text = ' '.join([word for word in cleaned_text.split() if word not in stop_words])
    return cleaned_text

data['text'] = data['text'].apply(preprocess_text)

# Tokenize and pad the preprocessed text
tokenizer = Tokenizer(num_words=500, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

# One-hot encode the sentiment labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['sentiment'])
num_classes = len(label_encoder.classes_)
y_onehot = tf.keras.utils.to_categorical(y_encoded, num_classes=num_classes)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.25, random_state=5)

# Build the LSTM model
model = Sequential()
model.add(Embedding(500, 120, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(704, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(352, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model Training
model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=1)

# Model Testing
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


# Emotional Analysis

## EA using RandomForest Classifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
data = pd.read_csv('tweet_emotions.csv')  # Replace with your dataset file

# Separate features (text) and labels
X = data['text']  # Replace 'text_column' with the actual column name
y = data['label']  # Replace 'label_column' with the actual column name

# Preprocess text data and convert to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust n_estimators
random_forest.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_forest.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)


Accuracy: 0.324875
Classification Report:
               precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.11      0.02      0.03       338
   happiness       0.32      0.34      0.33      1028
        hate       0.38      0.16      0.22       268
        love       0.45      0.37      0.41       762
     neutral       0.34      0.50      0.40      1740
      relief       0.21      0.02      0.03       352
     sadness       0.32      0.17      0.23      1046
    surprise       0.15      0.01      0.03       425
       worry       0.30      0.51      0.38      1666

    accuracy                           0.32      8000
   macro avg       0.20      0.16      0.16      8000
weighted avg       0.30      0.32      0.29      8000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## EA using XGB Classifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('tweet_emotions.csv')  # Replace with your dataset file

# Separate features (text) and labels
X = data['text']  # Replace 'text_column' with the actual column name
y = data['label']  # Replace 'label_column' with the actual column name

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Preprocess text data and convert to CountVector features
count_vectorizer = CountVectorizer(max_features=1000)  # You can adjust max_features
X_count = count_vectorizer.fit_transform(X)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_count, y_encoded, test_size=0.2, random_state=42)

# Initialize and train the XGBoost classifier
xgb_classifier = XGBClassifier(n_estimators=100, random_state=42)  # You can adjust n_estimators
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_classifier.predict(X_test)

# Decode the predicted labels back to original strings
y_pred_decoded = label_encoder.inverse_transform(y_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)


Accuracy: 0.34875
Classification Report:
               precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.14      0.03      0.05        31
       empty       0.50      0.01      0.02       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.12      0.03      0.04       338
   happiness       0.35      0.32      0.33      1028
        hate       0.38      0.23      0.29       268
        love       0.48      0.38      0.42       762
     neutral       0.34      0.65      0.44      1740
      relief       0.26      0.04      0.07       352
     sadness       0.36      0.21      0.27      1046
    surprise       0.31      0.05      0.08       425
       worry       0.34      0.43      0.38      1666

    accuracy                           0.35      8000
   macro avg       0.27      0.18      0.18      8000
weighted avg       0.34      0.35      0.31      8000



## EA using Bert Transformer

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('tweet_emotions_1.csv')  # Replace with your dataset file

# Separate features (text) and labels
X = data['text']
y = data['label']

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

num_classes = len(label_encoder.classes_)


# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Tokenize the input text
encoded_data = tokenizer.batch_encode_plus(
    X,
    add_special_tokens=True,
    padding=True,
    return_attention_mask=True,
    return_tensors='pt'
)

input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(input_ids, y_encoded, test_size=0.2, random_state=42)

train_attention_masks = attention_masks[y_train]
test_attention_masks = attention_masks[y_test]

train_dataset = TensorDataset(X_train, train_attention_masks, torch.tensor(y_train))
test_dataset = TensorDataset(X_test, test_attention_masks, torch.tensor(y_test))

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Set up the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

epochs = 1  # Define the number of training epochs

# Training loop
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        batch_input_ids, batch_attention_mask, batch_labels = batch

        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch_input_ids, batch_attention_mask, _ = batch

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=1)

        predictions.extend(batch_predictions.tolist())

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
classification_rep = classification_report(y_test, predictions, target_names=label_encoder.classes_)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.68
Classification Report:
               precision    recall  f1-score   support

       anger       0.00      0.00      0.00         2
     boredom       0.00      0.00      0.00         2
       empty       0.00      0.00      0.00        12
  enthusiasm       0.00      0.00      0.00         6
         fun       0.00      0.00      0.00        11
   happiness       0.00      0.00      0.00        19
        hate       0.00      0.00      0.00        38
        love       0.00      0.00      0.00        25
     neutral       0.68      0.95      0.79       112
      relief       0.00      0.00      0.00        10
     sadness       0.80      0.99      0.88       151
    surprise       0.00      0.00      0.00        34
       worry       0.59      0.85      0.70       178

    accuracy                           0.68       600
   macro avg       0.16      0.21      0.18       600
weighted avg       0.50      0.68      0.58       600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Multi Task Model on Sentimental and Emotional Analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score

# Load your dataset
data = pd.read_csv("test_sent_emo_1.csv", encoding='ISO-8859-1')

# Map emotion labels to integers
emotion_map = {label: idx for idx, label in enumerate(data['emotion'].unique())}
data['emotion'] = data['emotion'].map(emotion_map)

# Map sentiment labels to integers
sentiment_map = {"positive": 0, "negative": 1, "neutral": 2}
data['sentiment'] = data['sentiment'].map(sentiment_map)

# Split the dataset
train_texts, val_texts, train_emotions, val_emotions, train_sentiments, val_sentiments = train_test_split(
    data['text'].values, data['emotion'].values, data['sentiment'].values,
    test_size=0.2, random_state=42, stratify=data['emotion'].values
)

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
num_emotion_classes = len(data['emotion'].unique())
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_emotion_classes)

# Tokenize and prepare data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_emotions),
    torch.tensor(train_sentiments)
)

val_dataset = TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_emotions),
    torch.tensor(val_sentiments)
)

# Define DataLoader
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=8)

val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=8)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 10)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):  # You can adjust the number of epochs
    model.train()
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, emotion, sentiment = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=emotion)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

    model.eval()
    val_preds = []
    val_labels = []
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, emotion, sentiment = batch

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        val_preds.extend(preds)
        val_labels.extend(emotion.cpu().numpy())

    accuracy = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch+1}, Validation Accuracy: {accuracy:.4f}")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Validation Accuracy: 0.5632
Epoch 2, Validation Accuracy: 0.6207
Epoch 3, Validation Accuracy: 0.6379
