<a href="https://colab.research.google.com/github/Dije-7/NLP/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from sklearn.decomposition import TruncatedSVD

In [None]:
pip install datasets



In [None]:
from datasets import load_dataset

# Fetch the Twitter sentiment corpus from the Hugging Face hub; the returned
# object is indexed by split name ('train', 'validation', 'test') further down.
dataset = load_dataset(path="carblacac/twitter-sentiment-analysis")

In [None]:
# Materialise each split of `dataset` as its own pandas DataFrame.
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Separate every frame into its input column ('text') and target column ('feeling').
train_texts = train_df['text']
train_labels = train_df['feeling']

val_texts = val_df['text']
val_labels = val_df['feeling']

test_texts = test_df['text']
test_labels = test_df['feeling']

# **(a) Naive Bayes Classifier (sklearn)**

In [None]:
# Bag-of-words counts: the vocabulary is learned from the training texts.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(train_texts)

# Re-weight the raw counts with TF-IDF, fitted on the training count matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
# Train a multinomial Naive Bayes model on the TF-IDF training matrix.
# `fit` returns the estimator itself, so construction and training can be chained.
nb_classifier = MultinomialNB().fit(X_train_tfidf, train_labels)
nb_classifier

# **(b) Support Vector Machine (SVM) Classifier (sklearn)**

In [None]:
# Project the sparse TF-IDF matrix onto 500 components before fitting the SVM.
# A fixed random_state makes the randomized SVD solver (and therefore the
# features the SVM is trained on) reproducible across kernel restarts.
# Any held-out data must be passed through this same fitted `svd` via
# `svd.transform(...)` before it is given to `svm_classifier`.
svd = TruncatedSVD(n_components=500, random_state=42)
X_train_svd = svd.fit_transform(X_train_tfidf)

# NOTE(review): per the scikit-learn docs, SVC fit time grows at least
# quadratically with the number of samples; if this cell is too slow on the
# full training split, LinearSVC is the usual substitute — confirm before switching.
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_svd, train_labels)

# **(c) Bi-LSTM Model (TensorFlow Keras)**

In [None]:
max_words = 10000  # vocabulary cap passed to the Tokenizer (num_words)
max_len = 100      # length every sequence is padded/truncated to

# The word index is learned from the training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)

# Turn each split into integer sequences and pad them to a common length in one step.
train_sequences = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_len)
val_sequences = pad_sequences(tokenizer.texts_to_sequences(val_texts), maxlen=max_len)
test_sequences = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_len)

In [None]:
embedding_dim = 100

# Embedding -> bidirectional LSTM -> single sigmoid output unit.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    Bidirectional(LSTM(64)),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# The validation split is only monitored during training; it does not drive
# any stopping or checkpointing logic here.
model.fit(
    train_sequences,
    train_labels,
    epochs=5,
    batch_size=32,
    validation_data=(val_sequences, val_labels),
)


Epoch 1/5
 207/3750 [>.............................] - ETA: 7:53 - loss: 0.6044 - accuracy: 0.6688

KeyboardInterrupt: ignored

In [None]:
# Reuse the vectorizer/transformer fitted on the training split (transform only,
# no refitting) to build the validation feature matrix.
X_val_counts = count_vectorizer.transform(val_texts)
X_val_tfidf = tfidf_transformer.transform(X_val_counts)

# Evaluate Naive Bayes Classifier
nb_accuracy = nb_classifier.score(X_val_tfidf, val_labels)
print(f"Naive Bayes Classifier Accuracy: {nb_accuracy}")

# Evaluate Bi-LSTM Model
# Scored on the validation split, the same split used for the Naive Bayes and
# SVM classifiers, so the reported accuracies are directly comparable.
lstm_loss, lstm_accuracy = model.evaluate(val_sequences, val_labels)
print(f"Bi-LSTM Model Accuracy: {lstm_accuracy}")

Naive Bayes Classifier Accuracy: 0.7581758175817582
Bi-LSTM Model Accuracy: 0.7851382493972778


In [None]:
# Evaluate SVM Classifier
# The SVM was fitted on the SVD-reduced matrix, so the validation TF-IDF
# features must go through the same fitted projection before scoring;
# passing X_val_tfidf directly fails on the feature-count mismatch.
X_val_svd = svd.transform(X_val_tfidf)
svm_accuracy = svm_classifier.score(X_val_svd, val_labels)
print(f"SVM Classifier Accuracy: {svm_accuracy}")

SVM Classifier Accuracy: 0.78498654321
