
#Improved Sentiment Analyzer using nltk pos_tag
a. Use the POS Tagger in Task 1 for POS tagging the dataset.

b. Implement a pipeline to integrate the POS tag features along with the sentenceembeddings.

c. Train the same Classifier again for sentiment classification using the new features.


In [9]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import random

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

nltk.download('movie_reviews')

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = [word.lower() for word in movie_reviews.words()]

sample_size = 1000
documents = documents[:sample_size]

tfidf_vectorizer = TfidfVectorizer(max_features=2000)

tfidf_features = tfidf_vectorizer.fit_transform([' '.join(doc) for doc, _ in documents])

X = tfidf_features.toarray()
y = [category for _, category in documents]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

def extract_pos_tags(doc):
    words = word_tokenize(doc)
    pos_tags = [tag for word, tag in pos_tag(words)]
    return ' '.join(pos_tags)

X_train_str = [' '.join(map(str, doc)) for doc in X_train]
X_val_str = [' '.join(map(str, doc)) for doc in X_val]
X_test_str = [' '.join(map(str, doc)) for doc in X_test]

X_pos_train = [' '.join(extract_pos_tags(doc)) for doc in X_train_str]
X_pos_val = [' '.join(extract_pos_tags(doc)) for doc in X_val_str]
X_pos_test = [' '.join(extract_pos_tags(doc)) for doc in X_test_str]

X_combined_train = [X_train_str[i] + ' ' + X_pos_train[i] for i in range(len(X_train_str))]
X_combined_val = [X_val_str[i] + ' ' + X_pos_val[i] for i in range(len(X_val_str))]
X_combined_test = [X_test_str[i] + ' ' + X_pos_test[i] for i in range(len(X_test_str))]

tfidf_vectorizer_combined = TfidfVectorizer(max_features=3000)

tfidf_features_combined_train = tfidf_vectorizer_combined.fit_transform(X_combined_train)
tfidf_features_combined_val = tfidf_vectorizer_combined.transform(X_combined_val)
tfidf_features_combined_test = tfidf_vectorizer_combined.transform(X_combined_test)

clf_combined = MultinomialNB()
clf_combined.fit(tfidf_features_combined_train, y_train)

y_val_pred_combined = clf_combined.predict(tfidf_features_combined_val)

validation_accuracy_combined = accuracy_score(y_val, y_val_pred_combined)

clf_combined.fit(tfidf_features_combined_test, y_test)

y_test_pred_combined = clf_combined.predict(tfidf_features_combined_test)

test_accuracy_combined = accuracy_score(y_test, y_test_pred_combined)

report_dict_combined = classification_report(y_test, y_test_pred_combined, output_dict=True)
report_df_combined = pd.DataFrame(report_dict_combined).transpose()

print(f'Validation Accuracy with Combined Features: {validation_accuracy_combined:.2f}')
print(f'Test Accuracy with Combined Features: {test_accuracy_combined:.2f}')
report_df_combined


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Validation Accuracy with Combined Features: 0.54
Test Accuracy with Combined Features: 0.54


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
neg,0.0,0.0,0.0,46.0
pos,0.54,1.0,0.701299,54.0
accuracy,0.54,0.54,0.54,0.54
macro avg,0.27,0.5,0.350649,100.0
weighted avg,0.2916,0.54,0.378701,100.0
