# Improved Sentiment Analyzer

a. Use the POS Tagger in Task 1 for POS tagging the dataset.

b. Implement a pipeline to integrate the POS tag features along with the sentence embeddings.

c. Train the same Classifier again for sentiment classification
using the new features.

In [None]:
import nltk
from nltk.corpus import treebank
from collections import defaultdict

# Fetch the treebank corpus, then keep its tagged sentences and the set of
# distinct tags that occur in it.
nltk.download("treebank")
sentences = treebank.tagged_sents()
tags = {tag for _, tag in treebank.tagged_words()}



def train_hmm(tagged_sentences):
    """Collect the raw counts of an HMM tagger from tagged sentences.

    Args:
        tagged_sentences: iterable of sentences, each an iterable of
            ``(word, tag)`` pairs.

    Returns:
        A 4-tuple ``(emission_counts, transition_counts, initial_counts,
        tag_counts)``:

        * ``emission_counts[tag][word]`` - times ``word`` carried ``tag``.
        * ``transition_counts[prev][tag]`` - times ``tag`` followed ``prev``;
          the first tag of a sentence is recorded under ``prev = None``.
        * ``initial_counts[tag]`` - times ``tag`` opened a sentence.
        * ``tag_counts[tag]`` - total occurrences of ``tag``.
    """
    emissions = defaultdict(lambda: defaultdict(int))
    transitions = defaultdict(lambda: defaultdict(int))
    starts = defaultdict(int)
    totals = defaultdict(int)

    for tagged_sentence in tagged_sentences:
        prev = None  # no predecessor yet at the start of each sentence
        for token, label in tagged_sentence:
            totals[label] += 1
            emissions[label][token] += 1
            if prev is None:
                starts[label] += 1
            transitions[prev][label] += 1
            prev = label

    return emissions, transitions, starts, totals

# Fit the HMM counts on the full corpus. These four globals are re-assigned
# further down, where the model is re-trained on the training split only.
emission_counts, transition_counts, initial_counts, tag_counts = train_hmm(sentences)

def viterbi(sentence, emission_counts, transition_counts, initial_counts, tag_counts,
            unknown_prob=0.0001):
    """Return the most likely tag sequence for ``sentence`` under the HMM.

    Scores are accumulated as log-probabilities so that long inputs do not
    underflow to zero. The count tables are only read (via ``.get``), so
    defaultdict models are not grown by lookups and plain dicts work too.

    Args:
        sentence: sequence of words to tag.
        emission_counts: ``emission_counts[tag][word]`` occurrence counts.
        transition_counts: ``transition_counts[prev_tag][tag]`` counts.
        initial_counts: ``initial_counts[tag]`` sentence-initial counts.
        tag_counts: ``tag_counts[tag]`` totals; its keys define the tag set
            and its iteration order breaks ties (first tag wins).
        unknown_prob: emission probability used when a (tag, word) pair was
            never observed. Must be greater than 0.

    Returns:
        A list with exactly one tag per word; ``[]`` for an empty sentence.

    Raises:
        ValueError: if the model is untrained (no tags or no initial counts).
    """
    from math import inf, log

    if not sentence:
        return []

    tag_list = list(tag_counts)
    total_initial = sum(initial_counts.values())  # hoisted: constant per call
    if not tag_list or total_initial <= 0:
        raise ValueError("viterbi requires a trained model (non-empty tag and initial counts)")

    log_unknown = log(unknown_prob)

    def emission_log_prob(tag, word):
        # .get avoids inserting zero entries into defaultdict-based models.
        count = emission_counts.get(tag, {}).get(word, 0)
        if count:
            return log(count / tag_counts[tag])
        return log_unknown

    # incoming[tag] lists (previous_tag, log P(tag | previous_tag)) for every
    # observed transition, in tag_counts order so tie-breaking is stable.
    incoming = {tag: [] for tag in tag_list}
    for previous_tag in tag_list:
        outgoing = transition_counts.get(previous_tag)
        previous_total = tag_counts[previous_tag]
        if not outgoing or previous_total <= 0:
            continue
        for current_tag, count in outgoing.items():
            if count > 0 and current_tag in incoming:
                incoming[current_tag].append((previous_tag, log(count / previous_total)))

    # Initialisation: tags never seen sentence-initially start at -inf.
    first_word = sentence[0]
    scores = {}
    for tag in tag_list:
        start_count = initial_counts.get(tag, 0)
        if start_count > 0:
            scores[tag] = log(start_count / total_initial) + emission_log_prob(tag, first_word)
        else:
            scores[tag] = -inf
    if all(score == -inf for score in scores.values()):
        # No known tag may start a sentence: fall back to emissions alone.
        scores = {tag: emission_log_prob(tag, first_word) for tag in tag_list}

    backpointers = []
    for t in range(1, len(sentence)):
        word = sentence[t]
        previous_scores = scores
        scores = {}
        pointers = {}
        for current_tag in tag_list:
            best_score = -inf
            best_previous = None
            for previous_tag, log_transition in incoming[current_tag]:
                candidate = previous_scores[previous_tag] + log_transition
                if candidate > best_score:
                    best_score = candidate
                    best_previous = previous_tag
            if best_previous is not None:
                # The emission term does not depend on the previous tag, so it
                # is added once after the maximisation.
                best_score += emission_log_prob(current_tag, word)
            scores[current_tag] = best_score
            pointers[current_tag] = best_previous

        if all(pointer is None for pointer in pointers.values()):
            # Dead end: no observed transition continues any live path.
            # Continue from the best previous tag with a floor transition
            # probability instead of failing during back-tracing.
            anchor = max(tag_list, key=previous_scores.__getitem__)
            base = previous_scores[anchor] + log_unknown
            scores = {tag: base + emission_log_prob(tag, word) for tag in tag_list}
            pointers = dict.fromkeys(tag_list, anchor)

        backpointers.append(pointers)

    # Back-trace from the best final tag.
    best_tag = max(tag_list, key=scores.__getitem__)
    best_path = [best_tag]
    for pointers in reversed(backpointers):
        best_tag = pointers[best_tag]
        best_path.append(best_tag)

    best_path.reverse()
    return best_path

# Vocabulary and tag inventory of the whole corpus.
unique_tags = sorted(tags)
unique_words = {word for word, _ in treebank.tagged_words()}

# 80/20 split in corpus order: the first 80% of sentences train the tagger,
# the remainder is held out for evaluation.
train_size = int(0.8 * len(sentences))
train_data, test_data = sentences[:train_size], sentences[train_size:]

# Re-train on the training split only; this replaces the earlier counts.
emission_counts, transition_counts, initial_counts, tag_counts = train_hmm(train_data)

def tag_test_data(test_data, emission_counts, transition_counts, initial_counts, tag_counts):
    """Tag every sentence of ``test_data`` with the Viterbi decoder.

    Args:
        test_data: iterable of sentences given as ``(word, gold_tag)`` pairs;
            the gold tags are ignored.
        emission_counts, transition_counts, initial_counts, tag_counts:
            HMM count tables, passed through to ``viterbi`` unchanged.

    Returns:
        A list with one entry per sentence, each a list of
        ``(word, predicted_tag)`` pairs.
    """
    def tag_one(sentence):
        words = [word for word, _ in sentence]
        predicted = viterbi(words, emission_counts, transition_counts, initial_counts, tag_counts)
        return list(zip(words, predicted))

    return [tag_one(sentence) for sentence in test_data]

# Tag the held-out sentences using the counts trained on the training split.
tagged_test_data = tag_test_data(test_data, emission_counts, transition_counts, initial_counts, tag_counts)

def calculate_accuracy(predicted_data, true_data):
    """Return token-level tagging accuracy as a percentage.

    Args:
        predicted_data: sentences of ``(word, predicted_tag)`` pairs.
        true_data: sentences of ``(word, gold_tag)`` pairs, aligned with
            ``predicted_data``.

    Returns:
        Percentage (0-100) of positions whose tags agree. Sentences and
        tokens are paired with ``zip``, so surplus items on either side are
        ignored. Returns ``0.0`` when there is nothing to compare instead of
        dividing by zero.
    """
    total_words = 0
    correct_words = 0
    for predicted_sentence, true_sentence in zip(predicted_data, true_data):
        for (_, predicted_tag), (_, true_tag) in zip(predicted_sentence, true_sentence):
            total_words += 1
            if predicted_tag == true_tag:
                correct_words += 1

    if total_words == 0:
        return 0.0
    return (correct_words / total_words) * 100

# Compare the predicted tags with the gold tags of the held-out split and
# report the token-level accuracy as a percentage.
accuracy = calculate_accuracy(tagged_test_data, test_data)
print(f"Accuracy on Test Data: {accuracy:.2f}%")


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


Accuracy on Test Data: 87.85%


In [None]:
# Preview the corpus: print the plain text of the first few sentences.
num_sentences_to_print = 5
print("First few sentences of the dataset:")
for i, sentence in enumerate(sentences[:num_sentences_to_print], start=1):
    words = [pair[0] for pair in sentence]  # drop the tag of each (word, tag) pair
    joined = ' '.join(words)
    print(f"Sentence {i}: {joined}")

First few sentences of the dataset:
Sentence 1: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
Sentence 2: Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group .
Sentence 3: Rudolph Agnew , 55 years old and former chairman of Consolidated Gold Fields PLC , was named *-1 a nonexecutive director of this British industrial conglomerate .
Sentence 4: A form of asbestos once used * * to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed * to it more than 30 years ago , researchers reported 0 *T*-1 .
Sentence 5: The asbestos fiber , crocidolite , is unusually resilient once it enters the lungs , with even brief exposures to it causing symptoms that *T*-1 show up decades later , researchers said 0 *T*-2 .


In [None]:
# Token count of the held-out split (test_data only) and size of the tag set.
total_unique_tags = len(unique_tags)
total_words_count = sum(map(len, test_data))

print(f"Total Words Count: {total_words_count}")
print(f"Total Unique Tags: {total_unique_tags}")

Total Words Count: 20039
Total Unique Tags: 46


In [None]:
# Preview the gold annotation: these (word, tag) pairs come straight from the
# corpus, they are not predictions of the HMM tagger.
num_sentences_to_print = 5
print("First few tagged sentences of the dataset:")
for i, tagged_sentence in enumerate(sentences[:num_sentences_to_print], start=1):
    print(f"Tagged Sentence {i}: {tagged_sentence}")


First few tagged sentences of the dataset:
Tagged Sentence 1: [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged Sentence 2: [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')]
Tagged Sentence 3: [('Rudolph', 'NNP'), ('Agnew', 'NNP'), (',', ','), ('55', 'CD'), ('years', 'NNS'), ('old', 'JJ'), ('and', 'CC'), ('former', 'JJ'), ('chairman', 'NN'), ('of', 'IN'), ('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP'), ('PLC', 'NNP'), (',', ','), ('was', 'VBD'), ('named', 'VBN'), ('*-1', '-NONE-'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('of', 'IN'), ('this', 'DT')

In [None]:
# Smoke-test the tagger on a hand-written, whitespace-tokenised sentence.
# Words with no emission count for a tag get viterbi's small unknown-word
# emission value for that tag.
test_sentence = "My name is Gangaram Sudewad .".split()
predicted_tags = viterbi(test_sentence, emission_counts, transition_counts, initial_counts, tag_counts)
for word, tag in zip(test_sentence, predicted_tags):
    print(f"Word: {word}, Predicted Tag: {tag}")


Word: My, Predicted Tag: DT
Word: name, Predicted Tag: NN
Word: is, Predicted Tag: VBZ
Word: Gangaram, Predicted Tag: VBN
Word: Sudewad, Predicted Tag: -NONE-
Word: ., Predicted Tag: .


In [None]:
import pandas as pd
unique_words = sorted(unique_words)
unique_tags = sorted(unique_tags)

# Build the dense tags x words table from the sparse counts. Only observed
# (tag, word) pairs are visited, and .get is used so that reading the model
# does not insert zero entries into the defaultdict-based count tables.
word_column = {word: column for column, word in enumerate(unique_words)}
emission_rows = []
for tag in unique_tags:
    row = [0.0] * len(unique_words)
    for word, count in emission_counts.get(tag, {}).items():
        column = word_column.get(word)
        if count > 0 and column is not None:
            row[column] = count / tag_counts[tag]
    emission_rows.append(row)

emission_matrix = pd.DataFrame(emission_rows, index=unique_tags, columns=unique_words, dtype=float)

pd.options.display.float_format = '{:.5f}'.format

# Words as rows, tags as columns (the transpose of emission_matrix).
emission_matrix_concatenated = emission_matrix.T

print("\nEmission Matrix:")
emission_matrix_concatenated



Emission Matrix:


Unnamed: 0,#,$,'',",",-LRB-,-NONE-,-RRB-,.,:,CC,...,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,``
!,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00193,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
#,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
$,0.00000,0.98963,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
%,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
&,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.03704,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zero,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
zinc,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
zip,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
zone,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000


In [None]:
import pandas as pd
from tabulate import tabulate

unique_tags = sorted(unique_tags)

# Row = previous tag, column = current tag, value = count(prev -> cur) divided
# by count(prev). .get keeps the lookups from inserting zero entries into the
# defaultdict-based count tables.
transition_rows = []
for previous_tag in unique_tags:
    outgoing = transition_counts.get(previous_tag, {})
    row = []
    for current_tag in unique_tags:
        count = outgoing.get(current_tag, 0)
        row.append(count / tag_counts[previous_tag] if count > 0 else 0.0)
    transition_rows.append(row)

transition_matrix = pd.DataFrame(transition_rows, index=unique_tags, columns=unique_tags, dtype=float)

pd.options.display.float_format = '{:.4f}'.format

# Transposed view (row = current tag); kept for parity with the emission cell.
transition_matrix_concatenated = transition_matrix.T

print("\nTransition Matrix:")

tags_df = pd.DataFrame(transition_matrix.values, columns=unique_tags, index=unique_tags)
tags_df


Transition Matrix:


Unnamed: 0,#,$,'',",",-LRB-,-NONE-,-RRB-,.,:,CC,...,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB,``
#,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'',0.0,0.0,0.0065,0.0,0.0065,0.0097,0.0016,0.0016,0.0049,0.0292,...,0.0682,0.0032,0.0049,0.0049,0.1364,0.0114,0.0,0.0016,0.0016,0.0
",",0.0,0.0121,0.0612,0.0,0.0003,0.0324,0.0,0.0,0.0,0.0801,...,0.0503,0.0174,0.0222,0.0086,0.0308,0.0291,0.0124,0.0008,0.0076,0.0149
-LRB-,0.0,0.1489,0.0,0.0,0.0,0.0106,0.0,0.0,0.0,0.0213,...,0.0,0.0106,0.0319,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-NONE-,0.0,0.0036,0.0004,0.0461,0.0013,0.0744,0.0034,0.0868,0.0097,0.0118,...,0.0308,0.0817,0.011,0.0244,0.0404,0.0002,0.0002,0.0,0.001,0.0025
-RRB-,0.0,0.0,0.0,0.22,0.0,0.04,0.0,0.12,0.12,0.02,...,0.07,0.0,0.01,0.01,0.02,0.01,0.0,0.0,0.0,0.01
.,0.0,0.0,0.0675,0.0,0.0022,0.0016,0.0055,0.0,0.0003,0.0,...,0.0003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0006
:,0.0,0.0062,0.0,0.0,0.0,0.0288,0.0,0.0123,0.0,0.0535,...,0.0247,0.0103,0.0123,0.0144,0.0123,0.0041,0.0165,0.0,0.0062,0.0453
CC,0.0,0.0114,0.0,0.012,0.0,0.0093,0.0,0.0,0.0005,0.0005,...,0.0381,0.024,0.0163,0.0114,0.0212,0.0011,0.0022,0.0,0.0038,0.0044


The next cell combines TF-IDF word-frequency features with POS-tag features to create a more informative feature set for sentiment analysis.

This approach can be beneficial as it considers both lexical and grammatical aspects of the text.


In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from collections import defaultdict

# Download the movie_reviews corpus if not already downloaded
nltk.download('movie_reviews')

# Load the movie_reviews corpus as (list of tokens, category) pairs
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents with a fixed seed so that runs are reproducible
import random
random.Random(42).shuffle(documents)

# Extract features (TF-IDF)
all_words = [word.lower() for word in movie_reviews.words()]
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform([' '.join(doc) for doc, _ in documents])

X = tfidf_features.toarray()
y = [category for _, category in documents]


# --- POS tagging -----------------------------------------------------------

# The tagger is fed short, sentence-like chunks rather than whole reviews:
# its initial-tag counts describe sentence starts, and a Viterbi decoder that
# multiplies raw probabilities can underflow to zero on very long inputs.
SENTENCE_END_TOKENS = {'.', '!', '?'}
MAX_TOKENS_PER_TAGGER_CALL = 40


def split_for_tagging(words):
    """Yield non-empty chunks of ``words``.

    A chunk ends after a sentence-final punctuation token or once it holds
    ``MAX_TOKENS_PER_TAGGER_CALL`` tokens, whichever comes first.
    """
    chunk = []
    for word in words:
        chunk.append(word)
        if word in SENTENCE_END_TOKENS or len(chunk) >= MAX_TOKENS_PER_TAGGER_CALL:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


# Tag the movie_reviews dataset using the Viterbi-based POS tagger.
# Each review becomes a list of (word, tag) pairs, in document order.
tagged_movie_reviews = []
for words, _ in documents:
    tagged_review = []
    for chunk in split_for_tagging(words):
        chunk_tags = viterbi(chunk, emission_counts, transition_counts, initial_counts, tag_counts)
        tagged_review.extend(zip(chunk, chunk_tags))
    tagged_movie_reviews.append(tagged_review)


# --- POS features ----------------------------------------------------------

# The tagger emits Penn Treebank tags (NN, NNS, JJ, VBD, ...); map them to
# coarse classes by prefix so that coarse names can be used as features.
PENN_PREFIX_BY_COARSE_TAG = {'NOUN': 'NN', 'ADJ': 'JJ', 'VERB': 'VB'}


def to_coarse_tag(penn_tag):
    """Return the coarse class of ``penn_tag``, or ``None`` if it has none."""
    if penn_tag is None:
        return None
    for coarse_tag, prefix in PENN_PREFIX_BY_COARSE_TAG.items():
        if penn_tag.startswith(prefix):
            return coarse_tag
    return None


def calculate_avg_pos_vectors(tagged_reviews, tag_set):
    """Return the average POS indicator vector of every review.

    Args:
        tagged_reviews: one entry per review, each a sequence of
            ``(word, tag)`` pairs; ``None`` or empty reviews give a zero row.
        tag_set: tags to use as features. An entry is matched either by the
            exact tag or by the coarse class the tag maps to
            (see ``PENN_PREFIX_BY_COARSE_TAG``).

    Returns:
        Array of shape ``(len(tagged_reviews), len(tag_set))``. Columns
        follow ``sorted(tag_set)``; each value is the fraction of the
        review's tokens carrying that tag, so all values are non-negative.
    """
    tag_order = sorted(tag_set)
    column_of = {tag: column for column, tag in enumerate(tag_order)}
    vectors = np.zeros((len(tagged_reviews), len(tag_order)))

    for row, review in enumerate(tagged_reviews):
        if not review:
            continue
        for _, tag in review:
            label = tag if tag in column_of else to_coarse_tag(tag)
            if label in column_of:
                vectors[row, column_of[label]] += 1
        vectors[row] /= len(review)

    return vectors


# Create a set of relevant POS tags (e.g., NOUN, ADJECTIVE, VERB)
relevant_pos_tags = {'NOUN', 'ADJ', 'VERB'}  # Adjust as needed

# Calculate the per-review average POS tag vectors
avg_pos_tag_vectors = calculate_avg_pos_vectors(tagged_movie_reviews, relevant_pos_tags)

# Combine POS tag vectors with TF-IDF features (one row per review)
combined_features = np.hstack((X, avg_pos_tag_vectors))

# Split the data into train, validation, and test sets. Splitting X and
# combined_features in the same call keeps their rows aligned.
X_train, X_temp, X_combined_train, X_combined_temp, y_train, y_temp = train_test_split(
    X, combined_features, y, test_size=0.2, random_state=42)
X_val, X_test, X_combined_val, X_combined_test, y_val, y_test = train_test_split(
    X_temp, X_combined_temp, y_temp, test_size=0.5, random_state=42)

# Train the classifier on the combined training data only, so the test set
# stays unseen during fitting
clf_combined = MultinomialNB()
clf_combined.fit(X_combined_train, y_train)

# Make predictions on the combined test set
y_combined_test_pred = clf_combined.predict(X_combined_test)

# Calculate combined test accuracy
combined_test_accuracy = accuracy_score(y_test, y_combined_test_pred)

# Generate the classification report as a DataFrame
report_dict = classification_report(y_test, y_combined_test_pred, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()

# Print the combined test accuracy and the classification report
print(f'Combined Test Accuracy: {combined_test_accuracy:.2f}')
print(report_df)
