In [1]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import LogisticRegression


def calculatewordfreq(words):
    """Count how many times each item occurs in ``words``.

    Args:
        words: Iterable of hashable items (strings, tuples, ...).

    Returns:
        dict mapping every distinct item to its number of occurrences.
        Keys appear in order of first occurrence.
    """
    tally = {}
    for token in words:
        # Unseen tokens start from 0, so one statement covers both cases.
        tally[token] = tally.get(token, 0) + 1
    return tally


def calculate_ngrams(docs, n):
    """Split each document on whitespace and list its contiguous word n-grams.

    Args:
        docs: Iterable of strings; each one is tokenised with ``str.split()``.
        n: Number of words per n-gram; must be >= 1.

    Returns:
        A list with one entry per document, in input order. Each entry is the
        list of that document's n-grams, left to right, each n-gram being a
        tuple of ``n`` words. A document with fewer than ``n`` words yields an
        empty list.

    Raises:
        ValueError: If ``n`` is less than 1. Without this check ``n == 0``
            produces a run of empty tuples and a negative ``n`` produces
            arbitrary slices instead of n-grams.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    ngram_list = []
    for doc in docs:
        words = doc.split()
        # range() is empty when the document has fewer than n words.
        ngram_list.append(
            [tuple(words[start:start + n])
             for start in range(len(words) - n + 1)]
        )

    return ngram_list


def ngrams_to_vector(ngrams, vocabulary):
    """Turn per-document n-gram lists into count vectors over ``vocabulary``.

    Args:
        ngrams: Iterable of documents, each an iterable of n-grams.
        vocabulary: Collection of n-grams that defines the vector columns.
            Columns follow the iteration order of this collection, and it is
            iterated once per document.

    Returns:
        A list with one count vector (list of ints) per document. Entry ``j``
        of a vector is how often the ``j``-th vocabulary term occurs in that
        document; terms absent from the document count as 0, and n-grams
        outside the vocabulary are ignored.
    """
    rows = []
    for doc_ngrams in ngrams:
        counts = calculatewordfreq(doc_ngrams)
        rows.append([counts.get(term, 0) for term in vocabulary])
    return rows


# Load the reviews CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
data = pd.read_csv(
    '/Users/swastikagarwal/Downloads/SEM5_RVCE/LAB WORK/NLP/nlp dataset/Musical_instruments_reviews 4.csv')

# Features and labels are selected by column position.
# NOTE(review): presumably column 6 holds the review text and column 5 the
# rating -- confirm against the CSV header.
x, y = data.iloc[:, 6], data.iloc[:, 5]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)

# The n-gram size is read interactively.
# NOTE(review): input() blocks an unattended "Run All", and the recorded
# outputs depend on whichever value was typed.
n = int(input("Enter the value of n for n-grams: "))
xtrain_ngrams = calculate_ngrams(x_train, n)
xtest_ngrams = calculate_ngrams(x_test, n)

# The vocabulary is every distinct n-gram seen in the training documents only.
vocabulary = set()
for doc in xtrain_ngrams:
    vocabulary.update(doc)

# Count vectors for both splits, built over the same vocabulary.
xtrain_vector = ngrams_to_vector(xtrain_ngrams, vocabulary)
xtest_vector = ngrams_to_vector(xtest_ngrams, vocabulary)








In [2]:
# Fit a multinomial Naive Bayes classifier on the training count vectors.
# (A commented-out alternative, LogisticRegression(max_iter=100), used to sit
# here under the same variable name.)
multinomial_nb = MultinomialNB().fit(xtrain_vector, y_train)

# Predict labels for the held-out split and report the accuracy.
y_pred = multinomial_nb.predict(xtest_vector)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)


0.6731612274719923


In [3]:
# Score one hand-written review with the trained classifier. It goes through
# the same n-gram size and vocabulary as the training data.
review = "best product. very useful"
review_ngrams = calculate_ngrams([review], n)
review_vector = ngrams_to_vector(review_ngrams, vocabulary)
res = multinomial_nb.predict(review_vector)
predicted_label = res[0]  # one input document -> one prediction
print(predicted_label)


5.0


In [4]:
# Score a second hand-written review, again with the same n-gram size and
# vocabulary as the training data.
review = "Didn't fit my 1996 Fender Strat... so its not that good"
review_ngrams = calculate_ngrams([review], n)
review_vector = ngrams_to_vector(review_ngrams, vocabulary)
res = multinomial_nb.predict(review_vector)
predicted_label = res[0]  # one input document -> one prediction
print(predicted_label)


4.0


In [5]:
# Score a third hand-written review, again with the same n-gram size and
# vocabulary as the training data.
# NOTE(review): "TDefinitely" looks like a typo for "Definitely"; it is left
# as-is because changing it would change the n-grams and possibly the output.
review = "TDefinitely Not For The Seasoned Piano Player what to do."
review_ngrams = calculate_ngrams([review], n)
review_vector = ngrams_to_vector(review_ngrams, vocabulary)
res = multinomial_nb.predict(review_vector)
predicted_label = res[0]  # one input document -> one prediction
print(predicted_label)


2.0
