In [None]:
pip install pandas



In [None]:
import pandas as pd

# Read the Kindle reviews CSV into a DataFrame
file_path = '/content/all_kindle_review.csv'
df = pd.read_csv(file_path)

# Derive a binary sentiment label from the star rating:
# ratings above 3 become 1 (positive); every other rating becomes 0 (negative)
df['Sentiment'] = df['rating'].gt(3).astype('int64')

# Preview the rating next to its derived label
print(df.head()[['rating', 'Sentiment']])




   rating  Sentiment
0       3          0
1       5          1
2       3          0
3       3          0
4       4          1


In [None]:
import nltk
# Download the NLTK data packages this notebook relies on
# (punkt and punkt_tab tokenizer data, plus the stop-word corpus)
for _nltk_package in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_nltk_package)

import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Load the dataset only if no earlier cell has already created `df`.
# Re-reading the CSV unconditionally would discard any columns previous
# cells added to `df` (such as the 'Sentiment' label).
file_path = '/content/all_kindle_review.csv'
if 'df' not in globals():
    df = pd.read_csv(file_path)

# Stop-word sets keyed by language, filled on first use so the NLTK corpus
# is read once instead of once per review.
_STOP_WORD_SETS = {}

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _get_stop_words(language='english'):
    """Return the (cached) NLTK stop-word set for `language`."""
    if language not in _STOP_WORD_SETS:
        _STOP_WORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_SETS[language]


# Define a function to preprocess text
def preprocess_text(text):
    """Normalize one review for feature extraction.

    Steps: lowercase the text, delete punctuation characters, tokenize with
    NLTK's ``word_tokenize`` and drop English stop words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove punctuation and special characters
    # NOTE: apostrophes are removed before the stop-word filter runs, so
    # contractions such as "didn't" become "didnt" and are kept.
    text = text.translate(_PUNCTUATION_TABLE)

    # 3. Tokenize and drop stop words
    stop_words = _get_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

# Apply preprocessing to the 'reviewText' column.
# Missing reviews are replaced by '' first: astype(str) on its own would turn
# NaN into the literal text 'nan', which would then survive as a token.
df['Processed_ReviewText'] = (
    df['reviewText'].fillna('').astype(str).apply(preprocess_text)
)

# Display the first few rows to verify
print(df[['reviewText', 'Processed_ReviewText']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                          reviewText  \
0  Jace Rankin may be short, but he's nothing to ...   
1  Great short read.  I didn't want to put it dow...   
2  I'll start by saying this is the first of four...   
3  Aggie is Angela Lansbury who carries pocketboo...   
4  I did not expect this type of book to be in li...   

                                Processed_ReviewText  
0  jace rankin may short hes nothing mess man hau...  
1  great short read didnt want put read one sitti...  
2  ill start saying first four books wasnt expect...  
3  aggie angela lansbury carries pocketbooks inst...  
4  expect type book library pleased find price right  


FEATURE EXTRACTION USING TfidfVectorizer



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load the dataset only if no earlier cell has already created `df`.
# An unconditional reload would drop 'Processed_ReviewText', which makes the
# guard below always true and re-runs tokenization over the whole corpus.
file_path = '/content/all_kindle_review.csv'
if 'df' not in globals():
    df = pd.read_csv(file_path)

# Reuse the preprocessed column when it exists; otherwise build it here.
# fillna('') keeps missing reviews from becoming the literal token 'nan'.
if 'Processed_ReviewText' not in df.columns:
    df['Processed_ReviewText'] = (
        df['reviewText'].fillna('').astype(str).apply(preprocess_text)
    )

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Maximum number of features (vocabulary size)
    ngram_range=(1, 2),  # Consider unigrams and bigrams
    stop_words='english'  # Remove stop words
)

# Fit and transform the Processed_ReviewText column (result is a sparse matrix)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Processed_ReviewText'])

# Convert the sparse matrix to a DataFrame for easier interpretation.
# NOTE: toarray() materializes the full dense matrix (rows x 5000 features);
# the models below are trained on the sparse `tfidf_matrix`, not on this frame.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

# Display the first few rows of the TF-IDF features
print("TF-IDF Features:")
print(tfidf_df.head())


TF-IDF Features:
   099   10  10 years  100  100 pages   11   12  12 stars   13   14  ...  \
0  0.0  0.0       0.0  0.0        0.0  0.0  0.0       0.0  0.0  0.0  ...   
1  0.0  0.0       0.0  0.0        0.0  0.0  0.0       0.0  0.0  0.0  ...   
2  0.0  0.0       0.0  0.0        0.0  0.0  0.0       0.0  0.0  0.0  ...   
3  0.0  0.0       0.0  0.0        0.0  0.0  0.0       0.0  0.0  0.0  ...   
4  0.0  0.0       0.0  0.0        0.0  0.0  0.0       0.0  0.0  0.0  ...   

   youve read  yummy  zach  zane  zero  zombie  zombies  zone  zorn  zsadist  
0         0.0    0.0   0.0   0.0   0.0     0.0      0.0   0.0   0.0      0.0  
1         0.0    0.0   0.0   0.0   0.0     0.0      0.0   0.0   0.0      0.0  
2         0.0    0.0   0.0   0.0   0.0     0.0      0.0   0.0   0.0      0.0  
3         0.0    0.0   0.0   0.0   0.0     0.0      0.0   0.0   0.0      0.0  
4         0.0    0.0   0.0   0.0   0.0     0.0      0.0   0.0   0.0      0.0  

[5 rows x 5000 columns]


In [None]:
# Imports used by this cell. Without them the cell raises NameError on a
# fresh kernel: MultinomialNB is not imported by any other cell, and the
# remaining names are only imported by a later cell.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Create the Sentiment column: 1 for ratings above 3, 0 otherwise
df['Sentiment'] = df['rating'].gt(3).astype('int64')

# Features are the sparse TF-IDF matrix; labels are the sentiment column
X = tfidf_matrix
y = df['Sentiment']

# Hold out 20% of the rows for testing; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Fit the classifier on the training data
nb_classifier.fit(X_train, y_train)

# Predict sentiment on the test data
y_pred = nb_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.83

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.82      0.82      1190
           1       0.83      0.83      0.83      1210

    accuracy                           0.83      2400
   macro avg       0.83      0.83      0.83      2400
weighted avg       0.83      0.83      0.83      2400


Confusion Matrix:
[[ 978  212]
 [ 205 1005]]


FEATURE EXTRACTION USING Word2Vec

> Each review is represented by the average of its Word2Vec word vectors and classified with Gaussian Naive Bayes.



In [None]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Ensure preprocessing has been applied.
# fillna('') keeps missing reviews from becoming the literal token 'nan'.
if 'Processed_ReviewText' not in df.columns:
    df['Processed_ReviewText'] = (
        df['reviewText'].fillna('').astype(str).apply(preprocess_text)
    )

# Tokenize the preprocessed text for Word2Vec (whitespace split)
tokenized_reviews = df['Processed_ReviewText'].apply(lambda x: x.split())

# Train a Word2Vec model
word2vec_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=100,  # Embedding size
    window=5,  # Context window size
    min_count=1,  # Minimum word frequency
    sg=0,  # Skip-gram (sg=1) or CBOW (sg=0)
    workers=4,  # Number of threads
    epochs=10,  # Number of training iterations
    # Fixed seed for the random number generator, matching the
    # random_state used for the train/test splits. With workers > 1 thread
    # scheduling can still vary between runs; use workers=1 (and a fixed
    # PYTHONHASHSEED) when fully reproducible vectors are required.
    seed=42,
)

# Sentence embedding = element-wise mean of the vectors of its known words
def get_sentence_vector(sentence, model, vector_size):
    """Average the word vectors of the tokens in `sentence`.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of one review.
    model : object exposing ``wv``
        Trained embedding model; ``model.wv`` must support ``in`` and
        lookup by a list of tokens.
    vector_size : int
        Length of the zero vector returned when no token is known.

    Returns
    -------
    numpy.ndarray
        Mean vector over the known tokens, or ``np.zeros(vector_size)`` when
        none of the tokens is present in ``model.wv``.
    """
    keyed_vectors = model.wv

    known_tokens = []
    for token in sentence:
        if token in keyed_vectors:
            known_tokens.append(token)

    if not known_tokens:
        return np.zeros(vector_size)

    return np.mean(keyed_vectors[known_tokens], axis=0)

# One averaged embedding per review; the extra positional arguments are
# forwarded to get_sentence_vector as (model, vector_size)
df['Sentence_Vector'] = tokenized_reviews.apply(
    get_sentence_vector, args=(word2vec_model, 100)
)

# Stack the per-review vectors into a 2-D feature matrix
X = np.vstack(df['Sentence_Vector'].to_numpy())
y = df['Sentiment']  # Labels (the Sentiment column must already exist)

# 80/20 train/test split with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# GaussianNB is used because the averaged Word2Vec features are continuous;
# fit() returns the fitted estimator itself
nb_classifier = GaussianNB().fit(X_train, y_train)

# Predictions for the held-out reviews
y_pred = nb_classifier.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix of true vs. predicted labels
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 0.76

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.78      0.76      1190
           1       0.77      0.73      0.75      1210

    accuracy                           0.76      2400
   macro avg       0.76      0.76      0.76      2400
weighted avg       0.76      0.76      0.76      2400


Confusion Matrix:
[[932 258]
 [326 884]]
