In [1]:
import pandas as pd

# Load the dataset
file_path = 'fakeReviewData.csv'
raw_data = pd.read_csv(file_path)

# Display the first few rows
print(raw_data.head())


             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  


In [2]:
# Handling missing values by removing rows with any NaN values
cleaned_data = raw_data.dropna()

# Removing duplicate rows
cleaned_data = cleaned_data.drop_duplicates()

print(cleaned_data.head())


             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  


In [3]:
import re


def normalize_text(text):

    # Converting text to lowercase
    text = text.lower()

    # Removing punctuation, special characters, and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Applying the normalization function to the text column
cleaned_data['text_'] = cleaned_data['text_'].apply(normalize_text)

print(cleaned_data.head())


             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  love this  well made sturdy and very comfortab...  
1  love it a great upgrade from the original  ive...  
2  this pillow saved my back i love the look and ...  
3  missing information on how to use it but it is...  
4  very nice set good quality we have had the set...  


In [6]:
import nltk
from nltk.tokenize import word_tokenize

# Downloading the NLTK tokenizer models (only needs to be done once)
# nltk.download('punkt')
# nltk.download('punkt_tab')

# Tokenizing the 'text_' column using NLTK's word_tokenize function
cleaned_data['tokens'] = cleaned_data['text_'].apply(word_tokenize)

print(cleaned_data.head())


             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  
0  [love, this, well, made, sturdy, and, very, co...  
1  [love, it, a, great, upgrade, from, the, origi...  
2  [this, pillow, saved, my, back, i, love, the, ...  
3  [missing, information, on, how, to, use, it, b...  
4  [very, nice, set, good, quality, we, have, had...  


In [8]:
from nltk.corpus import stopwords

# Downloading stopwords list and tokenizer models (only needs to be done once)
# nltk.download('stopwords')

# list of English stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):

    filtered_tokens = []

    for word in tokens:

        # Check if the word is not in the set of stopwords
        if word not in stop_words:

            # Add the word to the filtered list if it's not a stopword
            filtered_tokens.append(word)

    return filtered_tokens

cleaned_data['tokens'] = cleaned_data['tokens'].apply(remove_stopwords)

print(cleaned_data.head())

             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  
0  [love, well, made, sturdy, comfortable, love, ...  
1  [love, great, upgrade, original, ive, mine, co...  
2    [pillow, saved, back, love, look, feel, pillow]  
3  [missing, information, use, great, product, pr...  
4       [nice, set, good, quality, set, two, months]  


In [10]:
from nltk.stem import WordNetLemmatizer

# Downloading the WordNet lemmatizer models (only needs to be done once)
# nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def apply_lemmatization(tokens):

    # Initialize an empty list to store lemmatized tokens
    lemmatized_tokens = []

    # Iterate over each token in the list
    for word in tokens:

        # Lemmatize the word and add it to the lemmatized_tokens list
        lemmatized_tokens.append(lemmatizer.lemmatize(word,pos='v'))

    return lemmatized_tokens


# Applying lemmatization to the 'tokens' column
cleaned_data['tokens'] = cleaned_data['tokens'].apply(apply_lemmatization)

print(cleaned_data.head())


             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  
0  [love, well, make, sturdy, comfortable, love, ...  
1  [love, great, upgrade, original, ive, mine, co...  
2     [pillow, save, back, love, look, feel, pillow]  
3    [miss, information, use, great, product, price]  
4       [nice, set, good, quality, set, two, months]  


In [11]:
from gensim.models import Word2Vec

# Step 1: Prepare Corpus
# Extract the 'tokens' column as a list of tokenized sentences
corpus = cleaned_data['tokens'].tolist()

# Step 2: Train Word2Vec Model
# Initialize and train the Word2Vec model
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1)

# Step 3: Create Sentence Vectors
# Function to compute the average word vector for a list of tokens

def get_sentence_vector(tokens, model):

  # Initialize an empty list to store word vectors
  vectors = []

  # Loop through each word in tokens
  for word in tokens:
     # Check if the word exists in the Word2Vec model's vocabulary
      if word in model.wv:
          # Retrieve the word vector and append it to the list
          vectors.append(model.wv[word])


  if vectors:
      return sum(vectors) / len(vectors)
  else:
      return [0] * model.vector_size  # Return zero vector for empty tokens



# Applying the function to compute sentence vectors
cleaned_data['sentence_vector'] = cleaned_data['tokens'].apply(lambda x: get_sentence_vector(x, model))

# Step 4: Save Processed Data
cleaned_data.to_csv("processed_fake_review_data.csv", index=False)

from google.colab import files
files.download("processed_fake_review_data.csv")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
data = pd.read_csv('processed_fake_review_data.csv');
print(data.head())

             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  \
0  ['love', 'well', 'make', 'sturdy', 'comfortabl...   
1  ['love', 'great', 'upgrade', 'original', 'ive'...   
2  ['pillow', 'save', 'back', 'love', 'look', 'fe...   
3  ['miss', 'information', 'use', 'great', 'produ...   
4  ['nice', 'set', 'good', 'quality', 'set', 'two...   

                                     sentence_vector  
0  [ 0.18759157 -0.03918396 -0.34630

In [13]:
model.wv.most_similar('good')

[('great', 0.7249922156333923),
 ('decent', 0.7241107225418091),
 ('awesome', 0.6723101735115051),
 ('predictablei', 0.6567014455795288),
 ('enjoyable', 0.6471998691558838),
 ('excellent', 0.6431293487548828),
 ('impress', 0.6352933645248413),
 ('authorit', 0.6327791213989258),
 ('nice', 0.6307988166809082),
 ('roomthis', 0.6217061281204224)]