# Fake Review Detection: Data Preprocessing
### The Code is for the given steps that are mentioned in the checkpoint in order to clean and preprocess the data for further use.

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 1. Load Data

In [2]:
def Load(file_path):
    data = pd.read_csv(file_path)
    return data

# Load dataset
file_path = 'fakeReviewData.csv'
data = Load(file_path)
data.head()


Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


## 2. Data Cleaning

In [3]:
def clean_data(data):
    # Remove duplicate rows
    data = data.drop_duplicates()
    data = data.dropna(subset=['text_'])
    
    # Remove reviews < 5 words
    data = data[data['text_'].str.split().str.len() > 5]
    
    # Remove reviews with URLs
    data = data[~data['text_'].str.contains('http|www|\.com', na=False)]
    
    return data


data = clean_data(data)
data.head()


Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


## 3. Text Normalization

In [4]:
def Normalize(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation, special characters, and numbers
    return text

data['text_'] = data['text_'].apply(Normalize)
data.head()


Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...


## 4. Tokenization

In [5]:
def TokenizeText(data):
    data['tokens'] = data['text_'].apply(word_tokenize)
    return data

data = TokenizeText(data)
data.head()


Unnamed: 0,category,rating,label,text_,tokens
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...,"[love, this, well, made, sturdy, and, very, co..."
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...,"[love, it, a, great, upgrade, from, the, origi..."
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...,"[this, pillow, saved, my, back, i, love, the, ..."
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...,"[missing, information, on, how, to, use, it, b..."
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...,"[very, nice, set, good, quality, we, have, had..."


## 5. Stopword Removal

In [6]:
def RemoveStopwords(data):
    stop_words = set(stopwords.words('english'))
    data['tokens'] = data['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
    return data

data = RemoveStopwords(data)
data.head()


Unnamed: 0,category,rating,label,text_,tokens
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...,"[love, well, made, sturdy, comfortable, love, ..."
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...,"[love, great, upgrade, original, ive, mine, co..."
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...,"[pillow, saved, back, love, look, feel, pillow]"
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...,"[missing, information, use, great, product, pr..."
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...,"[nice, set, good, quality, set, two, months]"


## 6. Lemmatization

In [7]:
import nltk
nltk.download('omw-1.4')

def Lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    data['tokens'] = data['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    return data

data = Lemmatization(data)
data.head()


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,category,rating,label,text_,tokens
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...,"[love, well, made, sturdy, comfortable, love, ..."
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...,"[love, great, upgrade, original, ive, mine, co..."
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...,"[pillow, saved, back, love, look, feel, pillow]"
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...,"[missing, information, use, great, product, pr..."
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...,"[nice, set, good, quality, set, two, month]"


## 7. Vectorization

In [8]:
def VectorizeText(data):
    data['processed_text'] = data['tokens'].apply(lambda x: ' '.join(x))
    tfid = TfidfVectorizer()
    vectors = tfid.fit_transform(data['processed_text'])
    return vectors, tfid

vectors, tfid = VectorizeText(data)
print(f"Vectorized data shape: {vectors.shape}")


Vectorized data shape: (40264, 42905)


## 8. Save Preprocessed Data
### To save the data in the folder

In [9]:
def SaveCleanData(data, file_path):
    data.to_csv(file_path, index=False)

output_path = 'CHP1_fakeReviewData.csv'
SaveCleanData(data, output_path)
