# **Fake Review Detection - Preprocessing**

In [1]:
!pip install contractions
!pip install nltk
!pip install emoji
!pip install langdetect

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2

### **Import Libraries**

In [15]:
import pandas as pd
import numpy as np

import re
import contractions
from tqdm import tqdm
# from textblob import TextBlob
import emoji
from langdetect import detect

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK resources once
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Initialize global variables
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### **Load and Explore the Data**

In [16]:
data=pd.read_csv('/content/fakeReviewData.csv')
data.sample(3)

Unnamed: 0,category,rating,label,text_
6950,Sports_and_Outdoors_5,5.0,CG,This thing is super bright and has the ability...
37888,Clothing_Shoes_and_Jewelry_5,2.0,CG,Nice design but do not go with the wide width....
13787,Movies_and_TV_5,5.0,OR,"His favorite movie, done up in a beautiful Voi..."


## **Data Preprocessing**

In [17]:
data.sample(3)

Unnamed: 0,category,rating,label,text_
26817,Kindle_Store_5,5.0,CG,"Great story, very fun. I recommend it. I reall..."
27449,Kindle_Store_5,5.0,CG,"This book has a little more of the ""real"" side..."
24348,Kindle_Store_5,3.0,CG,I received this book for an honest review.\n\n...


In [18]:
def add_space_between_emojis(text):
    # Split emojis using the emoji library and insert space between them
    emoji_list = emoji.emoji_list(text)
    for emoji_dict in emoji_list:
        text = text.replace(emoji_dict['emoji'], f" {emoji_dict['emoji']} ")  # Add space around emojis
    return text

def data_cleaning(x):
    try:
        # Skip if language is not English
        if detect(x) != 'en':
            return ""
    except:
        return ""

    x = x.lower()
    # Expand contractions
    x = contractions.fix(x)
    # Adding sapce bw emojis
    x = add_space_between_emojis(x)
    # Replace emojis with text
    x = emoji.demojize(x)
    # Remove HTML tags
    x = re.sub(r'<.*?>', '', x)
    # Remove URLs
    x = re.sub(r'http\S+|www\S+', '', x)
    # Remove anything that's not space and word
    x = re.sub(r'[^\w\s]', '', x)
    # Remove extra whitespace
    x = re.sub(r'\s+', ' ', x).strip()
    # Tokenization (split text into individual words)
    tokens = word_tokenize(x)
    # Stopword removal (remove common unimportant words)
    filtered_tokens = [word for word in tokens if word not in stop_words or word == "not"]
    # Lemmatization (reduce words to their base form)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join tokens back into a single string
    return " ".join(lemmatized_tokens)

> If the input string is very short, contains special characters, or is not easily detectable as English (e.g., emoji-heavy input), the langdetect.detect method might fail or return an empty result.

In [21]:
x=input("enter text: ")
display(data_cleaning(x))


enter text: hello hello one 2 3 mic check


'hello hello one 2 3 mic check'

In [20]:
tqdm.pandas()

data['cleaned_text']=data['text_'].progress_apply(data_cleaning)
data.sample(3)

100%|██████████| 40432/40432 [04:44<00:00, 142.32it/s]


Unnamed: 0,category,rating,label,text_,cleaned_text
38542,Clothing_Shoes_and_Jewelry_5,5.0,CG,I love this watch. It has the wide band which...,love watch wide band perfect watch gorgeous ba...
8906,Electronics_5,5.0,CG,I bought it. For the price it is an excellent ...,bought price excellent value looking cheaper pair
19354,Tools_and_Home_Improvement_5,5.0,CG,"I had and constantly used this tool, and it wo...",constantly used tool worked flawlessly downsid...


In [22]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.8,
    use_idf=True,
    norm='l2'
)
vectorized_data = tfidf_vectorizer.fit_transform(data['cleaned_text'])

# Convert sparse matrix to dense and add to the dataset
data['vectorized'] = list(vectorized_data.toarray())


In [23]:
data.sample(3)

Unnamed: 0,category,rating,label,text_,cleaned_text,vectorized
9582,Electronics_5,1.0,CG,Nothing happens or can happen. The only way t...,nothing happens happen way fix throw one bag p...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
35727,Toys_and_Games_5,1.0,OR,These are tiny. The product description says ...,tiny product description say 112 inc not make ...,"[0.0, 0.1943587534045035, 0.0, 0.0, 0.0, 0.0, ..."
10248,Electronics_5,2.0,OR,Got it to work finally. Can't hear much if any...,got work finally not hear much difference firs...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [24]:
top_words = pd.Series(tfidf_vectorizer.idf_, index=tfidf_vectorizer.get_feature_names_out()).sort_values()[:10]
print(top_words)

not       1.814510
great     2.272634
love      2.406102
one       2.424638
good      2.459362
would     2.598321
like      2.710540
well      2.724039
book      2.824619
little    2.882544
dtype: float64
