In [None]:
#Dear teammates, these are some of the concepts you need to understand. I have also added comments alongside the code so that you can follow what is happening.

# Text preprocessing in NLP is like preparing raw text data so that it's easier for a computer to understand and analyze. It involves doing things like:

#     Lowercasing: Making all letters lowercase so that "hello" and "Hello" are treated the same.

#     Tokenization: Breaking down sentences into individual words or "tokens."

#     Removing Punctuation and Special Characters: Getting rid of unnecessary symbols like commas or exclamation marks.

#     Removing Stopwords: Removing common words (like "the" or "and") that don't add much meaning.

#     Stemming and Lemmatization: Simplifying words to their basic form (like turning "running" into "run").

In [12]:
#importing necessary libraries
import numpy as np
import pandas as pd
import string

In [None]:
#using the natural language toolkit (nltk) later in the preprocessing part
import nltk
nltk.download('punkt')      #tokenizer models used by word_tokenize
nltk.download('punkt_tab')  #newer NLTK releases load the Punkt tokenizer from this resource instead of 'punkt'; on older releases this download just reports an error and returns False
nltk.download('stopwords')  #stopword lists used by the cleaning functions


In [35]:
from nltk.corpus import stopwords
from nltk import word_tokenize

In [2]:
#load the reviews dataset into a DataFrame; the path is relative to the notebook's working directory
df = pd.read_csv("fake reviews dataset.csv")

In [3]:
df.head() #shows the first five rows of the dataset

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [5]:
df.isnull().sum()
#counts the missing values per column; the output below shows zero for every column

category    0
rating      0
label       0
text_       0
dtype: int64

In [9]:
df.columns #lists the names of the columns present in the dataset

Index(['category', 'rating', 'label', 'text_'], dtype='object')

In [10]:
df['rating'].value_counts() #counts how many rows have each rating value

rating
5.0    24559
4.0     7965
3.0     3786
1.0     2155
2.0     1967
Name: count, dtype: int64

In [14]:
#the function below removes punctuation characters and then removes all stopwords
def clean_text(text, stop_words=None):
    """Strip punctuation characters from ``text`` and drop stopwords.

    Parameters
    ----------
    text : str
        The raw text to clean.
    stop_words : iterable of str, optional
        Lowercase words to remove. When omitted, NLTK's English stopword
        list (``stopwords.words('english')``) is used.

    Returns
    -------
    str
        The remaining words joined by single spaces. Words keep their
        original casing; only the stopword comparison is case-insensitive.
    """
    if stop_words is None:
        #fetch the stopword list once per call instead of once per word
        stop_words = stopwords.words('english')
    #a set gives constant-time membership tests (the list was scanned for every word)
    stop_words = frozenset(stop_words)
    #iterating a string yields single characters, so this drops each punctuation character
    nopunc = ''.join(ch for ch in text if ch not in string.punctuation)
    return ' '.join(word for word in nopunc.split() if word.lower() not in stop_words)


In [17]:
#this adds a new column named afterclean to the dataset, holding each text after it has gone through the clean_text function
#running this can take a while on the full dataset
#have patience guys :)
df['afterclean'] = df['text_'].apply(clean_text)

In [18]:
df.head() #the afterclean column has now been added and contains the cleaned texts

Unnamed: 0,category,rating,label,text_,afterclean
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...",Love Well made sturdy comfortable love itVery ...
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...",love great upgrade original Ive mine couple years
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,pillow saved back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...",Missing information use great product price
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,nice set Good quality set two months


In [22]:
df[['text_','afterclean']].head() #compare the two columns: in the first row ("Love this! Well made...") the stopword 'this' and the exclamation mark present in text_ are gone from afterclean
#this marks the end of removing punctuation and stopwords.

Unnamed: 0,text_,afterclean
0,"Love this! Well made, sturdy, and very comfor...",Love Well made sturdy comfortable love itVery ...
1,"love it, a great upgrade from the original. I...",love great upgrade original Ive mine couple years
2,This pillow saved my back. I love the look and...,pillow saved back love look feel pillow
3,"Missing information on how to use it, but it i...",Missing information use great product price
4,Very nice set. Good quality. We have had the s...,nice set Good quality set two months


In [27]:
#we don't want a separate column: the cleaned text should replace the text_ column itself, so the afterclean column is removed here.
#errors="ignore" keeps this cell safe to re-run once the column is already gone (the in-place drop raised KeyError on a second run).
df = df.drop(columns="afterclean", errors="ignore")

In [29]:
#now that the steps above are understood, the same cleaning is applied to the text_ column itself
df['text_'] = df['text_'].apply(clean_text)

In [30]:
df.head() #the required output is shown below: text_ now holds the cleaned text

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,Love Well made sturdy comfortable love itVery ...
1,Home_and_Kitchen_5,5.0,CG,love great upgrade original Ive mine couple years
2,Home_and_Kitchen_5,5.0,CG,pillow saved back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,Missing information use great product price
4,Home_and_Kitchen_5,5.0,CG,nice set Good quality set two months


In [31]:
df['text_'] = df['text_'].astype(str) #casts every value in the text_ column of the DataFrame to the str type using astype

In [36]:
def preprocess(text, stop_words=None, tokenize=None):
    """Tokenize ``text`` and drop stopwords, all-digit tokens and punctuation tokens.

    Parameters
    ----------
    text : str
        The text to process.
    stop_words : iterable of str, optional
        Tokens to remove. The comparison is case-sensitive, so lowercase
        the text first if the list is lowercase. When omitted, NLTK's
        English stopword list (``stopwords.words('english')``) is used.
    tokenize : callable, optional
        Function mapping a string to a list of tokens. When omitted,
        NLTK's ``word_tokenize`` is used.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    if stop_words is None:
        #fetch the stopword list once per call instead of once per token
        stop_words = stopwords.words('english')
    if tokenize is None:
        tokenize = word_tokenize
    #a set gives constant-time membership tests (the list was scanned for every token)
    stop_words = frozenset(stop_words)
    return ' '.join(
        word
        for word in tokenize(text)
        if word not in stop_words
        and not word.isdigit()
        and word not in string.punctuation
    )
#the function above tokenizes, removes stopwords, removes numeric tokens and removes punctuation. Some of these steps (such as punctuation removal) were already performed above, so repeating them here has no further effect.

In [38]:
df['text_'] = df['text_'].str.lower() #converting everything to lower case; preprocess compares tokens against the stopword list without lowercasing them itself


In [39]:
#try preprocess on a single text (index label 0) before applying it to the whole column
preprocess(df['text_'][0])

'love well made sturdy comfortable love itvery pretty'

In [None]:
#Now we apply the preprocess function to every row of the text_ column in one go.
#A single apply over the whole column replaces hand-written 10,000-row slices for three reasons:
#  - slices such as [:10000] followed by [10001:20000] skip rows 10000, 20000, 30000 and 40000, leaving them unprocessed
#  - splitting the work into chunks does not make it faster; every row is still processed exactly once
#  - assigning through df['text_'][a:b] is chained assignment, which pandas warns about and which does not update df when copy-on-write is enabled
df['text_'] = df['text_'].apply(preprocess)

In [47]:
df.sample(5) #see five random rows; no need to shuffle the whole dataset just to display five

Unnamed: 0,category,rating,label,text_
12836,Movies_and_TV_5,5.0,OR,remember movie growing kids loved much great k...
8433,Electronics_5,5.0,CG,needed bag wife loves also loves fact
38431,Clothing_Shoes_and_Jewelry_5,5.0,OR,use south asia extreme weather conditions suff...
1107,Home_and_Kitchen_5,4.0,OR,juliennes zucchini nice ribbons zoodles freque...
11782,Electronics_5,5.0,CG,bought daughter use ipad loves itvery nicei bo...
