In [1]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import contractions
import string
from num2words import num2words

from tqdm import tqdm

In [2]:
# Fetch the NLTK resources this notebook relies on, then build the shared
# stop-word set and stemmer used by clean_text.
nltk.download('stopwords')
# nltk.word_tokenize (called in clean_text) needs the Punkt tokenizer data,
# which was never downloaded here, so a fresh environment failed with
# LookupError. Recent NLTK releases look for 'punkt_tab' instead of 'punkt';
# requesting both keeps the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
# A set gives constant-time membership checks during stop-word filtering.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data preprocessing

We will need to do some data preprocessing so that we can properly vectorize the data for our machine learning methods.

In [3]:
# Load the filtered review data; the path is relative to the notebook's
# working directory.
# NOTE(review): the 'Unnamed: 0' column in the output below is presumably an
# index written by an earlier to_csv call -- confirm before dropping it.
df = pd.read_csv('../Data/filtered_data.csv')
# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,overall,reviewText,asin
0,5,I don't spend a lot on my flags because they r...,9539723809
1,5,A very dear friend of mine is slowly losing he...,B00000JSZH
2,5,This is absolutely exquisite! It's made of car...,B00000JSZH
3,4,"This is really nice to use, however, just not ...",B00000JSZH
4,5,This Angel is beautiful. I as so glad I chose ...,B00000JSZH


The preprocessing steps are as follows:

1. Lowercase the text
2. Expand contractions, e.g. "don't" -> "do not"
3. Remove HTML tags, extra whitespace and special characters. This has already been done for our dataset, but we want the function to also handle newly typed reviews (not in our dataset)
4. Remove punctuation
5. Convert numerical data to text data, e.g. "21" -> "twenty-one"
6. Remove punctuation again since num2words adds a hyphen
7. Tokenize the words for stemming and stop word removal
8. Stop word removal, as stop words add no value to sentiment
9. Word stemming to simplify the text for the vectorization

In [4]:
# Translation table that deletes every ASCII punctuation character. Built once
# at definition time instead of twice on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, normalise typographic quotes, expand
    contractions, strip HTML tags, collapse whitespace, delete ASCII
    punctuation, spell out integer tokens with num2words, delete the
    punctuation num2words introduces, tokenize, drop stop words, stem.

    Parameters
    ----------
    text : str
        Raw review text. Missing values (None / NaN) yield an empty string;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (may be empty).

    Notes
    -----
    Relies on the notebook-level ``stop_words`` set and ``stemmer`` object.
    """
    # A missing review would otherwise crash on .lower() (NaN is a float).
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Replace typographic quotes with their ASCII equivalents *before*
    # expanding contractions, so "don’t" is expanded to "do not" rather than
    # collapsing to "dont" when punctuation is stripped.
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')

    # Expand contractions
    text = contractions.fix(text)

    # Replace HTML tags with a space; deleting them outright would fuse the
    # neighbouring words ("good<br/>bad" -> "goodbad").
    text = re.sub('<[^<]+?>', ' ', text)

    # Collapse runs of whitespace (including the spaces added above)
    text = re.sub(r'\s+', ' ', text)

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Replace numbers with their written form. isdecimal() only accepts
    # characters int() can parse; isdigit() also accepts characters such as
    # "²", which made int() raise ValueError.
    words = [
        num2words(int(word)) if word.isdecimal() else word
        for word in text.split()
    ]
    text = ' '.join(words)

    # Remove the hyphens and commas that num2words adds ("twenty-one")
    text = text.translate(_PUNCTUATION_TABLE)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Drop stop words, then stem what is left
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # Join the tokens back into a string
    return " ".join(tokens)

# Register tqdm's `progress_apply` on pandas objects so the long-running
# apply below reports progress (the recorded run took roughly 2.5 minutes
# for 179,642 rows).
tqdm.pandas()
# Run clean_text on every review and store the result in a new column; the
# original 'reviewText' column is left unchanged.
df['preprocessed_text'] = df['reviewText'].progress_apply(clean_text)
# Preview the raw and preprocessed text side by side.
df.head()

100%|████████████████████████████████████████████████████████████████████████| 179642/179642 [02:24<00:00, 1246.20it/s]


Unnamed: 0,overall,reviewText,asin,preprocessed_text
0,5,I don't spend a lot on my flags because they r...,9539723809,spend lot flag realli get beat lesser qualiti ...
1,5,A very dear friend of mine is slowly losing he...,B00000JSZH,dear friend mine slowli lose sight pen make po...
2,5,This is absolutely exquisite! It's made of car...,B00000JSZH,absolut exquisit made cardboard like descript ...
3,4,"This is really nice to use, however, just not ...",B00000JSZH,realli nice use howev color saddl shimmer give...
4,5,This Angel is beautiful. I as so glad I chose ...,B00000JSZH,angel beauti glad chose one even beauti look o...


In [5]:
# Spot-check the pipeline on the first review: print the raw text, a blank
# line, then the preprocessed version for direct comparison.
print(df.iloc[0]['reviewText'], '\n')
print(df.iloc[0]['preprocessed_text'])

I don't spend a lot on my flags because they really get beat up. So I but the lesser quality figuring that they'll have to be replaced a couple of times a year anyways. The fact that this very inexpensive flag is still flying 8 months later pleases me well. It's up about 20 feet, 7/24, and sees winds averaging 5 to 10 mph on a 'normal' day with the occasional storm friont gusts reaching as high as 60+ mph. So far (8 months later) it's just now starting to show a little fraying on the ends. I'll be buying another shortly to have ready when this one finally gives in. 

spend lot flag realli get beat lesser qualiti figur replac coupl time year anyway fact inexpens flag still fli eight month later pleas well twenti feet seven hundr twentyfour see wind averag five ten mph normal day occasion storm friont gust reach high sixti mph far eight month later start show littl fray end buy anoth short readi one final give


In [6]:
# Persist the frame (including the new 'preprocessed_text' column) for later
# use; index=False keeps the DataFrame index out of the file.
df.to_csv('../Data/preprocessed_data.csv', index=False)