# Preprocessing Dataset for NLP Modelling

In [1]:
# Loading in required modules 
import numpy as np 
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import string

### Reading in the Data

In [2]:
# Read in csv (Source - https://www.kaggle.com/lievgarcia/amazon-reviews)
# NOTE(review): absolute, machine-specific path - point this at your own copy
# of the file (ideally relative to the notebook) before running elsewhere.
DATA_PATH = r"C:\Machine Learning\UU\amazon_reviews.csv"
df = pd.read_csv(DATA_PATH, encoding = "latin")

# Fixing Labels
# "ï»¿" is the UTF-8 byte-order mark (bytes EF BB BF) as it looks once decoded
# as latin-1, which is why the first header has to be matched as "ï»¿LABEL".
df = df.rename(columns={"ï»¿LABEL":"class", "REVIEW_TEXT":"raw_sentence"})

# Re-labeling data for increased accessibility
# Mapping to binary
df["class"] = df["class"].map({"__label1__": 0, "__label2__": 1})
df["VERIFIED_PURCHASE"] = df["VERIFIED_PURCHASE"].map({"Y": 1, "N": 0})

# Series.map turns every value that is missing from the dict into NaN without
# warning, so fail loudly here instead of carrying corrupted labels downstream.
assert df["class"].notna().all(), "unexpected value in the label column"
assert df["VERIFIED_PURCHASE"].notna().all(), "unexpected value in VERIFIED_PURCHASE"

# Converting categorical data to numerical
# (factorize returns (codes, uniques); only the integer codes are kept)
df["PRODUCT_CATEGORY"] = pd.factorize(df["PRODUCT_CATEGORY"])[0]

df.sample(5)

Unnamed: 0,class,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_TITLE,REVIEW_TITLE,raw_sentence
17981,1,5,1,2,Best Baltic Amber Teething Necklace For Baby (...,Love this necklace,Love this necklace!!! My daughter is almost a ...
941,0,3,0,2,Badger Basket Five Basket Storage Unit with Wi...,Baskets are not even toy quality,We purchased this for our church nursery. This...
12908,1,5,1,0,iPazzPort 2.4GHz Mini Wireless Fly Keyboard wi...,Excellent and convenient,I have an HTPC (Home Theater Personal Computer...
2573,0,2,0,17,"3M WorkTunes Hearing Protector, MP3 Compatible...",makes a static sound,"I would not recommend this, unless it was free..."
4519,0,5,0,4,Perfecto 100% Pure Badger Shaving Brush-Silver...,Nice Shavinf brush,I got this one as a gift from perfecto for bei...


### Text Preprocessing

- Punctuation removal
- Tokenising (lower-cased)
- Part-of-speech tagging
- Lemmatizing (with stop-word removal)

In [3]:
def punc_remove(text):
    """
    Removing punctuation

    Deletes every character listed in ``string.punctuation`` from ``text``.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with all ASCII punctuation characters removed; every other
        character (including whitespace and digits) is kept in order.
    """
    # A translation table removes all punctuation in a single pass instead of
    # searching a list of punctuation marks once per character.
    # NOTE(review): HTML entities are not decoded beforehand, so "&#34;" is
    # reduced to "34" - consider unescaping the text before calling this.
    return text.translate(str.maketrans("", "", string.punctuation))

# Strip punctuation from every review; the column is overwritten in place
df["raw_sentence"] = df["raw_sentence"].progress_apply(punc_remove)

  0%|          | 0/21000 [00:00<?, ?it/s]

In [4]:
def tokenise(text):
    """
    Creating word tokens in lower case

    Parameters
    ----------
    text : str
        A single sentence / review.

    Returns
    -------
    Whatever ``word_tokenize`` returns for the lower-cased text
    (the tokens of ``text``).
    """
    # The parameter is deliberately required: a default of df["raw_sentence"]
    # would be evaluated once when the function is defined, and a whole Series
    # has no .lower(), so such a default could never be used anyway.
    return word_tokenize(text.lower())

# Lower-case and tokenise every review into a new column
df["token_sentence"] = df["raw_sentence"].progress_apply(tokenise)

  0%|          | 0/21000 [00:00<?, ?it/s]

In [5]:
def pos_tokens(tokens):
    """
    Returns tokenised and tagged words
    Required pre-process for nltk lemmatizing

    Parameters
    ----------
    tokens : list of str
        The word tokens of one sentence.

    Returns
    -------
    list of (word, tag) tuples
        One tuple per token. ``tag`` is ``wordnet.ADJ``, ``wordnet.VERB``,
        ``wordnet.NOUN`` or ``wordnet.ADV`` when the tag from ``nltk.pos_tag``
        starts with "J", "V", "N" or "R" respectively, and "" otherwise.
    """
    # The parameter is required on purpose: a default of df["token_sentence"]
    # would be evaluated once at definition time and would hand a whole column,
    # rather than one sentence, to the tagger.

    # Tagging using nltk
    tagged = nltk.pos_tag(tokens)

    # First letter of the nltk tag -> tag accepted by the lemmatiser
    wordnet_tags = {
        "J": wordnet.ADJ,   # Adjectives
        "V": wordnet.VERB,  # Verbs
        "N": wordnet.NOUN,  # Nouns
        "R": wordnet.ADV,   # Adverbs
    }

    # Anything else (and an empty tag) is mapped to ""
    return [(word, wordnet_tags.get(tag[:1], "")) for word, tag in tagged]

# Tag every tokenised review and store the (word, tag) pairs in a new column
df["pos_tagged_sentence"] = df["token_sentence"].progress_apply(pos_tokens)

  0%|          | 0/21000 [00:00<?, ?it/s]

In [6]:
# Shared lemmatiser instance used by get_all_lemmas
lm = WordNetLemmatizer()

def get_all_lemmas(pos_tokens=None):
    """
    Lemmatise all tagged sentences and store the result in ``df["lemmatized"]``.

    English stop words are dropped. Words tagged "a", "r", "n" or "v" are
    lemmatised with that tag; all other non-stop words are kept unchanged.

    Parameters
    ----------
    pos_tokens : iterable of list of (word, tag) tuples, optional
        One tagged sentence per row of ``df``. When omitted, the current
        ``df["pos_tagged_sentence"]`` column is used.
        (Inside this function the name shadows the ``pos_tokens`` function.)

    Returns
    -------
    pandas.Series
        A random sample of 5 rows of the new ``df["lemmatized"]`` column.

    Side effects
    ------------
    Writes the column ``"lemmatized"`` on the notebook-level ``df``.
    """
    # Resolve the default at call time; a default written in the signature
    # would be evaluated once at definition time and could go stale.
    if pos_tokens is None:
        pos_tokens = df["pos_tagged_sentence"]

    # Sets give constant-time membership checks inside the per-word loop
    stop = set(stopwords.words("english"))
    lemma_tags = {"a", "r", "n", "v"}

    lemmatized_list = []

    # Loop through all tagged sentences
    for sentence in tqdm(pos_tokens):
        lemmatized_list.append([
            # Only nouns, adverbs, adjectives and verbs have lemma form;
            # other non-stop words are kept as they are (NLP useful)
            lm.lemmatize(word, tag) if tag in lemma_tags else word
            for word, tag in sentence
            if word not in stop
        ])

    # Adding lemmas to dataframe
    df["lemmatized"] = lemmatized_list

    return df["lemmatized"].sample(5)

# Lemmatise the tagged column (adds df["lemmatized"]) and show a random sample
get_all_lemmas(df["pos_tagged_sentence"])

  0%|          | 0/21000 [00:00<?, ?it/s]

7212     [product, permit, take, pleasure, yard, refill...
34       [bag, look, like, little, small, version, one,...
3954     [light, receive, bright, color, different, loo...
12538    [many, think, pirate, favorite, letter, 34r34,...
20539    [buy, two, cover, center, thread, pandora, bra...
Name: lemmatized, dtype: object

In [7]:
# Remove the columns that are no longer needed
unwanted_columns = ["PRODUCT_TITLE", "REVIEW_TITLE", "token_sentence"]
df = df.drop(columns=unwanted_columns)

In [8]:
# Preview five random rows of the processed frame
df.sample(n=5)

Unnamed: 0,class,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,raw_sentence,pos_tagged_sentence,lemmatized
13213,1,5,1,5,I chose 5 stars because this product is usuall...,"[(i, n), (chose, v), (5, ), (stars, n), (becau...","[choose, 5, star, product, usually, expensive,..."
17976,1,2,0,22,B button on backside of wheel does not work un...,"[(b, ), (button, n), (on, ), (backside, n), (o...","[b, button, backside, wheel, work, unless, cra..."
12362,1,5,1,22,this is a Christmas gift for one of my grandso...,"[(this, ), (is, v), (a, ), (christmas, a), (gi...","[christmas, gift, one, grandson, hope, enjoy, ..."
6099,0,5,0,20,My cousin just bought tubes Its great and come...,"[(my, ), (cousin, n), (just, r), (bought, v), ...","[cousin, buy, tubes, great, come, consistent, ..."
19655,1,3,1,19,My old Clarks sandals were much better and las...,"[(my, ), (old, a), (clarks, a), (sandals, n), ...","[old, clarks, sandal, much, good, last, long, ..."


In [9]:
# Pickle the frame so that it keeps its full functionality when reloaded
output_path = "labelled_data_preprocessed.pkl"
df.to_pickle(output_path)