In [1]:
import string

import pandas as pd
import spacy
from autocorrect import Speller
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm

Load Data

- Read in the data and then shuffle

In [2]:
# Read the reviews with explicitly supplied column names, then shuffle the rows.
# A fixed seed keeps the shuffled order the same on every Restart & Run All.
SEED = 42
data = pd.read_table('TripAdvisorUKRestaurant-max_MF.txt', names=["rating","review"])
data = data.sample(frac=1, random_state=SEED)

In [None]:
# Helper function to print tokens in a sentence 

def print_tokens(doc):
    """Print one aligned row per token, then the vector of the last token.

    Each row shows the token's text, part-of-speech tag and dependency
    label in left-aligned columns of width 12, 10 and 10.

    Args:
        doc: Iterable of token objects exposing ``text``, ``pos_``, ``dep_``
            and ``vector`` attributes.

    Returns:
        None. Output goes to stdout; an empty ``doc`` prints nothing.
    """
    last_token = None
    for token in doc:
        print("{:<12}{:<10}{:<10}".format(token.text, token.pos_, token.dep_))
        last_token = token
    # Only the final token's vector is shown. Guard the lookup so an empty
    # doc does not fail on an unbound loop variable.
    if last_token is not None:
        print(last_token.vector)


Initialise SpaCy

You need to download the spacy model with:

`python -m spacy download en_core_web_md`

In [None]:
# Load the "en_core_web_md" pipeline once; `nlp` is reused by the cells below.
nlp = spacy.load("en_core_web_md")

Clean Text

1. Pronouns are removed from the text (identified via lemmatisation)
2. Then stopwords and punctuation are removed
3. Then the removal of spaces and numbers and undetected punctuation
4. Removal of URLs and emails
5. The whole sentence is then spell checked and corrected

In [5]:
def tokenizer(sentence):
    """Return the tokens of ``sentence`` that survive the cleaning filters.

    A token is dropped when any of the following holds:
      * its lemma is the "-PRON-" placeholder,
      * its text is a stop word (compared case-insensitively) or is found
        in ``string.punctuation``,
      * its part-of-speech tag is SPACE, PUNCT or NUM,
      * it looks like a URL or like an email address.

    Args:
        sentence: Iterable of token objects exposing ``text``, ``lemma_``,
            ``pos_``, ``like_url`` and ``like_email``.

    Returns:
        list: the surviving token objects, in their original order.
    """
    punctuation = string.punctuation
    skipped_pos = ("SPACE", "PUNCT", "NUM")
    kept = []
    for word in sentence:
        # NOTE(review): "-PRON-" looks like the pronoun lemma placeholder of
        # older spaCy releases; if the installed version no longer emits it,
        # this check never fires. Confirm against the installed version.
        if word.lemma_ == "-PRON-":
            continue
        # Lower-case before the lookup so capitalised stop words ("The") are
        # removed too; the caller lower-cases the surviving text anyway.
        if word.text.lower() in STOP_WORDS or word.text in punctuation:
            continue
        if word.pos_ in skipped_pos:
            continue
        # Drop the token when EITHER flag is set. The previous
        # `not like_url or not like_email` kept every URL and every email.
        if word.like_url or word.like_email:
            continue
        kept.append(word)
    return kept


In [6]:
def clean_text(num_entries=None):
    """Clean the leading reviews in ``data`` and return them as plain strings.

    Each review is parsed with ``nlp``, filtered through ``tokenizer``,
    lower-cased, joined with single spaces and finally passed through the
    autocorrect speller.

    Args:
        num_entries: Number of rows, counted from the top of ``data``, to
            process. ``None`` (the default) processes every row.

    Returns:
        list of str: one cleaned sentence per processed review, in row order.
    """
    spell = Speller(lang='en')
    text = []
    # A slice end of None selects through the last row, so the default
    # covers the whole frame.
    for review in tqdm(data[0:num_entries].review):
        clean_token = tokenizer(nlp(review))
        sentence = ' '.join(token.text.lower() for token in clean_token)
        text.append(spell(sentence))
    return text

Clean all the text and put into a list

In [7]:
# Clean every review in `data`; the row count is passed explicitly.
text = clean_text(len(data))





100%|██████████| 1000/1000 [01:40<00:00, 10.00it/s]
