In [2]:
TEXT DATA: FILTERING, FLATTENING, AND CHUNKING

In [None]:
# Computing n-grams
import json
from itertools import islice

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the first 10,000 reviews (the file holds one JSON object per line).
# The context manager closes the file even if parsing fails, and islice stops
# cleanly when the file has fewer lines than requested instead of feeding an
# empty string to json.loads.
with open('yelp_academic_data_set_review.json', encoding='utf-8') as f:
    js = [json.loads(line) for line in islice(f, 10000)]
review_df = pd.DataFrame(js)

# Create feature transformers for unigrams, bigrams, trigrams.
# The default token pattern ignores single-character words, which is useful in
# practice because it trims uninformative words, but we include them here.
TOKEN_PATTERN = '(?u)\\b\\w+\\b'
bow_convertor = CountVectorizer(token_pattern=TOKEN_PATTERN)
bigram_convertor = CountVectorizer(ngram_range=(2, 2), token_pattern=TOKEN_PATTERN)
trigram_convertor = CountVectorizer(ngram_range=(3, 3), token_pattern=TOKEN_PATTERN)


def _feature_names(vectorizer):
    """Return the fitted vocabulary of `vectorizer` as a list.

    get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
    in favour of get_feature_names_out(); use the new method when it exists and
    fall back to the old one on older installations.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# Fit the transformers and look at the vocab size
bow_convertor.fit(review_df['text'])
words = _feature_names(bow_convertor)

bigram_convertor.fit(review_df['text'])
bigrams = _feature_names(bigram_convertor)

trigram_convertor.fit(review_df['text'])
trigrams = _feature_names(trigram_convertor)
print(len(words), len(bigrams), len(trigrams))

# Peek at a few entries from each vocabulary
print(words[:10])
print(bigrams[-10:])
print(trigrams[:10])

In [None]:
# Filtering using stemming
# Run NLTK's Porter stemmer over a handful of sample words and print each
# resulting stem on its own line, in the order listed.
import nltk

stemmer = nltk.stem.porter.PorterStemmer()
sample_words = ('flowers', 'zeroes', 'stemmer', 'sixties', 'sixty', 'goes', 'go')
for sample_word in sample_words:
    print(stemmer.stem(sample_word))

In [None]:
# POS Tagging and chunking
import json
from itertools import islice

import pandas as pd
import spacy
# TextBlob was used below without ever being imported.
from textblob import TextBlob

# Load the first 10 reviews (one JSON object per line). The context manager
# closes the file even if parsing fails; islice tolerates a shorter file.
with open('yelp_academic_data_set_review.json', encoding='utf-8') as f:
    js = [json.loads(line) for line in islice(f, 10)]
review_df = pd.DataFrame(js)

# Using spaCy
# Preload the language model. The 'en' shortcut link was removed in spaCy 3,
# so load the model by its package name instead.
nlp = spacy.load('en_core_web_sm')
# Create a pandas Series of spaCy Doc objects, one per review
doc_df = review_df['text'].apply(nlp)
# spaCy gives us coarse-grained parts of speech via pos_ and fine-grained
# parts of speech via tag_. Iterating over a Doc yields its tokens.
for token in doc_df[4]:
    print([token.text, token.pos_, token.tag_])
# spaCy also does some basic noun chunking...
print([chunk for chunk in doc_df[4].noun_chunks])

# Using TextBlob
# The default tagger in TextBlob uses the Pattern tagger.
# An NLTK tagger can also be specified, which works better for incomplete sentences.
blob_df = review_df['text'].apply(TextBlob)
print(blob_df[4].tags)
# Avoid naming the loop variable `np`, which conventionally refers to numpy.
print([phrase for phrase in blob_df[4].noun_phrases])

![Hypothesis Testing - 1](Images/Hypothesis_Tesing-1.jpg)

![Hypothesis Testing - 2](Images/Hypothesis_Tesing-2.jpg)

![Hypothesis Testing - 3](Images/Hypothesis_Tesing-3.jpg)

![Hypothesis Testing - 4](Images/Hypothesis_Tesing-4.jpg)