In [3]:
import numpy as np
from bs4 import BeautifulSoup # for maniuplating html

# START OF REVIEW EXTRACTING

with open('car_output.txt', 'r') as stringfile: # so we load in ALL the data as a massive string
	car_output = stringfile.read()

out = ["<!DOCTYPE html>" + rest for rest in car_output.split("<!DOCTYPE html>")] # split up by start pf page, but add the start point

out_soup = [BeautifulSoup(html, "html.parser") for html in out[1:]] # convert to beautiful soup

# I think the above is standard, now we actually do our document (i.e. if change URLs to look at)

# Now extracting reviews - golf gives me 2098, could add another car to get more
review_rating = []
for page in out_soup:
	page_contents = page.find_all("div", itemprop="review") # locating all reviews on a page
	result = [(review.find(itemprop="reviewBody").get_text().encode("ascii","replace"), # getting text
	           review.find(class_="size8").get_text().encode("ascii","replace").split()[0]) for review in page_contents] # getting rating
	[review_rating.append(review) for review in result]

# Removing empty reviews and getting rid of paragraphs and saving review and rating as a tuple
review_rating_red = [(review[0].replace('\n', ' ').replace('\r', ''), review[1]) for review in review_rating if review[0] != "Reviewer left no comment"] # leaves us with 151 reviews

# Save all data just for safety
with open("reviews_ratings.txt", "w") as out_file:
	out_file.write("\n".join("%s %s" % review for review in review_rating_red))
    
# Getting reviews
reviews = [review[0] for review in review_rating_red]
reviews[1] = "Top car I've had every convertible VW have made and still have a mk 1 golf but not cabriollet" # Just randomly noticed this review was written by an idiot

# Getting ratings
ratings = [review[1] for review in review_rating_red] # distribution of ratings is not high, may not use

# END OF REVIEW EXTRACTION

In [24]:
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer # could change the stemmer or lemmatise

# START OF CLASS CREATION
# Will create a class that will get rating and text
class Review():
    # On initialising
    def __init__(self, review_text, review_rating):
        self.text = review_text.lower()
        self.rating = review_rating
        self.tokens = np.array(wordpunct_tokenize(self.text)) # Would like to split on punct as well to remove later
    
    # Remove non-alphanumeric
    def token_remove_alpha(self):
            self.tokens = np.array([word for word in self.tokens if word.isalpha()])
    
    # Remove stop words
    def token_stop_remove(self, stopwords):
            self.tokens = np.array([word for word in self.tokens if word not in stopwords])
            
    # Now stem everything
    def token_stem(self):
            self.tokens = np.array([PorterStemmer().stem(word) for word in self.tokens])
    
    # Define a process that will clean our tokens
    def token_clean_up(self, stopwords):    
        # Now cleaning up
        self.token_remove_alpha()
        self.token_stop_remove(stopwords)
        self.token_stem()
    
# END OF CLASS CREATION

Note that for the stop words used, also excluded some common "mistakes" like youll or youre

Also included some words previously included like "high", "first", "long", as they could refer to performance

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# START OF CORPUS
class Review_set():
    # On initialising
    def __init__(self, review_data, stopword_file):
        self.reviews = [Review(review[0], review[1]) for review in review_data] # feed in the tuples of ratings etc.
        self.create_stopwords(stopword_file) # stopword element
        
        # Now for each document we will get a cleaned up token set - note that this is done for each document
        self.clean_reviews() 
        
        self.tokenise_reviews() # return reviews but in tokenised form
        
        # Getting unique tokens in the review set - note we pull data from each document
        self.get_unique_tokens()
    
    # Loading and creating stopwords
    def create_stopwords(self, stopword_file):
        self.stopwords = np.array(np.loadtxt("stopwords.txt", dtype = str, delimiter = "\n"))
        
    # Cleaning all documents
    def clean_reviews(self):
        [review.token_clean_up(self.stopwords) for review in self.reviews]
        
    # Getting reviews in tokenised form
    def tokenise_reviews(self):
        self.tokenised_reviews = [" ".join(review.tokens) for review in self.reviews]
        
    # Getting the unique set of tokens
    def get_unique_tokens(self):
        self.unique_tokens = set()
        for review in self.reviews:
            self.unique_tokens = self.unique_tokens.union(review.tokens)
        
    # Document term matrix of CLEANED UP TOKENS - NOTE that will return in alphabetical order of words
    def doc_term_mat(self):
        vectorisor = CountVectorizer()
        count_fit = vectorisor.fit_transform(self.tokenised_reviews)
        
        self.DTM = count_fit.toarray() # return array of counts
    
    # Get tf_idf scores
    def tf_idf_score(self):   
        vectorisor = TfidfVectorizer()
        tf_idf_fit = vectorisor.fit_transform(self.tokenised_reviews)
        
        # Now return tuples of words and their associated score - note that am returning ALL
        self.tf_idf = zip(vectorisor.get_feature_names(),vectorisor.idf_)
        

Ultimately will want to find common topics or "trends" in the reviews. To identify this could look for
- Commonly occuring words: see common trends
- Important words: maybe we can use this to identify more in depth reviews that focus on faults

I guess then what would be left would be to try and identify the topics by looking at the common words

Could extend and look at some kind of sentiment analysis of the top words in a topic

Issues
- Haven't really specified stop words
- Superlatives etc. will really mess up the data, really want to focus on the technical aspects

In [132]:
# Creating the corpus etc
review_data = zip(reviews, ratings)
review_corpus = Review_set(review_data, "stopwords.txt")

In [103]:
# Need to be careful because the actual "REVIEWS" does NOT have any of these attributes and need to go over it better
#print review_corpus.reviews[1].text # THIS returns the text / tokens / rating etc

#print review_corpus.stopwords # gets our stopwords

#print review_corpus.unique_tokens # now correct (Y)


#print review_corpus.tokenised_reviews # this correctly returns a list of all reivews in tokenised form

# GOT EVERYTHING TO WORK
#from operator import itemgetter #sorted(data,key=itemgetter(1))
#review_corpus.tf_idf_score()

#print sorted(review_corpus.tf_idf, key = itemgetter(1)) # now we can see some of the most common terms used, e.g. car

In [138]:
review_corpus.doc_term_mat()
print review_corpus.DTM.shape # the reviews are far too dissimilar to each other, need to cut down on rare words

print sum(np.sum(review_corpus.DTM, axis = 0) == 1) # 1334 words only have one entry

vectorisor = CountVectorizer(min_df = 0.0004767)
count_fit = vectorisor.fit_transform(review_corpus.tokenised_reviews)

print count_fit.toarray().shape
print sum(np.sum(count_fit.toarray(), axis = 1) == 0)

print count_fit.toarray()[np.sum(count_fit.toarray(), axis = 1) != 0,].shape

(2098L, 2906L)
1334
(2098L, 1548L)
12
(2086L, 1548L)


In [146]:
import lda

#print np.argmin(np.sum(review_corpus.DTM, axis = 1)) # PROBELM in that some review was only stop words
#print review_corpus.reviews[137].text

vectorisor = CountVectorizer(min_df = 0.0004767)
count_fit = vectorisor.fit_transform(review_corpus.tokenised_reviews)
DT_matrix_reduced = count_fit.toarray()[np.sum(count_fit.toarray(), axis = 1) != 0,]

# PARAMETERS
n_topics = 5
n_iter = 10000
###

lda_model = lda.LDA(n_iter = n_iter, n_topics = n_topics)
#lda_model.fit(review_corpus.DTM)
lda_model.fit(DT_matrix_reduced)

# Running an LDA to try and uncover topics present

<lda.lda.LDA instance at 0x0000000013388048>

In [143]:
print [key for key in vectorisor.vocabulary_.iterkeys()]

[u'four', u'hate', u'forget', u'categori', u'accur', u'cabriolet', u'sorri', u'swap', u'regrett', u'pride', u'worth', u'risk', u'distort', u'affect', u'school', u'factori', u'reliabilti', u'enjoy', u'rusti', u'direct', u'brill', u'hors', u'blue', u'neg', u'asid', u'near', u'new', u'told', u'occas', u'punch', u'pleasur', u'daughter', u'anymor', u'everyon', u'precis', u'smoke', u'pack', u'tremend', u'brought', u'unsur', u'total', u'unit', u'highli', u'call', u'recommend', u'type', u'relax', u'relat', u'haha', u'notic', u'warn', u'warm', u'adult', u'hole', u'hold', u'must', u'torquey', u'accid', u'word', u'room', u'cheapest', u'work', u'mk', u'spec', u'ms', u'brake', u'volkswagon', u'climat', u'give', u'liter', u'want', u'end', u'quot', u'cylind', u'travel', u'faulti', u'hot', u'disappoint', u'confid', u'pokey', u'lag', u'badli', u'wrong', u'beauti', u'arch', u'third', u'minim', u'think', u'perform', u'maintain', u'enter', u'worst', u'haldex', u'order', u'wind', u'origin', u'fall', u'fine

In [147]:
# Note, maybe should take names from an attribute of DT_MATRIX_REDUCE

# Vocab needs to be a sorted list of words that went IN to the LDA
# vocab = sorted(list(review_corpus.unique_tokens))
vocab = sorted([key for key in vectorisor.vocabulary_.iterkeys()]) # nice
topic_word = lda_model.topic_word_
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: car good drive fuel reliabl economi comfort
Topic 1: car golf year love drive new mile
Topic 2: car golf best own vw love great
Topic 3: car vw problem year mile servic issu
Topic 4: car great drive reliabl good comfort love
