Things to think about
- Removing short reviews

In [1]:
import numpy as np
from bs4 import BeautifulSoup # for maniuplating html

# START OF REVIEW EXTRACTING

with open('car_output.txt', 'r') as stringfile: # so we load in ALL the data as a massive string
	car_output = stringfile.read()

out = ["<!DOCTYPE html>" + rest for rest in car_output.split("<!DOCTYPE html>")] # split up by start pf page, but add the start point

out_soup = [BeautifulSoup(html, "html.parser") for html in out[1:]] # convert to beautiful soup

# I think the above is standard, now we actually do our document (i.e. if change URLs to look at)

# Now extracting reviews - golf gives me 2098, could add another car to get more
review_rating = []
for page in out_soup:
	page_contents = page.find_all("div", itemprop="review") # locating all reviews on a page
	result = [(review.find(itemprop="reviewBody").get_text().encode("ascii","replace"), # getting text
	           review.find(class_="size8").get_text().encode("ascii","replace").split()[0]) for review in page_contents] # getting rating
	[review_rating.append(review) for review in result]

# Removing empty reviews and getting rid of paragraphs and saving review and rating as a tuple
review_rating_red = [(review[0].replace('\n', ' ').replace('\r', ''), review[1]) for review in review_rating if review[0] != "Reviewer left no comment"] # leaves us with 151 reviews

# Save all data just for safety
with open("reviews_ratings.txt", "w") as out_file:
	out_file.write("\n".join("%s %s" % review for review in review_rating_red))
    
# Getting reviews
reviews = [review[0] for review in review_rating_red]
reviews[1] = "Top car I've had every convertible VW have made and still have a mk 1 golf but not cabriollet" # Just randomly noticed this review was written by an idiot

# Getting ratings
ratings = [review[1] for review in review_rating_red] # distribution of ratings is not high, may not use

# END OF REVIEW EXTRACTION

In [2]:
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer # could change the stemmer or lemmatise

# START OF CLASS CREATION
# Will create a class that will get rating and text
class Review():
    # On initialising
    def __init__(self, review_text, review_rating):
        self.text = review_text.lower() # NOTE that only the tokens are fixed up, text kept the same
        self.rating = review_rating
        self.tokens = np.array(wordpunct_tokenize(self.text)) # Would like to split on punct as well to remove later
    
    # Remove non-alphanumeric
    def token_remove_alpha(self):
            self.tokens = np.array([word for word in self.tokens if word.isalpha()])
    
    # Remove stop words
    def token_stop_remove(self, stopwords):
            self.tokens = np.array([word for word in self.tokens if word not in stopwords])
            
    # Now stem everything
    def token_stem(self):
            self.tokens = np.array([PorterStemmer().stem(word) for word in self.tokens])
    
    # Define a process that will clean our tokens
    def token_clean_up(self, stopwords):    
        # Now cleaning up
        self.token_remove_alpha()
        self.token_stop_remove(stopwords)
        self.token_stem()
    
# END OF CLASS CREATION

Note that for the stop words used, also excluded some common "mistakes" like youll or youre

Also included some words previously included like "high", "first", "long", as they could refer to performance

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# START OF CORPUS
class Review_set():
    # On initialising
    def __init__(self, review_data, stopword_file):
        self.reviews = [Review(review[0], review[1]) for review in review_data] # feed in the tuples of ratings etc.
        self.create_stopwords(stopword_file) # stopword element
        
        # Now for each document we will get a cleaned up token set - note that this is done for each document
        self.clean_reviews() 
        
        self.tokenise_reviews() # return reviews but in tokenised form
        
        # Getting unique tokens in the review set - note we pull data from each document
        self.get_unique_tokens()
    
    # Loading and creating stopwords
    def create_stopwords(self, stopword_file):
        self.stopwords = np.array(np.loadtxt("stopwords.txt", dtype = str, delimiter = "\n"))
        
    # Cleaning all documents
    def clean_reviews(self):
        [review.token_clean_up(self.stopwords) for review in self.reviews]
        
    # Getting reviews in tokenised form
    def tokenise_reviews(self):
        self.tokenised_reviews = [" ".join(review.tokens) for review in self.reviews]
        
    # Getting the unique set of tokens
    def get_unique_tokens(self):
        self.unique_tokens = set()
        for review in self.reviews:
            self.unique_tokens = self.unique_tokens.union(review.tokens)
        
    # Document term matrix of CLEANED UP TOKENS - NOTE that will return in alphabetical order of words
    def doc_term_mat(self):
        vectorisor = CountVectorizer()
        count_fit = vectorisor.fit_transform(self.tokenised_reviews)
        
        self.DTM = count_fit.toarray() # return array of counts
    
    # Get tf_idf scores
    def tf_idf_score(self):   
        vectorisor = TfidfVectorizer()
        tf_idf_fit = vectorisor.fit_transform(self.tokenised_reviews)
        
        # Now return tuples of words and their associated score - note that am returning ALL
        self.tf_idf = zip(vectorisor.get_feature_names(),vectorisor.idf_)
        

Ultimately will want to find common topics or "trends" in the reviews. To identify this could look for
- Commonly occuring words: see common trends
- Important words: maybe we can use this to identify more in depth reviews that focus on faults

I guess then what would be left would be to try and identify the topics by looking at the common words

Could extend and look at some kind of sentiment analysis of the top words in a topic

Issues
- Haven't really specified stop words
- Superlatives etc. will really mess up the data, really want to focus on the technical aspects

In [4]:
# Creating the corpus etc
review_data = zip(reviews, ratings)
review_corpus = Review_set(review_data, "stopwords.txt")

In [5]:
# Just looking at dimensions and reducing by some words / documents

review_corpus.doc_term_mat()

doc_ratio = 0.00024

vectorisor = CountVectorizer(min_df = 20, max_df = 2000) # 1138
count_fit = vectorisor.fit_transform(review_corpus.tokenised_reviews)

print review_corpus.DTM.shape # OLD SHAPE

print sum(np.sum(review_corpus.DTM, axis = 0) == 1) # 1334 words only have one entry
print max(np.sum(count_fit.toarray(), axis = 0)) # no word is present in more than 1000

print count_fit.toarray().shape # NEW SHAPE
print sum(np.sum(count_fit.toarray(), axis = 1) == 0)

print count_fit.toarray()[np.sum(count_fit.toarray(), axis = 1) != 0,].shape # GET RID OF ZERO WORDS


(4194L, 4005L)
1861
913
(4194L, 418L)
269
(3925L, 418L)


In [6]:
import lda

count_fit = vectorisor.fit_transform(review_corpus.tokenised_reviews)
DT_matrix_reduced = count_fit.toarray()[np.sum(count_fit.toarray(), axis = 1) != 0,] # contain zero words
# This could be done in a different way whereby we delete from start any short reviews??

# PARAMETERS
n_topics = 3
n_iter = 30000
###

lda_model = lda.LDA(n_iter = n_iter, n_topics = n_topics)
#lda_model.fit(review_corpus.DTM)
lda_model.fit(DT_matrix_reduced)

# Running an LDA to try and uncover topics present

<lda.lda.LDA instance at 0x000000008C78B5C8>

In [148]:
#print [key for key in vectorisor.vocabulary_.iterkeys()]

In [7]:

vocab = sorted([word[0] for word in zip(vectorisor.get_feature_names(), np.asarray(count_fit.sum(axis = 0)).ravel())])

topic_word = lda_model.topic_word_
n_top_words = 12
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: year problem mile vw new servic time reliabl replac fault engin
Topic 1: reliabl comfort fuel economi econom excel famili run easi power engin
Topic 2: best year reliabl vw own car buy new will recommend drive


Remove things like "love" and general evaluation, because will do a sentiment thing on that later
- although then we will need a different set of stop words to do this, which seems inefficient


For the sentiment - could either do an average sentiment of each document and then average by weight
OR could do an actual sentiment on the words in the topic
- the only thing is that "engine" has no connotation, but "has a great engine" does, so we get by document rather than word

In [8]:
# SENTIMENT
## Getting AFINN
import string
AFINN = dict(map(lambda (k,v): (k,int(v)), [line.split('\t') for line in open("./AFINN/AFINN-111.txt")]))

## NOW HOW TO RETURN SCORE - note feed a string which is the entire review - then outside we loop
def sentiment_scores(review_string):
    # remove punct from string we feed
    string_punct_rem = review_string.translate(string.maketrans("",""), string.punctuation)
    # Go over each word in the string to get score
    score_list = map(lambda word: AFINN.get(word, 0), string_punct_rem.lower().split())
    
    return score_list

###

# Feeding in each review in string form
review_scores = [sentiment_scores(review) for review in reviews]
non_zero = map(lambda score_list: [score for score in score_list if score != 0], review_scores) # maybe should do an "if not None" or something
review_scores_sum = [sum(score) for score in review_scores]
review_scores_mean = [np.mean(score) if len(score) != 0 else 0 for score in non_zero] # should only include those entries that are non-zero`

Decision of whether to use full text - or to use some tokenised form. I guess the thing is that if we use tokenised, there will be many words not included, while the AFINN is supposed to be very complete


To proceed - we now have the sentiment score for each of the reviews (SHOULD EVALUATE A POOR SCORE ONE)
Then what we want to do is loop over the topics and add to its score - the proportion of the score explained by the document

In [9]:
review_scores_mean_red = np.array(review_scores_mean)[np.sum(count_fit.toarray(), axis = 1) != 0,]

# FIRST TRIAL OF TOPIC SENTIMENT
print np.dot(np.transpose(review_scores_mean_red),lda_model.doc_topic_)


[ 1254.29229625  2741.11178707  2637.23851991]


In [10]:
# get a look at the average probabilities
print lda_model.doc_topic_.mean(axis = 0)
print lda_model.doc_topic_.max(axis = 0)
print lda_model.doc_topic_.min(axis = 0)
print lda_model.doc_topic_.sum(axis = 0)


[ 0.30650529  0.35456518  0.33892953]
[ 0.99449036  0.98963731  0.99267399]
[ 0.003003    0.00146413  0.00095877]
[ 1203.03324433  1391.66834515  1330.29841053]


In [11]:
from __future__ import division

doc_topic_normed = lda_model.doc_topic_ / lda_model.doc_topic_.sum(axis = 0)

# Second trial with attempt to normalise
print np.dot(np.transpose(review_scores_mean_red), doc_topic_normed)

# Looks better now and at least has some sense

[ 1.04260818  1.96965879  1.98244131]


Ideas for further
- Could do a word cloud where we have size by probability and colour by sentiment
- Then another thing to do is to look at the relationship between frequency of words and their sentiment

In [12]:
# Trying to make a data frame where we have a word,
# the probability for each topic, and then the sentiment

# The words
words = vocab

# Their scores
scores_1 = topic_word[0] # comes as three lists
scores_2 = topic_word[1]
scores_3 = topic_word[2]

# Sentiments
sentiments = [AFINN.get(word, 0) for word in vocab]

# Putting them all together
data_for_export = np.asarray([words, scores_1, scores_2, scores_3, sentiments]).T

# Save csv
np.savetxt("data_export.csv", np.array(data_for_export), delimiter=",", fmt = "%s")


In [22]:
from sklearn import linear_model
import matplotlib.pyplot as plt
import math as mt

word_counts = np.sum(DT_matrix_reduced, axis = 0) # this is the count of the words over all documents

# Sentiments
sent_array = [sent for sent in sentiments if sent != 0]
sent_array = np.array(sent_array).reshape((len(sent_array),1))

# Getting the weighted counts
scores_red_1 = np.array([w for (s,w) in zip(sent_array, scores_1) if s != 0])
counts_red_1 = np.array([w[0] for (s,w) in zip(sent_array, weighted_counts_1) if s != 0])

scores_red_2 = np.array([w for (s,w) in zip(sent_array, scores_2) if s != 0])
counts_red_2 = np.array([w[0] for (s,w) in zip(sent_array, weighted_counts_2) if s != 0])

scores_red_3 = np.array([w for (s,w) in zip(sent_array, scores_3) if s != 0])
counts_red_3 = np.array([w[0] for (s,w) in zip(sent_array, weighted_counts_3) if s != 0])

weighted_counts_1 = scores_red_1 * counts_red_1
weighted_counts_2 = scores_red_2 * counts_red_2
weighted_counts_3 = scores_red_3 * counts_red_3

weighted_counts_1 = weighted_counts_1.reshape((len(weighted_counts_1),1))
weighted_counts_2 = weighted_counts_2.reshape((len(weighted_counts_2),1))
weighted_counts_3 = weighted_counts_3.reshape((len(weighted_counts_3),1))

[[  1.53295440e-08]
 [  2.81439258e-17]
 [  5.98189617e-09]
 [  3.48106866e-07]
 [  5.42773209e-07]
 [  3.68454764e-07]
 [  1.12317078e-06]
 [  2.85978192e-09]
 [  3.05774936e-06]
 [  9.79841268e-06]
 [  3.83236436e-17]
 [  3.47308020e-17]
 [  1.79383988e-07]
 [  5.47188982e-06]
 [  1.48672662e-06]
 [  4.61515990e-08]
 [  4.07932061e-07]
 [  3.29711004e-06]
 [  8.21868770e-07]
 [  4.11662430e-08]
 [  3.95585135e-08]
 [  1.55689802e-17]
 [  1.98918209e-07]
 [  1.67665941e-17]
 [  1.79642079e-17]
 [  1.98918209e-07]
 [  2.57486981e-16]
 [  2.69069461e-06]
 [  5.15680704e-09]
 [  1.21501082e-07]
 [  3.78729436e-08]
 [  7.45235921e-06]
 [  1.80181545e-08]
 [  2.65390010e-07]
 [  2.57486981e-17]
 [  3.89189157e-05]
 [  1.29673859e-07]
 [  4.67667113e-05]
 [  5.76327402e-08]
 [  3.29329944e-08]
 [  1.29602383e-06]
 [  6.04795001e-17]
 [  5.81016330e-07]
 [  1.87621702e-04]]


In [34]:
# Regression
regr_1 = linear_model.LinearRegression()
regr_2 = linear_model.LinearRegression()
regr_3 = linear_model.LinearRegression()

regr_1.fit(sent_array, weighted_counts_1)
regr_2.fit(sent_array, weighted_counts_2)
regr_3.fit(sent_array, weighted_counts_3)

# The coefficients
print 'Coefficients 1: \n', regr_1.coef_
print 'Coefficients 2: \n', regr_2.coef_
print 'Coefficients 3: \n', regr_3.coef_

print 'Y inter 1: \n', regr_1.intercept_
print 'Y inter 2: \n', regr_2.intercept_
print 'Y inter 3: \n', regr_3.intercept_

# Plot outputs
## 1
x = [x[0] for x in sorted(sent_array)]
y_1 = [w[0] for (s, w) in sorted(zip(sent_array, weighted_counts_1))]

y_fit_1 = [regr_1.intercept_[0] + regr_1.coef_[0] * x_val for x_val in x]

plt.plot(x, y_1, 'x')
plt.plot(x, y_fit_1, '-')
plt.xlabel("Sentiment")
plt.ylabel("Weighted count")
plt.title("Topic 2")
plt.show()

## 2
x = [x[0] for x in sorted(sent_array)]
y_2 = [w[0] for (s, w) in sorted(zip(sent_array, weighted_counts_2))]

y_fit_2 = [regr_2.intercept_[0] + regr_2.coef_[0] * x_val for x_val in x]

plt.plot(x, y_2, 'x')
plt.plot(x, y_fit_2, '-')
plt.xlabel("Sentiment")
plt.ylabel("Weighted count")
plt.title("Topic 1")
plt.show()

## 3
x = [x[0] for x in sorted(sent_array)]
y_3 = [w[0] for (s, w) in sorted(zip(sent_array, weighted_counts_3))]

y_fit_3 = [regr_3.intercept_[0] + regr_3.coef_[0] * x_val for x_val in x]

plt.plot(x, y_3, 'x')
plt.plot(x, y_fit_3, '-')
plt.xlabel("Sentiment")
plt.ylabel("Weighted count")
plt.title("Topic 3")
plt.show()

Coefficients 1: 
[[ -2.24812180e-06]]
Coefficients 2: 
[[ -8.97237542e-07]]
Coefficients 3: 
[[ 0.0004694]]
Y inter 1: 
[  9.47412870e-06]
Y inter 2: 
[  1.01625649e-05]
Y inter 3: 
[ 0.00056902]
