The code I have found on the internet to score a test based on sentiWordNet from NLTK

In [65]:
"""
Class to score sentiment of text.
Use domain-independent method of dictionary lookup of sentiment words,
handling negations and multiword expressions. Based on SentiWordNet 3.0.
"""

import nltk
import re


class SentimentAnalysis(object):
    """Class to get sentiment score based on analyzer."""

    def __init__(self, filename='SentiWordNet.txt', weighting='geometric'):
        """Initialize with filename and choice of weighting."""
        if weighting not in ('geometric', 'harmonic', 'average'):
            raise ValueError(
                'Allowed weighting options are geometric, harmonic, average')
        # parse file and build sentiwordnet dicts
        self.swn_pos = {'a': {}, 'v': {}, 'r': {}, 'n': {}}
        self.swn_all = {}
        self.build_swn(filename, weighting)

    def average(self, score_list):
        """Get arithmetic average of scores."""
        if(score_list):
            return sum(score_list) / float(len(score_list))
        else:
            return 0

    def geometric_weighted(self, score_list):
        """"Get geometric weighted sum of scores."""
        weighted_sum = 0
        num = 1
        for el in score_list:
            weighted_sum += (el * (1 / float(2**num)))
            num += 1
        return weighted_sum

    # another possible weighting instead of average
    def harmonic_weighted(self, score_list):
        """Get harmonic weighted sum of scores."""
        weighted_sum = 0
        num = 2
        for el in score_list:
            weighted_sum += (el * (1 / float(num)))
            num += 1
        return weighted_sum

    def build_swn(self, filename, weighting):
        """Build class's lookup based on SentiWordNet 3.0."""
        records = [line.split('\t') for line in open(filename)]
        for rec in records:
            # has many words in 1 entry
            words = rec[4].split()
            pos = rec[0]
            for word_num in words:
                word = word_num.split('#')[0]
                sense_num = int(word_num.split('#')[1])

                # build a dictionary key'ed by sense number
                if word not in self.swn_pos[pos]:
                    self.swn_pos[pos][word] = {}
                self.swn_pos[pos][word][sense_num] = float(
                    rec[2]) - float(rec[3])
                if word not in self.swn_all:
                    self.swn_all[word] = {}
                self.swn_all[word][sense_num] = float(rec[2]) - float(rec[3])

        # convert innermost dicts to ordered lists of scores
        for pos in self.swn_pos.keys():
            for word in self.swn_pos[pos].keys():
                newlist = [self.swn_pos[pos][word][k] for k in sorted(
                    self.swn_pos[pos][word].keys())]
                if weighting == 'average':
                    self.swn_pos[pos][word] = self.average(newlist)
                if weighting == 'geometric':
                    self.swn_pos[pos][word] = self.geometric_weighted(newlist)
                if weighting == 'harmonic':
                    self.swn_pos[pos][word] = self.harmonic_weighted(newlist)

        for word in self.swn_all.keys():
            newlist = [self.swn_all[word][k] for k in sorted(
                self.swn_all[word].keys())]
            if weighting == 'average':
                self.swn_all[word] = self.average(newlist)
            if weighting == 'geometric':
                self.swn_all[word] = self.geometric_weighted(newlist)
            if weighting == 'harmonic':
                self.swn_all[word] = self.harmonic_weighted(newlist)

    def pos_short(self, pos):
        """Convert NLTK POS tags to SWN's POS tags."""
        if pos in set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']):
            return 'v'
        elif pos in set(['JJ', 'JJR', 'JJS']):
            return 'a'
        elif pos in set(['RB', 'RBR', 'RBS']):
            return 'r'
        elif pos in set(['NNS', 'NN', 'NNP', 'NNPS']):
            return 'n'
        else:
            return 'a'

    def score_word(self, word, pos):
        """Get sentiment score of word based on SWN and part of speech."""
        try:
            return self.swn_pos[pos][word]
        except KeyError:
            try:
                return self.swn_all[word]
            except KeyError:
                return 0

    def score(self, sentence):
        """Sentiment score a sentence."""
        # init sentiwordnet lookup/scoring tools
        impt = set(['NNS', 'NN', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS',
                    'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN',
                    'VBP', 'VBZ', 'unknown'])
        non_base = set(['VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NNS', 'NNPS'])
        negations = set(['not', 'n\'t', 'less', 'no', 'never',
                         'nothing', 'nowhere', 'hardly', 'barely',
                         'scarcely', 'nobody', 'none'])
        stopwords = nltk.corpus.stopwords.words('english')
        wnl = nltk.WordNetLemmatizer()

        scores = []
        tokens = nltk.tokenize.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)

        index = 0
        for el in tagged:

            pos = el[1]
            try:
                word = re.match('(\w+)', el[0]).group(0).lower()
                start = index - 5
                if start < 0:
                    start = 0
                neighborhood = tokens[start:index]

                # look for trailing multiword expressions
                word_minus_one = tokens[index-1:index+1]
                word_minus_two = tokens[index-2:index+1]

                # if multiword expression, fold to one expression
                if(self.is_multiword(word_minus_two)):
                    if len(scores) > 1:
                        scores.pop()
                        scores.pop()
                    if len(neighborhood) > 1:
                        neighborhood.pop()
                        neighborhood.pop()
                    word = '_'.join(word_minus_two)
                    pos = 'unknown'

                elif(self.is_multiword(word_minus_one)):
                    if len(scores) > 0:
                        scores.pop()
                    if len(neighborhood) > 0:
                        neighborhood.pop()
                    word = '_'.join(word_minus_one)
                    pos = 'unknown'

                # perform lookup
                if (pos in impt) and (word not in stopwords):
                    if pos in non_base:
                        word = wnl.lemmatize(word, self.pos_short(pos))
                    score = self.score_word(word, self.pos_short(pos))
                    if len(negations.intersection(set(neighborhood))) > 0:
                        score = -score
                    scores.append(score)

            except AttributeError:
                pass

            index += 1

        if len(scores) > 0:
            return sum(scores) / float(len(scores))
        else:
            return 0

    def is_multiword(self, words):
        """Test if a group of words is a multiword expression."""
        joined = '_'.join(words)
        return joined in self.swn_all

Import the required libraries

In [146]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

read data

In [147]:
engine = create_engine('postgresql://donya:@localhost:5432/tripadvisor')
df = pd.read_sql('test_senti', engine)
# df['SentiWordnetH'] = 0

Check if the data has been read correctly from postgresql database

In [87]:
text = df['preprocessed_text'][1]
words = text.split()
print(words)


['my', 'wife', 'and', 'i', 'recently', 'flew', 'jet', 'airways', 'from', 'mumbai', 'to', 'lhr', 'and', 'lhr', 'to', 'delhi', 'and', 'flights', 'were', 'impeccable', 'for', 'one', 'we', 'were', 'ahead', 'of', 'schedule', 'both', 'legs', 'something', 'which', 'regular', 'travellers', 'value', 'most', 'check', 'in', 'and', 'boarding', 'were', 'efficient', 'and', 'without', 'fuss', '737', 'ers', 'were', 'configured', 'perfectly', 'whilst', 'we', 'flew', 'business', 'seats', 'in', 'economy']


SentiWordNet Approach

In [75]:
s = SentimentAnalysis(filename='SentiWordNet.txt',weighting='harmonic')


1. Remove Stop words(a, an, the, punctuation)

2. Remove Punctuation

 3. Convert Negative to negtag_ ( “not”, “no”, “none”, “nei- ther”, “never” or “nobody”.)

4. Convert Numbers (I have commented this part, I can do it later)

5. Get Sentiment Score based on SentiWordnet

In [80]:
neg_pat = re.compile(r'( not | nobody | none | no | neither | never )(\w+)')
stopwords = set(['a', 'an', 'the'])
print(stopwords)
stop_pat = re.compile('(\s)a(\s)|(\s)an(\s)|(\s)the(\s)')
df['preprocessed_text'] = df['text_review']
df['preprocessed_text'] = df['preprocessed_text'].str.lower()
#remove stopwords
df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: stop_pat.sub(' ', x) )

#remove punctuation
df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: re.sub(r'[^\w\s]',' ',x))

#convert Negative
df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: re.sub(neg_pat.search(x).group(),' neg_'+ neg_pat.search(x).group(2) , x) if len(neg_pat.findall(x))>0 else x )

#Convert Numbers
# df.loc[i, 'text_review']=  re.sub(' [0-9]+ ', 'NumTag', review)

df['SentiWordnetH']= df['preprocessed_text'].apply(lambda x: s.score(x) if x is not np.nan else -1000)




{'a', 'an', 'the'}


Build Term Document Matrix(unigram & bigram)

In [88]:
is_bad = df['review_rating']<40
is_bad = np.array(is_bad)
is_good = df['review_rating']>=40
is_good =np.array(is_good)

In [89]:

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(ngram_range=(1, 2))
X = vec.fit_transform(df['preprocessed_text'])
print(X.shape)


(604625, 2786495)


L1 (Weighted field-specific lexicon)

In [103]:
words_score = pd.DataFrame({'neg': np.ravel(np.sum(X[is_bad,:],axis=0)), 'pos': np.ravel(np.sum(X[is_good, :],axis=0))}, index = vec.get_feature_names())
words_score['SW']= (words_score['pos']- words_score['neg'])/(words_score['pos']+ words_score['neg'])
words_score_filtered =words_score.loc[words_score['pos']+words_score['neg'] >75]


Get quantiles

In [105]:
print(words_score_filtered.SW.quantile([0.25,0.5,0.75]))


0.25   -0.101636
0.50    0.151918
0.75    0.401264
Name: SW, dtype: float64


Set the first quartile 'Negative' and 4th quartile 'Positive' (As said in the article)

In [108]:
words_score_neg = words_score_filtered.loc[words_score_filtered['SW']<=-0.101636] 
words_score_pos = words_score_filtered.loc[words_score_filtered['SW']>= 0.401264] 



Save SentiWordnet Scores

In [109]:
words_score_neg.to_sql('Negative Words', engine)
words_score_pos.to_sql('Positive Words', engine)
words_score_filtered.to_sql('AllWordsScore', engine)

Get the Prediction from L1(?)

In [130]:
#Make a dictionary
lexicon_pd = pd.concat([words_score_neg['SW'], words_score_pos['SW']], axis=0)
lexicon = lexicon_pd.to_dict()
def calculateScore(text):
    score = 0
    count = 0
    for word in text.split():
        if word in lexicon:
            score = score + lexicon[word]
            count +=1
    if count >0:
        return score/count
    else:
        #the lexicon could not analyse this text
        return -1000

In [148]:
df['L1_prediction']= df['preprocessed_text'].apply(lambda x: calculateScore(x))

The lexicon could not assign polarity to 185 texts(train) and 78 texts(test)

In [149]:
len(df['L1_prediction'][df['L1_prediction']==-1000])

78

In the following you can see these texts

In [150]:
df['preprocessed_text'][df['L1_prediction']==-1000]

4123       plane itself did neg_impress me as it was pre...
6201      \nthought we were taking quality airline on re...
6687      \nmy friends flew with different company  had ...
6802      \nthere really isnt much you can review on ba ...
9073      \n我是用15 000 asia milies 換  個心態是不想浪漫15 000分  於是...
10925     \nairline is getting smaller and smaller  if t...
14179      whole experience  checking in  boarding  cabi...
27461     \nrestrictions  restrictions  restrictions  an...
29530     \nmein eingecheckter koffer sowie der meiner f...
30892     \ni observed how much stewardess were sullen  ...
32737     \nshort flight to phitsanulok and seems ok for...
34117     \nthis was small plane so we had to check our ...
38562     \nsikerült belefutni egy őrült check in pultos...
38718     \nyou know what you re getting with ryanair  i...
45464     \nthis was lot more better experience than pre...
49731     \nde classificatie  bijzonder goed  krijgt tra...
51912     \nnothing special  but we had 

M1 (Machine Learning- Naive Bayes Classifier)

In [110]:
df['review_rating_b']= df['review_rating'].apply(lambda x: True if x >=40 else False)

In [111]:
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
train_data = df[['preprocessed_text', 'review_rating_b']]
train_data = shuffle(train_data)
pipe = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),('clf', MultinomialNB()),])
model = pipe.fit(train_data['preprocessed_text'],train_data['review_rating_b'])


Get the Prediction from M1

In [151]:
df['M1_prediction'] = model.predict(df['preprocessed_text'])

A part of our final train dataframe

In [152]:
print(df.loc[1:10,:])

    index  id  review_rating split  SentiWordnetH  \
1     754   2             10  test       0.000090   
2     755   2             20  test       0.066018   
3     756   2             30  test       0.003976   
4     757   2             20  test       0.033217   
5     758   2             30  test       0.049401   
6     759   2             10  test       0.010155   
7     760   2             10  test      -0.057323   
8     761   2             10  test      -0.011332   
9     762   2             10  test       0.033313   
10    763   2             30  test       0.110153   

                                    preprocessed_text  L1_prediction  \
1   \nthis year was same as last  always late  tv ...       0.003314   
2   \nyour typical round trip flight to europe min...       0.392721   
3   \nflying back from ireland was only little bet...       0.036385   
4   \nwhen we checked in flight was shown as depar...      -0.183497   
5   \ntight squeeze walking to back of plane and i...   

Export the resulting data

In [153]:
df.to_sql('test_labelled', engine)