### In this tutorial we will train a sentiment classifier on a sample dataset

In [1]:
import csv
import random
from nltk.corpus import sentiwordnet as swn

In [4]:
################## Loading data file #######################
reader_train = csv.reader(open('C:/Desktop/data/sentiment_analysis/training.csv','r'))
reader_test = csv.reader(open('C:/desktop/data/sentiment_analysis/test.csv','r'))
training_data = []
test_data = []
header = 1
for row in reader_train:
        if header==1:
                header=0
                continue
        training_data.append(row)
header=1
for row in reader_test:
        if header==1:
                header=0
                continue
        test_data.append(row)

IOError: [Errno 2] No such file or directory: 'C:/Desktop/data/sentiment_analysis/training.csv'

In [3]:
# Examples from training data
print(training_data[0])
print(len(training_data), len(test_data))

["Not Everybody loves Raymond.  The only joke here is that the idea actually found its way onto the airwaves.  In other words it\\'s just one joke less than a one joke program.  As with most sitcoms, it appeals to those with an intellectual development that was stopped at age 8.  Fortunately they did have the good sense to employ a laugh-track so that the audience will be signaled whenever something is broadcast that was intended to be funny. The program amounts to a scandalous waste of talent.", 'neg']
385 80


In [4]:
#Required for Bag of words (unigram features) creation
vocabulary = [x.lower() for tagged_sent in training_data for x in tagged_sent[0].split()]
vocabulary = list(set(vocabulary))
vocabulary.sort() #sorting the list
print(len(vocabulary))
# print(vocabulary)

2759


################## Extracting Features #########################
#### Prepare a unigram feature vector based on the presence or absence of words######### 

In [5]:
def get_unigram_features(data,vocab):
    fet_vec_all = []
    for tup in data:
        single_feat_vec = []
        sent = tup[0].lower() #lowercasing the dataset
        for v in vocab:
            if sent.__contains__(v):
                single_feat_vec.append(1)
            else:
                single_feat_vec.append(0)
        fet_vec_all.append(single_feat_vec)
    return fet_vec_all

#### Add sentiment scores from sentiwordnet, here we take the average sentiment scores of all words 

In [6]:
def get_senti_wordnet_features(data):
    fet_vec_all = []
    for tup in data:
        sent = tup[0].lower()
        words = sent.split()
        pos_score = 0
        neg_score = 0
        for w in words:
            senti_synsets = swn.senti_synsets(w.lower())
            for senti_synset in senti_synsets:
                p = senti_synset.pos_score()
                n = senti_synset.neg_score()
                pos_score+=p
                neg_score+=n
                break #take only the first synset (Most frequent sense)
        fet_vec_all.append([float(pos_score),float(neg_score)])
    return fet_vec_all

#### Merge the two scores ####

In [7]:
def merge_features(featureList1,featureList2):
    # For merging two features
    if featureList1==[]:
        return featureList2
    merged = []
    for i in range(len(featureList1)):
        m = featureList1[i]+featureList2[i]
        merged.append(m)
    return merged

In [8]:
#extract the sentiment labels by making positive reviews as class 1 and negative reviews as class 2
def get_lables(data):
    labels = []
    for tup in data:
        if tup[1].lower()=="neg":
            labels.append(-1)
        else:
            labels.append(1)
    return labels

In [9]:
def calculate_precision(prediction, actual):
    prediction = list(prediction)
#     correct_labels = [1  if test_gold_labels[i] == predictions[i] else 0 for i in range(len(predictions))]
    correct_labels = [predictions[i]  for i in range(len(predictions)) if actual[i] == predictions[i]]
    precision = float(len(correct_labels))/float(len(prediction))
    return precision

In [15]:
def real_time_test(classifier,vocab):
    print("Enter a sentence: ")
    inp = input()
    inp = [[inp,0]] #Assign a dummy label 0 and format the input data
    print(inp)
    print("\n")
    feat_vec_uni = get_unigram_features(inp,vocab)
    feat_vec_swn =get_senti_wordnet_features(test_data)
    feat_vec = merge_features(feat_vec_uni, feat_vec_swn)

    predict = classifier.predict(feat_vec)
    if predict[0]==1:
        print("The sentiment expressed is: positive")
    else:
        print("The sentiment expressed is: negative")
    
#For naive bayes classifier, uncomment the next two lines
#     predict = classifier.predict_proba(feat_vec)
#     print(predict)

#For SVM classifier, uncomment the next two lines
#     predict = classifier.decision_function(feat_vec)
#     print(predict)

################# Training and Evaluation #######################
#### Preparing training and test tuples
#### The feature_vecor set looks like [featurevector1, featurevector2,...,featurevectorN] where each featurevectorX is a list
#### The label set looks like [label1,label2,...,labelN]

In [11]:
training_unigram_features = get_unigram_features(training_data,vocabulary) # vocabulary extracted in the beginning
training_swn_features = get_senti_wordnet_features(training_data)

training_features = merge_features(training_unigram_features,training_swn_features)

training_labels = get_lables(training_data)

test_unigram_features = get_unigram_features(test_data,vocabulary)
test_swn_features=get_senti_wordnet_features(test_data)
test_features= merge_features(test_unigram_features,test_swn_features)

test_gold_labels = get_lables(test_data)

In [16]:
# Naive Bayes Classifier 
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(training_features,training_labels) #training process
predictions = classifier.predict(test_features)
# print("Prediciton of NB classifier is ")
# print(predictions)
print("Precision of NB classifier is")
precision = calculate_precision(predictions,test_gold_labels)
print(precision)
#Real time tesing
real_time_test(classifier,vocabulary)

Precision of NB classifier is
0.775
Enter a sentence: 
I love movie because it is awesome.
[['I love movie because it is awesome.', 0]]


The sentiment expressed is: positive


In [13]:
# SVM Classifier
#Refer to : http://scikit-learn.org/stable/modules/svm.html
from sklearn.svm import LinearSVC
classifier = LinearSVC(penalty='l2', C=0.01).fit(training_features,training_labels)
predictions = classifier.predict(training_features)
# print("Prediciton of linear SVM classifier is: ")
# print(predictions)
print("Precision of linear SVM classifier is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

# print(classifier.C)

#Real time tesing
# real_time_test(classifier,vocabulary)
# print(len(training_features[0]))

Precision of linear SVM classifier is:
Training data	0.9584415584415584
Test data	0.775


In [14]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state=0).fit(training_features,training_labels)
predictions = classifier.predict(training_features)
# print("Prediciton of Decision Tree classifier is: ")
# print(predictions)
print("Precision of Decision Tree classifier is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))
# Real time tesing
# real_time_test(classifier,vocabulary)

Precision of Decision Tree classifier is:
Training data	1.0
Test data	0.7
