In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Task 1 - Splitting the data into a training set and a test set.
# Task 3 - Selecting the columns 'text' and 'airline_sentiment'.

# Fixed seed: without it every run draws a different split, so the priors and the
# prediction rate reported further down cannot be reproduced on "Restart & Run All".
RANDOM_SEED = 42

data = pd.read_csv('tweets.csv')
# Only the tweet text and its labelled sentiment are used by the rest of the notebook.
data_x = data[['text', 'airline_sentiment']]
x_train, x_test = train_test_split(data_x, random_state=RANDOM_SEED)

In [3]:
# Task 2 - Building the vocabulary from the training tweets with a CountVectorizer.

# A second vectorizer with the same settings; it is created here but never fitted in this cell.
vector_test = CountVectorizer(stop_words='english')

# fit() returns the fitted vectorizer itself, so construction and fitting can be chained.
vectorizer = CountVectorizer(stop_words='english').fit(x_train.text)

# vocabulary_ maps each learned term to its feature (column) index.
# `vocabulary` is the very same dict object, not a copy.
vocabulary = vectorizer.vocabulary_

In [4]:
# Task 4 - Computing the prior probabilities: the share of tweets in the training set whose label is y_k,
# for k = 'negative', 'positive' and 'neutral'.

class PriorProbabilities:
    """Estimate the prior probability of each sentiment label.

    The prior of a label is the number of times it occurs in
    ``tweets.airline_sentiment`` divided by the total number of label tokens.

    Parameters
    ----------
    tweets : object with an ``airline_sentiment`` attribute
        An iterable of label strings is read from ``tweets.airline_sentiment``.

    Attributes
    ----------
    dictionary : dict
        Label -> occurrence count, filled by :meth:`dic`.
    score : int
        Total number of counted labels (set by :meth:`probability`).
    final_score : float
        The most recently computed prior (set by :meth:`probability`).
    """

    def __init__(self, tweets):
        self.tweets = tweets.airline_sentiment
        self.dictionary = {}
        self.score = 0
        self.final_score = 0

    def dic(self):
        """Count the occurrences of every label and return the count dict.

        The counts are rebuilt from scratch on every call, so calling this
        method more than once does not inflate them.
        """
        # clear() instead of rebinding keeps the same dict object alive for
        # anyone already holding a reference to self.dictionary.
        self.dictionary.clear()
        for line in self.tweets:
            # A label is split on whitespace; a single-word label yields one token.
            for word in line.split():
                self.dictionary[word] = self.dictionary.get(word, 0) + 1
        return self.dictionary

    def probability(self, x):
        """Return the prior probability of label ``x``.

        Returns 0.0 for a label that never occurs and when there are no
        labels at all. The counts are built on demand if :meth:`dic` has not
        been called yet.
        """
        if not self.dictionary:
            self.dic()
        self.score = sum(self.dictionary.values())
        if self.score == 0:
            # No labels at all: avoid dividing by zero.
            self.final_score = 0.0
        else:
            self.final_score = self.dictionary.get(x, 0) / self.score
        return self.final_score

# Count the training labels once, then read off the prior of each sentiment class.
prob = PriorProbabilities(x_train)
prob.dic()
positive_prob = prob.probability('positive')
negative_prob = prob.probability('negative')
neutral_prob = prob.probability('neutral')

# Each printed label names the class whose prior follows it. The priors are computed from
# one sentiment label per tweet, so they are probabilities of a tweet, not of a word.
print('The probability of a tweet being positive is:', positive_prob,
      '\nThe probability of a tweet being neutral is:', neutral_prob,
      '\nThe probability of a tweet being negative is:', negative_prob)

The probability of a word being positive is: 0.16083788706739527 
The probability of a word being positive is: 0.2098360655737705 
The probability of a word being positive is: 0.6293260473588342


In [5]:
# Task 5 (1/2) - Collecting, for each sentiment class, every vocabulary word that occurs in a
# training tweet of that class. These lists are the raw material for the likelihood estimates.

negative_list = []
positive_list = []
neutral_list = []

def check_sentiment():
    """Fill the three module-level word lists from the training set.

    Every row of ``x_train`` is read as (text, sentiment). The text is split on
    single spaces, and each token that is a key of ``vocabulary`` is appended to
    the list of the row's sentiment. Rows with any other sentiment value
    contribute nothing. The lists are appended to, never cleared, by this
    function.
    """
    buckets = {
        'negative': negative_list,
        'positive': positive_list,
        'neutral': neutral_list,
    }
    for row in x_train.values:
        tokens = row[0].split(' ')
        bucket = buckets.get(row[1])
        if bucket is not None:
            # NOTE(review): tokens are matched verbatim, while CountVectorizer
            # lowercases and strips punctuation by default - tokens such as
            # 'Thanks' or 'late!' presumably never match; confirm this is intended.
            bucket.extend(token for token in tokens if token in vocabulary)

check_sentiment()

In [6]:
# Using a function to track the amount of times a single word is in each given sentiment (positive, negative, neutral).

def iterate(vocabulary):
    """Count how many times each word occurs in ``vocabulary``.

    Parameters
    ----------
    vocabulary : iterable of hashable
        The words to count (here: the word list of one sentiment class).
        Note that this parameter shadows the module-level ``vocabulary``.

    Returns
    -------
    dict
        Word -> number of occurrences. A word seen once maps to 1.
    """
    dictionary = {}
    for word in vocabulary:
        # A single update per word: a first occurrence counts as 1, not 2.
        dictionary[word] = dictionary.get(word, 0) + 1

    return dictionary
 
# Task 8 (1/2) - Creating a function which adds all words not currently in a given class with a minimal probability.    

def failsafe():
    """Give every vocabulary word an entry in each per-class count dict.

    For each word of ``vocabulary`` that is missing from ``new_negative``,
    ``new_positive`` or ``new_neutral``, a value below 1 is inserted into that
    dict. The three dicts are modified in place; nothing is returned.
    """
    per_class_counts = (new_negative, new_positive, new_neutral)
    for word in vocabulary:
        for counts in per_class_counts:
            if word in counts:
                continue
            # len(counts) is read before the insertion, so the value shrinks
            # slightly with every word added to the same dict.
            counts[word] = 1/(len(counts) + len(vocabulary) + 1)
            

# Turn each per-class word list into a word -> count dict (same order: negative, positive, neutral).
new_negative, new_positive, new_neutral = map(
    iterate, (negative_list, positive_list, neutral_list)
)

# Fill in the vocabulary words a class never saw, so later lookups cannot miss.
failsafe()

In [7]:
# Task 5 (2/2) - Creating dictionaries that give, for each vocabulary word, the share of its
# occurrences that falls into each class.

negative_dictionary = {}
positive_dictionary = {}
neutral_dictionary = {}


for word in vocabulary:
    class_counts = (new_negative[word], new_positive[word], new_neutral[word])
    # Same denominator for all three classes, so the three shares of a word add up to 1.
    total = class_counts[0] + class_counts[1] + class_counts[2]
    targets = (negative_dictionary, positive_dictionary, neutral_dictionary)
    for target, count in zip(targets, class_counts):
        # setdefault never overwrites an entry that already exists.
        target.setdefault(word, count / total)


In [8]:
# Task 6 - Creating a classifier which calculates a score for each word belonging to each class, then combines the per-word scores to give the
# full tweet a label of either negative, positive or neutral.

class Classifier_Score():
    """Score one tokenised tweet and label it negative, positive or neutral.

    ``inp`` is the list of word tokens of the tweet. The three ``*_score``
    attributes accumulate the per-word scores while ``classifier`` runs.
    """

    def __init__(self, inp):
        self.inp = inp
        self.negative_score = 0
        self.positive_score = 0
        self.neutral_score = 0

    def classifier(self, fulltweet):
        """Classify the tweet and store the label in ``fulltweet``.

        The label is written to ``fulltweet`` under the key ``' '.join(self.inp)``.
        Words missing from ``vocabulary`` are added to it (and to the three
        per-class dictionaries) as a side effect. Nothing is returned.
        """
        for word in self.inp:
            if word not in vocabulary:
                # Task 8 (2/2) - Failsafe for words not currently in the vocabulary.
                # The word is registered with the marker value 0.1 and receives a small
                # value in every class; it adds nothing to the scores on this encounter.
                vocabulary[word] = 0.1
                negative_dictionary[word] = 1/(len(negative_dictionary) + len(vocabulary) + 1)
                neutral_dictionary[word] = 1/(len(neutral_dictionary) + len(vocabulary) + 1)
                positive_dictionary[word] = 1/(len(positive_dictionary) + len(vocabulary) + 1)
                continue

            # Bayes' theorem per word: prior * likelihood / evidence.
            evidence = negative_dictionary[word] + positive_dictionary[word] + neutral_dictionary[word]
            negative = (negative_prob * negative_dictionary[word]) / evidence
            positive = (positive_prob * positive_dictionary[word]) / evidence
            neutral = (neutral_prob * neutral_dictionary[word]) / evidence

            # Only the winning class of a word is credited; on a tie the order of
            # the checks decides: negative, then positive, then neutral.
            best = max([positive, negative, neutral])
            if best == negative:
                self.negative_score += negative
            elif best == positive:
                self.positive_score += positive
            elif best == neutral:
                self.neutral_score += neutral

        key = ' '.join(self.inp)
        top = max(self.positive_score, self.negative_score, self.neutral_score)
        if top == 0:
            # No word contributed anything: fall back to neutral.
            fulltweet[key] = 'neutral'
        elif top == self.negative_score:
            fulltweet[key] = 'negative'
        elif top == self.positive_score:
            fulltweet[key] = 'positive'
        elif top == self.neutral_score:
            fulltweet[key] = 'neutral'
        
        

In [9]:
# Task 7 (1/2) - Testing the classifier on the test set.

def test_classifier():
    test_set = {}
    for line in x_test.text:
        line = line.split(' ')
        node = Classifier_Score(line)
        node.classifier(test_set)
    
    return pd.DataFrame(test_set.items(), 
                        columns=['text', 'sent_predict'])

In [10]:
# Run the classifier over the test set, then renumber x_test from 0 so that both frames
# can be matched up row by row through their index.
x_tested = test_classifier()
x_test = x_test.reset_index(drop = True)

# Both frames carry a 'text' column, which the merge renames to 'text_x' / 'text_y';
# the copy coming from x_test ('text_y') is dropped again.
complete_predict = pd.merge(
    x_tested,
    x_test,
    left_index = True,
    right_index = True,
).drop(['text_y'], axis=1)

In [11]:
# Task 7 (2/2) - Measuring how often the Naive Bayes classifier predicts the labelled sentiment.

# In each row, position 1 is compared with position 2 (the predicted and the labelled
# sentiment after the merge); every match counts as one correct prediction.
score = sum(1 for line in complete_predict.values if line[1] == line[2])

# Share of correct predictions, as a whole-number percentage.
prediction_rate = round((score / len(complete_predict.values)) * 100)
print('Prediction rate of the Naive Bayes classifier is: ', prediction_rate, '%', sep='')

Prediction rate of the Naive Bayes classifier is: 62%


In [12]:
# Task 9 - Creating a function to test which class a given input is.

def class_tester():
    """Ask for a tweet, classify it and print an explanation of the result.

    The tweet is read with ``input()``, split on single spaces and passed to
    ``Classifier_Score``. The resulting {tweet: label} dict is printed, followed
    by an explanation and a list of the words that were not part of the
    vocabulary. Nothing is returned.
    """
    test_dictionary = {}
    tweet = input('Please insert your tweet here!')
    tweet = tweet.split(' ')
    complete_tweet = Classifier_Score(tweet)
    complete_tweet.classifier(test_dictionary)
    liste = []   # words the classifier had to add to the vocabulary
    count = 0    # words that were genuine vocabulary terms
    print(test_dictionary)

    # Task 10 - Prints an explanation of why a tweet is labeled in a certain way.

    for fulltweet in test_dictionary:
        tweet = fulltweet.split(' ')
        for word in tweet:
            entry = vocabulary.get(word)
            # The classifier marks words it had to add with the value 0.1, whereas
            # genuine terms map to a feature index that starts at 0. Testing for the
            # marker (instead of `>= 1`) keeps the term with index 0 a known word.
            if entry is None or entry == 0.1:
                liste.append(word)
            else:
                count += 1


        if count >= 1:
            print('The tweet \'' + str(fulltweet) + '\' is classified as', 
                          test_dictionary[fulltweet], 'because by using the Bayes algorithm we can input the prior probabilities of a tweet being', 
                          test_dictionary[fulltweet], 'then multiply it by the likelihood probabilities of the word in that tweet =', 
                          test_dictionary[fulltweet], 'and divide it by the evidence that is the probability of the word to exist in the sentence.')

        if len(liste) > 0:
            print('\nThe following word(s)', liste, 'is currently not in the vocabulary.', 
                  '\nTo avoid 0 probability, the word(s) has been calculated and added to the vocabulary with a probability of less than 1 occurance.', 
                  '\nAnd further calculate the word(s) to being either neutral, negative or positive based on recognized words in the tweet.\n')
            
        

In [13]:
# Task 11 - Explaining why two tweets are categorized as they are.
def example():
    """Print two fixed explanations: one correctly and one wrongly predicted tweet."""
    # Each tuple holds the pieces of one message; print() joins them with single spaces.
    correct_prediction = (
        'The tweet \'@united @dmb41shows I need this plane to get to buffalo so I can leave tonight.',
        'Any progress?\' is categorized as negative, and also predicted as such.',
        'This is because the words: [need, plane, buffalo, leave] are strong negative words in the classifier.',
        'The rest of the words are either stopwords or not recognized so they won\'t impact the classification\n',
    )
    wrong_prediction = (
        'The tweet \'@AmericanAir do you guys have wifi on international flights?\' is categorized as neutral, but predicted as negative.',
        'The reason for the classifier to think that this tweet is negative is because the words: [guys, wifi, international]',
        'are al categorized as negative words. However all the remaining words are either stop words or recognized by the vocabulary.',
        'Hence why, instead of it realizing that this is actually a neutral tweet, it categorizes it as a negative tweet.',
    )

    print(*correct_prediction)
    print(*wrong_prediction)

In [None]:
#Run this code to test any text you want!
#class_tester()

In [None]:
#Run this code to see why two tweets are either correctly or incorrectly predicted!
#example()