In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
# Load the raw tweets. feature_extraction reads the 'text' and
# 'airline_sentiment' columns of this frame.
df_tweets = pd.read_csv('Tweets.csv')

In [3]:
def dataset_split_stratified(df_tweets, train_frac=0.8, random_state=None):
    '''
    Shuffle the data and split it into train and test sets class by class, so that
    both sets keep (up to rounding) the same class ratio as the input.

    Only rows whose ``label`` is 'positive', 'negative' or 'neutral' are kept; rows
    carrying any other label end up in neither set.

    Arguments
    ---------
    df_tweets : dataframe
        A dataframe containing data with labels to be split. It must have a
        ``label`` column. The input frame itself is not modified.
    train_frac : float, optional
        Fraction of each class that goes to the train set (default 0.8). The
        per-class train size is truncated with ``int``.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        ``None`` (default) gives a different shuffle on every call.

    Returns
    -------
    df_tweets_train : dataframe
        The dataframe containing train data, ordered positive, negative, neutral.
    df_tweets_test : dataframe
        The dataframe containing test data, in the same class order.
        Both frames keep the row positions of the shuffled frame as their index.

    Raises
    ------
    ValueError
        If ``train_frac`` is not within [0, 1].
    '''
    if not 0 <= train_frac <= 1:
        raise ValueError('train_frac must be between 0 and 1')

    shuffled = df_tweets.sample(frac=1, random_state=random_state).reset_index(drop=True)

    train_parts, test_parts = [], []
    for sentiment in ('positive', 'negative', 'neutral'):
        class_rows = shuffled[shuffled['label'] == sentiment]
        n_train = int(train_frac * len(class_rows))
        train_parts.append(class_rows.iloc[:n_train])
        test_parts.append(class_rows.iloc[n_train:])

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    df_tweets_train = pd.concat(train_parts)
    df_tweets_test = pd.concat(test_parts)

    return df_tweets_train, df_tweets_test

In [4]:
def preprocessing(tweet):
    '''
    Turn a raw tweet into a list of cleaned, lower-case words.

    Steps, in order:
    1. drop every non-ASCII character (this is what removes emojis);
    2. lower-case the text;
    3. replace punctuation characters and digits with a space;
    4. split on whitespace;
    5. keep only words that are longer than 3 characters, are not stopwords
       and do not contain 'http'.

    Arguments
    ---------
    tweet : str
        A string tweet.

    Returns
    -------
    tweet_out : list
          A list containing words from the tweet after preprocessing.

    '''
    # Raw string: the backslash is one of the characters to strip, not the
    # start of an escape sequence.
    punctuation_numbers = r'''!()-[]{};:'"\|,<>./?@#$%^&*+=_—~0123456789'''
    to_space = str.maketrans(punctuation_numbers, ' ' * len(punctuation_numbers))

    # A set gives constant-time membership tests. Entries of three characters
    # or fewer are already excluded by the length filter below; they are kept
    # so the list stays identical to the original.
    stopwords = {'a', 'an', 'the', 'who', 'what', 'when', 'where', 'has', 'is', 'was', 'and', 'they', 'we', 'us', 'im', 'them', 'it', 'i', 'u', 'st', 'bag', 'book', 'my', 'her', 'him', 'have', 'had', 'on', 'me', 'wifi', 'ive', 'in', 'gotten', 'httptcohovuaisg', 'if', 'jfk', 'la', 'phx', 'turkish', 'cs', 'cxld', 'tkt', 'tues', 'logan', 'gong', 'hm', 'wu', 'syr', 'id', 'ri', 'sm', 'ees', 'yr', 'bw', 'bf', 'bcs', 'resched', 'abq', 'thatthis', 'ty'}

    tweet_lower = tweet.encode('ascii', 'ignore').decode('ascii').lower()
    tweet_bow = tweet_lower.translate(to_space).split()

    tweet_out = [
        word for word in tweet_bow
        if word not in stopwords and len(word) > 3 and 'http' not in word
    ]
    return tweet_out

In [5]:
def feature_extraction(df_tweets, filename, min_count=5):
    '''
    Build a bag-of-words representation for all tweets and save it as CSV.

    Every tweet is preprocessed once. Words whose total count over all tweets is
    below ``min_count`` are discarded; each remaining word becomes a column holding
    its number of occurrences in each tweet. Columns are ordered from the most to
    the least frequent word, followed by a ``label`` column.

    Arguments
    ---------
    df_tweets : dataframe
        An input dataframe containing all tweets. The ``text`` column supplies the
        tweets and the ``airline_sentiment`` column supplies the labels.
    filename : str
        A csv file name where features are stored. The file is written with the
        row index, so it gains an unnamed first column when read back.
    min_count : int, optional
        Minimum total number of occurrences a word needs to be kept (default 5).

    Returns
    -------
    df_features : dataframe
         A dataframe containing the bag of words for all tweets, one row per
         tweet in input order, indexed 0..n-1.

    '''
    # Tokenise each tweet exactly once; both the global counts and the per-tweet
    # counts are derived from this list.
    tokenised = [preprocessing(text) for text in df_tweets['text']]

    # count occurrences of all words
    word_frequency = Counter(word for tokens in tokenised for word in tokens)
    words_list = [word for word, count in word_frequency.most_common() if count >= min_count]

    per_tweet_counts = [Counter(tokens) for tokens in tokenised]
    # Counter returns 0 for a word the tweet does not contain.
    words_dict = {word: [counts[word] for counts in per_tweet_counts] for word in words_list}

    df_features = pd.DataFrame(words_dict, index=pd.RangeIndex(len(tokenised)))
    # Assign labels by position so the result does not depend on df_tweets' index.
    # NOTE: a vocabulary word spelled 'label' would be overwritten by this column.
    df_features['label'] = np.array(df_tweets['airline_sentiment'])
    df_features.to_csv(filename)
    return df_features

In [6]:
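# One-off step, left disabled: uncomment to rebuild 'feature_tweets.csv' from the raw
# tweets. The next cell loads that cached file instead of recomputing the features.
# features = feature_extraction(df_tweets,'feature_tweets.csv')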

In [7]:
# Load the cached bag-of-words features and discard the 'Unnamed: 0' column
# (the row index that to_csv stored alongside the data).
features = pd.read_csv('feature_tweets.csv').drop('Unnamed: 0', axis=1)

In [8]:
# Preview the first rows: one column per vocabulary word plus the label.
features.head()

Unnamed: 0,united,flight,usairways,americanair,southwestair,jetblue,your,that,with,this,...,gain,inappropriate,selling,paypal,remains,emailing,neptune,greatservice,printed,label
0,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,neutral
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
3,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,negative
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,negative


In [9]:
# Split each sentiment class separately into train and test parts.
# NOTE: dataset_split_stratified shuffles without a fixed seed here, so the rows
# in each split (and every metric computed from them) change between runs.
features_train, features_test = dataset_split_stratified(features)

In [10]:
# Inspect the training split; its index has not been renumbered yet.
features_train.head()

Unnamed: 0,united,flight,usairways,americanair,southwestair,jetblue,your,that,with,this,...,gain,inappropriate,selling,paypal,remains,emailing,neptune,greatservice,printed,label
16,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
31,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
49,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
67,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
69,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,positive


In [11]:
# Renumber both splits from 0, discarding the index left over from the split.
features_train = features_train.reset_index(drop=True)
features_test = features_test.reset_index(drop=True)

In [12]:
# Confirm that the training index now starts at 0.
features_train.head()

Unnamed: 0,united,flight,usairways,americanair,southwestair,jetblue,your,that,with,this,...,gain,inappropriate,selling,paypal,remains,emailing,neptune,greatservice,printed,label
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
4,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,positive


In [13]:
def train_naive_bayes(features_train):
    """
    Fit a multinomial Naive Bayes model on bag-of-words training data.

    For each class (in the order positive, negative, neutral) this computes the
    log prior and the add-one smoothed log likelihood of every vocabulary word.

    Arguments
    ---------
    features_train : dataframe
        The bag of words representation for training data, with one column per
        word and a 'label' column.

    Returns
    -------
    log_prior : array
        The log of priors for each class, shape (3,).

    log_likelihood : array
        The log of likelihood probabilities for all words of each class,
        shape (3, vocabulary size).

    vocab : list
        vocabulary of all words (the feature column names).

    """
    vocab = list(features_train.drop('label', axis=1).columns)
    vocab_size = len(vocab)
    n_documents = len(features_train)

    class_priors = []
    class_likelihoods = []
    for sentiment in ('positive', 'negative', 'neutral'):
        class_rows = features_train[features_train.label == sentiment].drop('label', axis=1)

        # share of training rows belonging to this class
        class_priors.append(np.log(len(class_rows) / n_documents))

        # add-one smoothing: (count of word + 1) / (count of all words + |V|)
        word_totals = class_rows.sum()
        smoothed = (np.array(word_totals) + 1) / (sum(word_totals) + vocab_size)
        class_likelihoods.append(list(np.log(smoothed)))

    log_prior = np.array(class_priors)
    log_likelihood = np.array(class_likelihoods)

    return log_prior, log_likelihood, vocab

In [27]:
def predict(x, log_likelihood, log_prior):
    """
    Predicts the class of a particular test instance.

    The score of each class is its log prior plus the sum over all words of
    (word count * log likelihood of the word in that class). The number of
    classes is taken from the model arrays rather than being fixed.

    Arguments
    ---------
    x : array
        bag of words representation for a test tweet, shape (n_words,).
    log_likelihood : array
        The log of likelihood probabilities for all words of each class,
        shape (n_classes, n_words).
    log_prior : array
        The log of priors for each class, shape (n_classes,).

    Returns
    -------
    predicted_class : int
        The class predicted by Naive Bayes for a particular tweet, numbered
        from 1 in the row order of ``log_likelihood``. Ties go to the lowest
        class number.

    """
    word_counts = np.asarray(x, dtype=float)
    # One matrix-vector product scores every class at once.
    posterior = np.asarray(log_likelihood) @ word_counts + np.asarray(log_prior)
    predicted_class = int(np.argmax(posterior)) + 1
    return predicted_class

In [28]:
def test_naive_bayes(X, log_likelihood, log_prior):
    """
    Runs the Naive Bayes classifier over every test instance by calling
    predict once per row.

    Arguments
    ---------
    X : 2d-array
        The input test instances of shape (m,n).
    log_likelihood : array
        The log of likelihood probabilities for all words of each class.
    log_prior : array
        The log of priors for each class.

    Returns
    -------
    Y_predict : list
        The class predicted by Naive Bayes for each of the m test instances,
        in row order.

    """
    n_instances = X.shape[0]
    Y_predict = [
        predict(X[row, :], log_likelihood, log_prior)
        for row in range(n_instances)
    ]
    return Y_predict

In [29]:
def decode(Y):
    '''
    Converts sentiment label strings into integer class codes.

    Arguments
    ---------
    Y : array
        The labels to convert, indexable by position 0..len(Y)-1. Each entry is
        compared against the strings 'positive' and 'negative'.

    Returns
    -------
    Y_d : list
          Decoded values . 1 for "positive" class, 2 for "negative" and 3 for
          anything else (which includes "neutral").

    '''
    Y_d = [
        1 if Y[pos] == 'positive' else 2 if Y[pos] == 'negative' else 3
        for pos in range(len(Y))
    ]
    return Y_d

In [30]:
def confusion_matrix(actual_values, predicted_values):
    '''
    Builds the 3x3 confusion matrix used for classification evaluation.

    Rows are indexed by the predicted class and columns by the actual class,
    i.e. an instance with actual class a and predicted class p is counted in
    cell [p-1, a-1]. Pairs in which either value is not 1, 2 or 3 are not counted.

    Arguments
    ---------
    actual_values : array
        The actual decoded labels of test data: 1 for "positive", 2 for "negative" and 3 for "neutral".

    predicted_values : array
        The predicted decoded labels of test data, same length and coding.

    Returns
    -------
    conf_matrix : 2d array
        The confusion matrix, a float array of shape (3, 3).

    '''
    class_codes = (1, 2, 3)
    conf_matrix = np.zeros((3, 3))
    for pos in range(len(actual_values)):
        actual = actual_values[pos]
        predicted = predicted_values[pos]
        if actual in class_codes and predicted in class_codes:
            conf_matrix[int(predicted) - 1, int(actual) - 1] += 1
    return conf_matrix

In [31]:
def _safe_ratio(numerator, denominator):
    # Element-wise numerator / denominator, giving 0.0 wherever the denominator is 0.
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = numerator / denominator
    return np.where(denominator != 0, ratio, 0.0)


def classification_report(conf_matrix):
    '''
    Generates micro and macro average scores for classification evaluation using the confusion matrix.

    The matrix is read with rows as predicted classes and columns as actual
    classes: for class c, the rest of row c are false positives and the rest of
    column c are false negatives. Any square matrix is accepted, not only 3x3.

    A ratio whose denominator is zero (for example the precision of a class
    that was never predicted) is reported as 0.0 instead of NaN.

    Arguments
    ---------
    conf_matrix : 2d array
        The confusion matrix, shape (k, k).

    Returns
    -------
    classification_report : dataframe
        A dataframe containing micro and macro average precision, recall, accuracy and F1-scores.
        Rows are 'macro_average' and 'micro_avg'. 'accuracy' is derived from the
        per-class one-vs-rest counts ((tp + tn) / all), so it is not the plain
        fraction of correctly classified instances.

    Raises
    ------
    ValueError
        If ``conf_matrix`` is not a non-empty square 2d array.

    '''
    matrix = np.asarray(conf_matrix, dtype=float)
    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1] or matrix.shape[0] == 0:
        raise ValueError('conf_matrix must be a non-empty square 2d array')

    # Per-class one-vs-rest counts, one entry per class.
    tp = np.diag(matrix)
    fp = matrix.sum(axis=1) - tp   # predicted as the class, actually another
    fn = matrix.sum(axis=0) - tp   # actually the class, predicted as another
    tn = matrix.sum() - tp - fp - fn

    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    accuracy = _safe_ratio(tp + tn, tp + fp + tn + fn)
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)

    # Macro: average the per-class scores.
    macro_row = [precision.mean(), recall.mean(), accuracy.mean(), f1_score.mean()]

    # Micro: pool the counts of all classes, then compute each score once.
    micro_prec = float(_safe_ratio(tp.sum(), tp.sum() + fp.sum()))
    micro_recall = float(_safe_ratio(tp.sum(), tp.sum() + fn.sum()))
    micro_acc = float(_safe_ratio(tp.sum() + tn.sum(), (tp + fp + tn + fn).sum()))
    micro_F1 = float(_safe_ratio(2 * micro_prec * micro_recall, micro_prec + micro_recall))
    micro_row = [micro_prec, micro_recall, micro_acc, micro_F1]

    data = np.array([macro_row, micro_row])
    report = pd.DataFrame(data, columns = ['precision', 'recall', 'accuracy', 'F1_score'], index=['macro_average', 'micro_avg'])
    return report

# Training

In [32]:
# Fit the model on the training split: per-class log priors, per-class smoothed
# log word likelihoods, and the vocabulary (the feature column names).
log_prior, log_likelihood, vocab = train_naive_bayes(features_train)

# Testing

In [33]:
# Word-count matrix of the test split, shape (m, vocabulary size).
X_test = np.array(features_test.drop(['label'], axis=1))
# Select the column with single brackets so the labels form a 1-D vector of
# strings. Double brackets would give an (m, 1) array, and decode would then
# compare one-element arrays against the label strings.
Y_test = np.array(features_test['label'])

In [34]:
# Predict a class code (1, 2 or 3) for every row of the test matrix.
Y_predict = test_naive_bayes(X_test, log_likelihood, log_prior)

In [35]:
# Convert the label strings to the integer codes used by the predictions.
# NOTE: this rebinds Y_test, so the cell is not idempotent. Running it a second
# time feeds integers to decode, which maps everything that is not
# 'positive'/'negative' to 3. Re-run the cell that builds Y_test first.
Y_test = decode(Y_test)

In [36]:
# Count (actual, predicted) pairs: rows are predicted classes, columns are
# actual classes.
conf_matrix = confusion_matrix(Y_test, Y_predict)

In [37]:
# Display the matrix; class order along both axes is positive, negative, neutral.
conf_matrix

array([[ 322.,   77.,   62.],
       [  91., 1593.,  231.],
       [  60.,  166.,  327.]])

In [38]:
# Summarise the confusion matrix as macro- and micro-averaged scores.
cf_report = classification_report(conf_matrix)

In [39]:
# Display the precision / recall / accuracy / F1 table.
cf_report

Unnamed: 0,precision,recall,accuracy,F1_score
macro_average,0.707218,0.691943,0.843633,0.698809
micro_avg,0.765449,0.765449,0.843633,0.765449
