In [32]:
import numpy as np
import pandas as pd
from collections import Counter

In [33]:
df_tweets = pd.read_csv('Tweets.csv')

In [34]:
def dataset_split_stratified(df_tweets):
    '''
    This function performs stratified splitting i.e ratio of classes in train and test data is kept the same while spitting.
    
    Arguments
    ---------
    df_tweets : dataframe
        A dataframe containing data with labels to be split.
    
    Returns
    -------
    df_tweets_train : dataframe
        the dataframe containing train data
     df_tweets_test : dataframe
        the dataframe containing test data.

    '''
    df_tweets = df_tweets.sample(frac=1)
    df_tweets.reset_index(inplace=True, drop=True)
    df_tweets_pos = df_tweets[df_tweets.label=='positive']
    df_tweets_neg = df_tweets[df_tweets.label=='negative']
    df_tweets_neutral = df_tweets[df_tweets.label=='neutral']
    pos_count = len(df_tweets_pos)
    neg_count = len(df_tweets_neg)
    neutral_count = len(df_tweets_neutral)
    
    pos_train = int(0.8 * pos_count)
    neg_train = int(0.8 *neg_count)
    neutral_train = int(0.8 * neutral_count)
    
    df_tweets_pos_train = df_tweets_pos.iloc[0:pos_train]
    df_tweets_pos_test = df_tweets_pos.iloc[pos_train:pos_count]
    df_tweets_neg_train = df_tweets_neg.iloc[0:neg_train]
    df_tweets_neg_test = df_tweets_neg.iloc[neg_train:neg_count]
    df_tweets_neutral_train = df_tweets_neutral.iloc[0:neutral_train]
    df_tweets_neutral_test = df_tweets_neutral.iloc[neutral_train:neutral_count]
    
    df_tweets_train = df_tweets_pos_train.append(df_tweets_neg_train).append(df_tweets_neutral_train)
    df_tweets_test = df_tweets_pos_test.append(df_tweets_neg_test).append(df_tweets_neutral_test)
    
    return df_tweets_train, df_tweets_test

In [35]:
def preprocessing(tweet):
    '''
    This function performs preprocessing like removing punctuation, emojis, stopwords and words whose
    count is less than 3 since they are either joining words or are useless.
    Arguments
    ---------
    tweets : str
        A string tweet.
    
    Returns
    -------
    tweet_out : list
          A list containing words from the tweet after preprocessing.

    '''
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')
    tweet_lower = tweet.lower()
    punctuation_numbers = '''!()-[]{};:'"\|,<>./?@#$%^&*+=_—~0123456789'''
    for x in tweet_lower:
        if x in punctuation_numbers: 
            tweet_lower = tweet_lower.replace(x, " ")  #remove punctuation
    tweet_bow = tweet_lower.split()
     # remove stopwords
    stopwords = ['a', 'an', 'the', 'who', 'what', 'when', 'where', 'has', 'is', 'was', 'and', 'they', 'we', 'us', 'im', 'them', 'it', 'i', 'u', 'st', 'bag', 'book', 'my', 'her', 'him', 'have', 'had', 'on', 'me', 'wifi', 'ive', 'in', 'gotten', 'httptcohovuaisg', 'if', 'jfk', 'la', 'phx', 'turkish', 'cs', 'cxld', 'tkt', 'tues', 'logan', 'gong', 'hm', 'wu', 'syr', 'id', 'ri', 'sm', 'ees', 'yr', 'bw', 'bf', 'bcs', 'resched', 'abq', 'thatthis', 'ty']
    tweet_out = []
    for word in tweet_bow:
        if (word not in stopwords) and (len(word)>3) and ('http' not in word) :
            tweet_bow.append(word)
        
    return tweet_bow

In [36]:
def feature_extraction(df_tweets, filename):
    '''
    This function counts occurences of all words and removes words whose count is less than 3. It then creates a bag of words
    representation for all tweets.
    Arguments
    ---------
    df_tweets: dataframe
        An input dataframe containing all tweets.
    filename : str
        A csv file name where features are stored.
    
    Returns
    -------
    df_features : dataframe
         A dataframe containg the bag of words for all tweets.

    '''
    all_words, words_list = [], []
    words_dict = {}
    for i in range(0, len(df_tweets)):                        
        tweet_bow = preprocessing(df_tweets.loc[i,'text'])
        for word in tweet_bow:
            all_words.append(word)
    word_frequency = Counter(all_words).most_common()      # count occurences of all words
    for i in range(0, len(word_frequency)):
        if word_frequency[i][1] >= 5:                 #remove words which appear in less than 5 tweets or whose count is less than 10
            words_list.append(word_frequency[i][0])
          
    words_dict = {k: [] for k in words_list}
    
    for word in words_list:
        for i in range(0, len(df_tweets)):
            words_dict[word].append(preprocessing(df_tweets.loc[i, "text"]).count(word))
    df_features = pd.DataFrame(words_dict)
    df_features['label'] = df_tweets['airline_sentiment']
#     df_features.to_csv(filename)
    return df_features

In [37]:
# features = feature_extraction(df_tweets,'features.csv')

In [38]:
features = pd.read_csv('features.csv')
features.drop('Unnamed: 0', axis=1, inplace=True)

In [39]:
features.head()

Unnamed: 0,united,flight,usairways,americanair,southwestair,jetblue,your,that,with,this,...,gain,inappropriate,selling,paypal,remains,emailing,neptune,greatservice,printed,label
0,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,neutral
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
3,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,negative
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,negative


In [40]:
features_train, features_test = dataset_split_stratified(features)

In [41]:
features_train.head()

Unnamed: 0,united,flight,usairways,americanair,southwestair,jetblue,your,that,with,this,...,gain,inappropriate,selling,paypal,remains,emailing,neptune,greatservice,printed,label
2,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,positive
7,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
8,0,0,0,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,positive
10,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
14,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive


In [42]:
features_train.reset_index(inplace=True, drop=True)
features_test.reset_index(inplace=True, drop=True)

In [43]:
features_train.head()

Unnamed: 0,united,flight,usairways,americanair,southwestair,jetblue,your,that,with,this,...,gain,inappropriate,selling,paypal,remains,emailing,neptune,greatservice,printed,label
0,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,positive
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
2,0,0,0,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,positive
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,positive


In [44]:
def one_hot_encoding(data):
    '''
    This function does one hot encoding for data.
    Arguments
    ---------
    data : dataframe
        The dataframe containg data whose labels are to be encoded.
    
    Returns
    -------
    data : dataframe
         input data frame with encoded variables added.

    '''
    data['positive'] = np.zeros(len(data))
    data['negative'] = np.zeros(len(data))
    data['neutral'] = np.zeros(len(data))
    for i in range(0, len(data)):
        if data['label'][i] == 'positive':
            data['positive'][i] = 1
        elif data['label'][i] == 'negative':
            data['negative'][i] = 1
        else:
            data['neutral'][i] = 1
    data.drop('label', axis=1, inplace=True)
    return data
            

In [45]:
features_train = one_hot_encoding(features_train)
features_test = one_hot_encoding(features_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [46]:
features_train.head(10)

Unnamed: 0,united,flight,usairways,americanair,southwestair,jetblue,your,that,with,this,...,inappropriate,selling,paypal,remains,emailing,neptune,greatservice,printed,negative,neutral
0,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0.0,0.0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
2,0,0,0,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0.0,0.0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
6,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
8,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
9,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0.0,0.0


In [47]:
def softmax(x, theta1, theta2, theta3):
    '''
    Function to calculate softmax of three hypotheses.
    Arguments
    ---------
    x : array
        A training example of shape n.
        
    theta1 : array
        The weights vector of shape (n+1,) for class 1.
        
    theta2 : array
        The weights vector of shape (n+1,) for class 2.
        
    theta3 : array
        The weights vector of shape (n+1,) for class 3.
    
    Returns
    -------
    h_x : list
          A list containing the softmax of hypothesis of each class.

    '''
    h_x = list()
    x = np.append(1,x)
    z1 = sum(theta1 * x)
    z2 = sum(theta2 * x)
    z3 = sum(theta3 * x)
    sum_exp = np.exp(z1) + np.exp(z2) + np.exp(z3)
    h1_x = np.exp(z1) / sum_exp
    h2_x = np.exp(z2) / sum_exp
    h3_x = np.exp(z3) / sum_exp
    h_x = [h1_x, h2_x, h3_x]
    return h_x

In [48]:
def cross_entropy_loss(X, Y, theta1, theta2, theta3):
    '''
    This function calculates the cross-entropy-loss for multinomial logistic regression.
    
    Arguments
    ---------
    X : 2d-array
        The input dataset of shape (m, n), where m is the number of training examples and n is the number of features.
    
    Y : array
        The values of the function at each data point. This is a vector of
        shape (m, k), where m is the number of training examples and k is the number of categories.
    
    theta1 : array
        The weights vector of shape (n+1,) for class 1.
        
    theta2 : array
        The weights vector of shape (n+1,) for class 2.
        
    theta3 : array
        The weights vector of shape (n+1,) for class 3.
    
    Returns
    -------
    J : array
        The value of multinomial the logistic regression cost function for each training example.

    '''
    # initialize some useful values
    m = X.shape[0] 
    n = X.shape[1] #number of features
    k = Y.shape[1]
    # You need to return the following variable(s) correctly
    J = 0
 
    ### START CODE HERE ### (≈ 3-4 lines of code)
    
    hx = np.array([softmax(X[i,:], theta1, theta2, theta3) for i in range(0, m)])
    for i in range(0, k):
         J = J + ((-1/m)*sum(Y[:,i] * np.log(np.array(hx[:,i]))))
    return J

In [49]:
def batch_gradient_descent(X, Y, alpha, n_epoch):
    """
    Performs batch gradient descent to learn thetas. Updates thetas for each class simultaneously,  by taking `n_epoch`
    gradient steps with learning rate `alpha`.
    
    Arguments
    ---------
    X : 2d-array
        The input dataset of shape (m, n), where m is the number of training examples and n is the number of features.
    
    Y : array
        The values of the function at each data point. This is a vector of
        shape (m, k), where m is the number of training examples and k is the number of categories.
        
    alpha : float
        The learning rate.
    
    n_epoch : int
        The number of iterations for gradient descent. 
    
    Returns
    -------
    theta1 : array
        The weights vector of shape (n+1,) for class 1.
        
    theta2 : array
        The weights vector of shape (n+1,) for class 2.
        
    theta3 : array
        The weights vector of shape (n+1,) for class 3.
        
    J : list
        A python list for the values of the cost function after each iteration.
    
    """
    # initialize some useful values
    m = X.shape[0]  # number of training examples
    n = X.shape[1]
    J = list()  # list to store cost
    
    # You need to return the following variables correctly
    #, dtype=np.longdouble
    
    theta1 = np.zeros(n+1)
    theta2 = np.zeros(n+1)
    theta3 = np.zeros(n+1)
    X_1 = np.concatenate((np.ones((m,1)),X), axis=1)
    for epoch in range(n_epoch):
        ### START CODE HERE ### (≈ 5-10 lines of code)
        
        hx = np.array([softmax(X[i,:], theta1, theta2, theta3) for i in range(0, m)])
        for j in range(0, m, 32):
            for k in range(0, n+1):
            
                temp1 = (alpha/32) * sum((hx[j:j+32,0]-Y[j:j+32,0]) * X_1[j:j+32,k] )
                theta1[k] = theta1[k] - temp1
                
                temp2 = (alpha/32) * sum((hx[j:j+32,1]-Y[j:j+32,1]) * X_1[j:j+32,k] )
                theta2[k] = theta2[k] - temp2
                
                temp3 = (alpha/32) * sum((hx[j:j+32,2]-Y[j:j+32,2]) * X_1[j:j+32,k] )
                theta3[k] = theta3[k] - temp3
        ### END CODE HERE ###
        J.append(cross_entropy_loss(X, Y, theta1, theta2, theta3))
    return theta1, theta2, theta3, J

In [50]:
def predict(X, theta1, theta2, theta3):
    '''
    Function which selects the most probable class by taking the max of probabilities.
    Arguments
    ---------
    X : 2d-array
        The test dataset of shape (m, n), where m is the number of instances and n is the number of features.
    
    theta1 : array
        The weights vector of shape (n+1,) for class 1.
        
    theta2 : array
        The weights vector of shape (n+1,) for class 2.
        
    theta3 : array
        The weights vector of shape (n+1,) for class 3.
    
    Returns
    -------
    Y : array
        The predicted values of the function at each data point. This is a vector of
        shape (m, k), where m is the number of instances and k is the number of categories.
    

    '''
    m = X.shape[0]
    Y_predict = list()
    for i in range(0, m):
        h_x = softmax(X[i, :], theta1, theta2, theta3)
        max_arg = np.argmax(np.array(h_x))
        if max_arg == 0:
            Y_predict.append([1, 0, 0])
        elif max_arg == 1:
            Y_predict.append([0, 1, 0])
        else:
            Y_predict.append([0, 0, 1])
    Y_predict = np.array(Y_predict)
    return Y_predict

In [51]:
def one_hot_decode(Y):
    '''
    This function decodes the labels.
    Arguments
    ---------
    Y : array
        The values of the function at each data point. This is a vector of
        shape (m, k), where m is the number of training examples and k is the number of categories.
    
    Returns
    -------
    Y_d : list
          Decoded values . 1 for "positive" class, 2 for "negative" and 3 for "neutral".

    '''
    Y_d = []
    for i in range(0, len(Y)):
        if Y[i][0] == 1:
            Y_d.append(1)
        elif Y[i][1] == 1:
            Y_d.append(2)
        else:
            Y_d.append(3)
    return Y_d

In [52]:
def confusion_matrix(actual_values, predicted_values):
    '''
    Generates confusion matrix for classification evaluation.
    Arguments
    ---------
    actual_values : array
        The actual decoded labels of test data: 1 for "positive", 2 for "negative" and 3 for "neutral".
        
    predicted_values : array
        The predicted decoded labels of test data.
        
    Returns
    -------
    conf_matrix : 2d array
        The confusion matrix
    

    '''
    conf_matrix = np.zeros((3,3))
    for i in range(0, len(actual_values)):
        if actual_values[i] == predicted_values[i]:
            if actual_values[i] == 1:
                conf_matrix[0,0] = conf_matrix[0,0] + 1
            if actual_values[i] == 2:
                conf_matrix[1,1] = conf_matrix[1,1] + 1
            if actual_values[i] == 3:
                conf_matrix[2,2] = conf_matrix[2,2] + 1
        else:
            if actual_values[i] == 1 and predicted_values[i] == 2:
                conf_matrix[1,0] = conf_matrix[1,0] + 1
            if actual_values[i] == 1 and predicted_values[i] == 3:
                conf_matrix[2,0] = conf_matrix[2,0] + 1
            if actual_values[i] == 2 and predicted_values[i] == 1:
                conf_matrix[0,1] = conf_matrix[0,1] + 1
            if actual_values[i] == 2 and predicted_values[i] == 3:
                conf_matrix[2,1] = conf_matrix[2,1] + 1
            if actual_values[i] == 3 and predicted_values[i] == 1:
                conf_matrix[0,2] = conf_matrix[0,2] + 1
            if actual_values[i] == 3 and predicted_values[i] == 2:
                conf_matrix[1,2] = conf_matrix[1,2] + 1
    return conf_matrix
                
            
                

In [53]:
def classification_report(conf_matrix):
    '''
    Generates micro and macro average scores for classification evaluation using the confusion matrix.
    Arguments
    ---------
    conf_matrix : 2d array
        The confusion matrix
        
    Returns
    -------
    classification_report : dataframe
        A dataframe containing micro and macro average precision, recall, accuracy and F1-scores.
    

    '''
    tp_c1 = conf_matrix[0,0]
    fp_c1 = conf_matrix[0,1] + conf_matrix[0,2]
    fn_c1 = conf_matrix[1,0] + conf_matrix[2,0]
    tn_c1 = conf_matrix[1,1] + conf_matrix[1,2] + conf_matrix[2,1] + conf_matrix[2,2]
    precision_c1 = tp_c1 / (tp_c1 + fp_c1)
    recall_c1 = tp_c1 / (tp_c1 + fn_c1)
    acc_c1 = (tp_c1 + tn_c1) / (tp_c1 + fp_c1 + tn_c1 + fn_c1)
    F1_score_c1 = (2 * precision_c1 * recall_c1) / (precision_c1 + recall_c1)
    
    tp_c2 = conf_matrix[1,1]
    fp_c2 = conf_matrix[1,0] + conf_matrix[1,2]
    fn_c2 = conf_matrix[0,1] + conf_matrix[2,1]
    tn_c2 = conf_matrix[0,0] + conf_matrix[0,2] + conf_matrix[2,0] + conf_matrix[2,2]
    precision_c2 = tp_c2 / (tp_c2 + fp_c2)
    recall_c2 = tp_c2 / (tp_c2 + fn_c2)
    acc_c2 = (tp_c2 + tn_c2) / (tp_c2 + fp_c2 + tn_c2 + fn_c2)
    F1_score_c2 = (2 * precision_c2 * recall_c2) / (precision_c2 + recall_c2)
    
    tp_c3 = conf_matrix[2,2]
    fp_c3 = conf_matrix[2,0] + conf_matrix[2,1]
    fn_c3 = conf_matrix[0,2] + conf_matrix[1,2]
    tn_c3 = conf_matrix[0,0] + conf_matrix[0,1] + conf_matrix[1,0] + conf_matrix[1,1]
    precision_c3 = tp_c3 / (tp_c3 + fp_c3)
    recall_c3 = tp_c3 / (tp_c3 + fn_c3)
    acc_c3 = (tp_c3 + tn_c3) / (tp_c3 + fp_c3 + tn_c3 + fn_c3)
    F1_score_c3 = (2 * precision_c3 * recall_c3) / (precision_c3 + recall_c3)
    
    
    macro_prec = (precision_c1 + precision_c2 + precision_c3) / 3
    macro_recall = (recall_c1 + recall_c2 + recall_c3) / 3
    macro_acc = (acc_c1 + acc_c2 + acc_c3) / 3
    macro_F1 = (F1_score_c1 + F1_score_c2 + F1_score_c3) / 3
    
    micro_prec = (tp_c1 + tp_c2 + tp_c3) / (tp_c1 + tp_c2 + tp_c3 + fp_c1 + fp_c2 + fp_c3)
    micro_recall = (tp_c1 + tp_c2 + tp_c3) / (tp_c1 + tp_c2 + tp_c3 + fn_c1 + fn_c2 + fn_c3)
    micro_acc = (tp_c1 + tp_c2 + tp_c3 + tn_c1 + tn_c2 + tn_c3) / (tp_c1 + tp_c2 + tp_c3 + tn_c1 + tn_c2 + tn_c3 + fp_c1 + fp_c2 + fp_c3 + fn_c1 + fn_c2 + fn_c3)
    micro_F1 = (2 * micro_prec * micro_recall) / (micro_prec + micro_recall)
    
    data = np.array([[macro_prec, macro_recall, macro_acc, macro_F1], [micro_prec, micro_recall, micro_acc, micro_F1]])
    classification_report = pd.DataFrame(data, columns = ['precision', 'recall', 'accuracy', 'F1_score'], index=['macro_average', 'micro_avg'])
    return classification_report

# Training

In [54]:
X = np.array(features_train.drop(['positive', 'negative', 'neutral'], axis=1))
Y = np.array(features_train[['positive', 'negative', 'neutral']])

In [55]:
n_epoch1 = 100
alpha = 0.001
theta1, theta2, theta3, J1 = batch_gradient_descent(X ,Y, alpha, n_epoch1)
print('Predicted theta = {}, cost = {}' .format (theta1, J1[-1]))

Predicted theta = [-0.3472963  -0.19743009 -0.21150006 ... -0.00170982  0.00963477
 -0.00230686], cost = 0.7044805803526517


# Testing

In [63]:
X_test = np.array(features_test.drop(['positive', 'negative', 'neutral'], axis=1))
Y_test = np.array(features_test[['positive', 'negative', 'neutral']])

In [64]:
Y_hat = predict(X_test, theta1, theta2, theta3)

In [65]:
Y_hat_d = one_hot_decode(Y_hat)
Y_test_d = one_hot_decode(Y_test)

In [66]:
conf_matrix = confusion_matrix(Y_test_d, Y_hat_d)

In [67]:
conf_matrix

array([[ 141.,   14.,   13.],
       [ 295., 1780.,  470.],
       [  37.,   42.,  137.]])

In [68]:
cf_report = classification_report(conf_matrix)

In [69]:
cf_report

Unnamed: 0,precision,recall,accuracy,F1_score
macro_average,0.724319,0.496188,0.801753,0.526763
micro_avg,0.702629,0.702629,0.801753,0.702629
