In [1]:
import numpy as np
import pickle
from scipy.optimize import minimize_scalar
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils.extmath import safe_sparse_dot
from time import strftime, gmtime

In [2]:
class DAN2Regressor(object):

    def __init__(self, depth=50, bounds=(0,5000000000)):
        self.bounds = bounds
        self.depth = depth
        self.lin_predictor = LinearRegression(fit_intercept=True) 
        self.coef_ = None
        self.name = strftime('dan2model-'+ str(depth) + '-%Y-%b-%d-%H-%M-%S', gmtime())
        #self.lin_predictions = None


    """ Layer activation """
    def f(self, x):
        
        f = self.f_k
        A = self.A
        alpha = self.alpha
        a = self.a
        rows = f.shape[0]
        ''' check if intercept term should be placed first'''
        #Xn = np.hstack((a, A[0]*f, A[1]*np.cos(alpha*x), A[2]*np.sin(alpha*x)))
        Xn = a + A[0]*f + A[1]*np.cos(alpha*x) + A[2]*np.sin(alpha*x)
        return np.sum(Xn)


    """ Method to get alpha column for DAN2 """
    def compute_alpha(self, X):
        cols = X.shape[1]

        """ Create resultant vector of ones """
        R = np.ones(cols)
        #print('R', R.shape)

        """ Compute dot product """
        X_dot_R = (1 + np.dot(X,R))
        #print('XdR', X_dot_R.shape)
        X_dot_R = X_dot_R.reshape((len(X),))
        #print('XdR', X_dot_R.shape)

        """ Compute X and R magnitudes """
        X_mag = np.sqrt(1*1 + np.sum(np.square(X), axis=1))
        R_mag = np.sqrt(np.sum(R**2) + 1*1)

        """ Compute arccosine """
        acos = np.arccos(X_dot_R / (X_mag * R_mag))
        #print('acos', acos.shape)

        return acos.reshape(len(acos),1) 


    """ Linear method """
    def lin_reg(self, X, y):
        self.model['lr'] = LinearRegression(fit_intercept=True).fit(X, y)
        return self.model['lr'].predict(X), self.model['lr'].coef_[0], self.model['lr'].intercept_


    ''' '''
    def build_X1(self, f, alpha):
        return np.column_stack((f, np.cos(alpha), np.sin(alpha)))


    ''' '''
    def build_Xn(self, f, A, alpha, mu):
        rows = f.shape[0]
        if A is None and mu is None:
            X = np.hstack((f, np.cos(alpha), np.sin(alpha)))
            A = LinearRegression(fit_intercept=True).fit(X, y)

        return np.hstack((A[0]*f, A[1]*np.cos(alpha*mu), A[2]*np.sin(alpha*mu)))


    def logging(self, coef_):
        if self.coef_ is None:
            self.coef_ = coef_.reshape(1,5)

        else:
            self.coef_ = np.vstack((self.coef_ , coef_))


    """ Fit method  """
    def fit(self, X, y):

        # Number of rows
        m = X.shape[0]

        ## Get non-linear projection of input records
        alpha = self.compute_alpha(X)
        
        ## Get linear model from n input cols
        self.lin_predictor.fit(X, y)
        f_k = self.lin_predictor.predict(X)
        self.lin_predictions = f_k
        """ Start fit algorithm """
        i = 1
        mu = 1
        while (i <= self.depth):
            if i==1:
                Xn = self.build_X1(f_k, alpha)
                lr = LinearRegression(fit_intercept=True).fit(Xn, y)
                A = lr.coef_[0]
                a = lr.intercept_
                f_k = lr.predict(Xn)
            else:
                mu = self.minimize(f_k, A, a, alpha)
                Xn = self.build_Xn(f_k, A, alpha, mu) # eventually override the build_X1 method
                lr = LinearRegression(fit_intercept=True).fit(Xn, y)
                A = lr.coef_[0]
                a = lr.intercept_
                f_k = lr.predict(Xn) 

            # Error metrics
            mse = self.mse(f_k, y, m)
            pred = np.where(f_k >= 0.5, 1, 0)
            acc = accuracy_score(y, pred)
            
            # Save layer
            coef_ = A.reshape((1,3))
            coef_ = np.insert(coef_, 0, a)
            coef_ = np.insert(coef_, 0, mu)
            print(i, coef_)
            self.logging(coef_)

            # add layers
            print('Iteration:', i, " Mu:", mu, "MSE:", mse, "Accuracy:", acc)

            i += 1
        return f_k

    def minimize(self, f_k, A, a, alpha):
        self.f_k = f_k
        self.A = A
        self.alpha = alpha
        self.a = a
        res = minimize_scalar(self.f, bounds=self.bounds, method='bounded')
        return res.x
        

    def mse(self, f_k, y, m):
        return np.sum((f_k - y)**2) / m        

    def _activation_function(self, X, coef_):
        intercept = coef_[0]
        A = coef_[1:]
        return safe_sparse_dot(X, A.T, dense_output=True) + intercept

    def predict(self, X_test):
        X = X_test
        m = X.shape[0]
        alpha = self.compute_alpha(X)
        f_k = self.lin_predictor.predict(X)
        i = 0

        for coef_ in self.coef_:
            mu = coef_[0]
            if i == 0:
                X = np.hstack((f_k, np.cos(alpha*mu), np.sin(alpha*mu)))
                f_k = self._activation_function(X, coef_[1:])
                f_k = f_k.reshape(m,1)
            else:
                X = np.hstack((prev_coef_[2]*f_k, prev_coef_[3]*np.cos(alpha*mu), prev_coef_[4]*np.sin(alpha*mu)))
                f_k = self._activation_function(X, coef_[1:])
                f_k = f_k.reshape(m,1)

            i += 1
            prev_coef_ = coef_
        return f_k
    
    def plot_error():
        pass

In [3]:
import numpy as np
import pandas as pd
import pickle
import sys
import os,string,collections
import utils

In [4]:
dataset= pd.read_csv(r"C:\Users\ajaym\Downloads\twitter-airline-sentiment\Tweets2.csv")

In [5]:
dataset.head()

Unnamed: 0,tweet_id,airline,text
0,5.7e+17,Virgin America,@VirginAmerica What @dhepburn said.
1,5.7e+17,Virgin America,@VirginAmerica plus you've added commercials t...
2,5.7e+17,Virgin America,@VirginAmerica I didn't today... Must mean I n...
3,5.7e+17,Virgin America,@VirginAmerica it's really aggressive to blast...
4,5.7e+17,Virgin America,@VirginAmerica and it's a really big bad thing...


In [6]:
tc = utils.TextCleaner()
dataset['clean_text'] = tc.transform(dataset['text'])

In [7]:
import re,nltk
import os

In [8]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()

In [9]:
tokenized = dataset['clean_text'].apply(lambda row: tokenize(row))

In [10]:
pd.set_option('display.max_colwidth', -1)
dataset.insert(3,"tokenized", tokenized)

In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [12]:
vader = SentimentIntensityAnalyzer()

In [13]:
def score(text: str) -> float:
    return vader.polarity_scores(text)['compound']

In [14]:
dataset['score'] = dataset['clean_text'].apply(score)

In [15]:
dataset.head()

Unnamed: 0,tweet_id,airline,text,tokenized,clean_text,score
0,5.7e+17,Virgin America,@VirginAmerica What @dhepburn said.,"[what, said]",what said,0.0
1,5.7e+17,Virgin America,@VirginAmerica plus you've added commercials to the experience... tacky.,"[plus, youve, added, commercials, to, the, experience, tacky]",plus youve added commercials to the experience tacky,0.0
2,5.7e+17,Virgin America,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[i, didnt, today, must, mean, i, need, to, take, another, trip]",i didnt today must mean i need to take another trip,0.0
3,5.7e+17,Virgin America,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[its, really, aggressive, to, blast, obnoxious, entertainment, in, your, guests, faces, amp, they, have, little, recourse]",its really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse,-0.2716
4,5.7e+17,Virgin America,@VirginAmerica and it's a really big bad thing about it,"[and, its, a, really, big, bad, thing, about, it]",and its a really big bad thing about it,-0.5829


In [16]:
dataset['pred'] = pd.cut(dataset['score'], bins=5, labels=['strongly negative','weakly negative', 'neutral', 'weakly positive','strongly positive' ])

In [17]:
dataset.head()

Unnamed: 0,tweet_id,airline,text,tokenized,clean_text,score,pred
0,5.7e+17,Virgin America,@VirginAmerica What @dhepburn said.,"[what, said]",what said,0.0,neutral
1,5.7e+17,Virgin America,@VirginAmerica plus you've added commercials to the experience... tacky.,"[plus, youve, added, commercials, to, the, experience, tacky]",plus youve added commercials to the experience tacky,0.0,neutral
2,5.7e+17,Virgin America,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[i, didnt, today, must, mean, i, need, to, take, another, trip]",i didnt today must mean i need to take another trip,0.0,neutral
3,5.7e+17,Virgin America,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[its, really, aggressive, to, blast, obnoxious, entertainment, in, your, guests, faces, amp, they, have, little, recourse]",its really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse,-0.2716,weakly negative
4,5.7e+17,Virgin America,@VirginAmerica and it's a really big bad thing about it,"[and, its, a, really, big, bad, thing, about, it]",and its a really big bad thing about it,-0.5829,strongly negative


In [18]:
def update_vocab_counter(row):
    for word in row:
        vocab_counter[word] += 1

In [19]:
vocab_counter = collections.Counter()
tokenized.apply(update_vocab_counter);
vocab = sorted(vocab_counter, key=vocab_counter.get, reverse=True)

In [20]:
len(vocab)

12523

In [21]:
max_words = 5000

In [22]:
w2id = {w:i for i, w in enumerate(vocab[:max_words])}
w2id['unk'] = -1

In [23]:
def transform_to_ids(row):
    return [w2id[w] if w in w2id else w2id['unk'] for w in row]

In [24]:
dataset['tokenized_int'] = dataset['tokenized'].apply(lambda x: transform_to_ids(x))

In [25]:
lens = dataset['tokenized_int'].apply(lambda x: len(x))

In [26]:
min(lens), max(lens), np.mean(lens)

(1, 32, 15.984972677595628)

In [27]:
maxlen = 20

In [28]:
def sentiment2target(sentiment):
    return {
        'strongly negative': 0,
        'weakly negative': 1,
        'neutral': 2,
        'weakly positive' : 3,
        'strongly positive' : 3
    }[sentiment]
dataset['target'] = dataset.pred.apply(sentiment2target)

In [29]:
dataset.head()

Unnamed: 0,tweet_id,airline,text,tokenized,clean_text,score,pred,tokenized_int,target
0,5.7e+17,Virgin America,@VirginAmerica What @dhepburn said.,"[what, said]",what said,0.0,neutral,"[49, 208]",2
1,5.7e+17,Virgin America,@VirginAmerica plus you've added commercials to the experience... tacky.,"[plus, youve, added, commercials, to, the, experience, tacky]",plus youve added commercials to the experience tacky,0.0,neutral,"[509, 510, 1070, 2304, 0, 1, 188, -1]",2
2,5.7e+17,Virgin America,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[i, didnt, today, must, mean, i, need, to, take, another, trip]",i didnt today must mean i need to take another trip,0.0,neutral,"[2, 178, 92, 742, 533, 2, 70, 0, 140, 135, 182]",2
3,5.7e+17,Virgin America,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[its, really, aggressive, to, blast, obnoxious, entertainment, in, your, guests, faces, amp, they, have, little, recourse]",its really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse,-0.2716,weakly negative,"[59, 126, 3390, 0, 4188, 4189, 933, 11, 15, 2879, 3391, 56, 52, 16, 464, 2541]",1
4,5.7e+17,Virgin America,@VirginAmerica and it's a really big bad thing about it,"[and, its, a, really, big, bad, thing, about, it]",and its a really big bad thing about it,-0.5829,strongly negative,"[8, 59, 3, 126, 446, 199, 465, 76, 14]",0


In [30]:
X=dataset.iloc[:,7]
y=dataset.iloc[:,-1]

In [31]:
X=X.values
y=y.values

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=0)

In [33]:
from keras.preprocessing.sequence import pad_sequences
x_train = pad_sequences(X_train, maxlen=maxlen, value=-1)
x_test = pad_sequences(X_test, maxlen=maxlen, value=-1)

Using TensorFlow backend.


In [34]:
x_train[:5]

array([[ 514,    6,    1,  224,  171,   84,    1,  754,   -1,  210,   64,
           9,  893,    5,   17,    1,  105,   69,   31,   78],
       [1760,   42,   11,    8,   42,   47,  112, 1183,    8,  669,   10,
          64,    2,  162,  120,   19,    4,   97,   62,    4],
       [  -1,   -1,   -1,   -1,   -1,  615,  364,    0,  573,    8,   74,
           8,   77,  109,  172,   37,    7,  449, 1744,   28],
       [  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1,  166,   -1,   -1],
       [  73,  116,   12,  261, 3188,   69,   25,    1,  124,    5,    0,
          -1,   10,   22,   88,   27,   33,   32,   36,  412]])

In [35]:
y_train[:5]

array([2, 3, 3, 2, 2], dtype=int64)

In [36]:
y_train = y_train.reshape(len(y_train), 1)

In [37]:
y_train[:5]

array([[2],
       [3],
       [3],
       [2],
       [2]], dtype=int64)

In [38]:
def test_fit_and_predict(training_preds, testing_preds):
    return np.array_equal(training_preds, testing_preds)

In [39]:
def main(X, y, depth):
    clf = DAN2Regressor(depth=depth)
    tr_pred = clf.fit(X, y)
    path = clf.name
    print(clf.coef_)
    y_pred = clf.predict(X)
    print(tr_pred, y_pred)
    print(test_fit_and_predict(tr_pred, y_pred))

In [40]:
if __name__ == '__main__':
    main(x_train,y_train, depth=50)

1 [ 1.          0.01630493  0.61205353 -0.61950655  1.1288456 ]
Iteration: 1  Mu: 1 MSE: 0.9665056257290605 Accuracy: 0.21657559198542806
2 [ 3.09017869e+09  7.94059374e-04  1.63336239e+00  2.37768990e-02
 -2.40995647e-03]
Iteration: 2  Mu: 3090178693.4581933 MSE: 0.9663933137865285 Accuracy: 0.21657559198542806
3 [ 1.90949952e+09  7.59223125e-03  6.09597766e-01 -1.29965692e+00
 -6.50873918e+00]
Iteration: 3  Mu: 1909499521.4437659 MSE: 0.9657909760072506 Accuracy: 0.21657559198542806
4 [1.90983006e+09 3.77772073e-03 1.63718714e+00 7.49474133e-03
 8.61576204e-04]
Iteration: 4  Mu: 1909830056.2505255 MSE: 0.9657273808918838 Accuracy: 0.21657559198542806
5 [ 1.91036487e+09  3.71764290e-03  6.09557413e-01 -1.57414986e+00
 -2.78628089e+00]
Iteration: 5  Mu: 1910364868.1563673 MSE: 0.965654850151888 Accuracy: 0.21657559198542806
6 [ 1.90983006e+09 -4.90245784e-05  1.64057648e+00 -3.70262709e-05
 -7.33693988e-06]
Iteration: 6  Mu: 1909830056.2505255 MSE: 0.9656548482426632 Accuracy: 0.216575