In [1]:
import numpy as np
import pickle
from scipy.optimize import minimize_scalar
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils.extmath import safe_sparse_dot
from time import strftime, gmtime

In [2]:
class DAN2Regressor(object):

    def __init__(self, depth=10, bounds=(0,5000000000)):
        self.bounds = bounds
        self.depth = depth
        self.lin_predictor = LinearRegression(fit_intercept=True)
        self.coef_ = None
        self.name = strftime('dan2model-'+ str(depth) + '-%Y-%b-%d-%H-%M-%S', gmtime())
        #self.lin_predictions = None


    """ Layer activation """
    def f(self, x):
        
        f = self.f_k
        A = self.A
        alpha = self.alpha
        a = self.a
        rows = f.shape[0]
        ''' check if intercept term should be placed first'''
        #Xn = np.hstack((a, A[0]*f, A[1]*np.cos(alpha*x), A[2]*np.sin(alpha*x)))
        Xn = a + A[0]*f + A[1]*np.cos(alpha*x) + A[2]*np.sin(alpha*x)
        return np.sum(Xn)


    """ Method to get alpha column for DAN2 """
    def compute_alpha(self, X):
        cols = X.shape[1]

        """ Create resultant vector of ones """
        R = np.ones(cols)
        #print('R', R.shape)

        """ Compute dot product """
        X_dot_R = (1 + np.dot(X,R))
        #print('XdR', X_dot_R.shape)
        X_dot_R = X_dot_R.reshape((len(X),))
        #print('XdR', X_dot_R.shape)

        """ Compute X and R magnitudes """
        X_mag = np.sqrt(1*1 + np.sum(np.square(X), axis=1))
        R_mag = np.sqrt(np.sum(R**2) + 1*1)

        """ Compute arccosine """
        acos = np.arccos(X_dot_R / (X_mag * R_mag))
        #print('acos', acos.shape)

        return acos.reshape(len(acos),1) 


    """ Linear method """
    def linear_reg(self, X, y):
        self.model['lr'] = LinearRegression(fit_intercept=True).fit(X, y)
        return self.model['lr'].predict(X), self.model['lr'].coef_[0], self.model['lr'].intercept_


    ''' '''
    def build_X1(self, f, alpha):
        return np.column_stack((f, np.cos(alpha), np.sin(alpha)))


    ''' '''
    def build_Xn(self, f, A, alpha, mu):
        rows = f.shape[0]
        if A is None and mu is None:
            X = np.hstack((f, np.cos(alpha), np.sin(alpha)))
            A = LinearRegression(fit_intercept=True).fit(X, y)

        return np.hstack((A[0]*f, A[1]*np.cos(alpha*mu), A[2]*np.sin(alpha*mu)))


    def logging(self, coef_):
        if self.coef_ is None:
            self.coef_ = coef_.reshape(1,5)

        else:
            self.coef_ = np.vstack((self.coef_ , coef_))


    """ Fit method  """
    def fit(self, X, y):

        # Number of rows
        m = X.shape[0]

        ## Get non-linear projection of input records
        alpha = self.compute_alpha(X)
        
        ## Get linear model from n input cols
        self.lin_predictor.fit(X, y)
        f_k = self.lin_predictor.predict(X)
        self.lin_predictions = f_k
        """ Start fit algorithm """
        i = 1
        mu = 1
        while (i <= self.depth):
            if i==1:
                Xn = self.build_X1(f_k, alpha)
                lr = LinearRegression(fit_intercept=True).fit(Xn, y)
                A = lr.coef_[0]
                a = lr.intercept_
                f_k = lr.predict(Xn)
            else:
                mu = self.minimize(f_k, A, a, alpha)
                Xn = self.build_Xn(f_k, A, alpha, mu) # eventually override the build_X1 method
                lr = LinearRegression(fit_intercept=True).fit(Xn, y)
                A = lr.coef_[0]
                a = lr.intercept_
                f_k = lr.predict(Xn) 

            # Error metrics
            mse = self.mse(f_k, y, m)
            pred = np.where(f_k >= 0.5, 1, 0)
            acc = accuracy_score(y, pred)
            
            # Save layer
            coef_ = A.reshape((1,3))
            coef_ = np.insert(coef_, 0, a)
            coef_ = np.insert(coef_, 0, mu)
            print(i, coef_)
            self.logging(coef_)

            # add layers
            print('Iteration:', i, " Mu:", mu, "MSE:", mse, "Accuracy:", acc)

            i += 1
        return f_k

    def minimize(self, f_k, A, a, alpha):
        self.f_k = f_k
        self.A = A
        self.alpha = alpha
        self.a = a
        res = minimize_scalar(self.f, bounds=self.bounds, method='bounded')
        return res.x
        

    def mse(self, f_k, y, m):
        return np.sum((f_k - y)**2) / m        

    def _activation_function(self, X, coef_):
        intercept = coef_[0]
        A = coef_[1:]
        return safe_sparse_dot(X, A.T, dense_output=True) + intercept

    def predict(self, X_test):
        X = X_test
        m = X.shape[0]
        alpha = self.compute_alpha(X)
        f_k = self.lin_predictor.predict(X)
        i = 0

        for coef_ in self.coef_:
            mu = coef_[0]
            if i == 0:
                X = np.hstack((f_k, np.cos(alpha*mu), np.sin(alpha*mu)))
                f_k = self._activation_function(X, coef_[1:])
                f_k = f_k.reshape(m,1)
            else:
                X = np.hstack((prev_coef_[2]*f_k, prev_coef_[3]*np.cos(alpha*mu), prev_coef_[4]*np.sin(alpha*mu)))
                f_k = self._activation_function(X, coef_[1:])
                f_k = f_k.reshape(m,1)

            i += 1
            prev_coef_ = coef_
        return f_k
    
    def plot_error():
        pass

In [3]:
import numpy as np
import pandas as pd
import pickle
import sys
import os,string,collections
import utils

In [4]:
dataset= pd.read_csv(r"C:\Users\ajaym\Downloads\twitter-airline-sentiment\Tweets2.csv")

In [5]:
dataset.head()

Unnamed: 0,tweet_id,airline,text
0,5.7e+17,Virgin America,@VirginAmerica What @dhepburn said.
1,5.7e+17,Virgin America,@VirginAmerica plus you've added commercials t...
2,5.7e+17,Virgin America,@VirginAmerica I didn't today... Must mean I n...
3,5.7e+17,Virgin America,@VirginAmerica it's really aggressive to blast...
4,5.7e+17,Virgin America,@VirginAmerica and it's a really big bad thing...


In [6]:
tc = utils.TextCleaner()
dataset['clean_text'] = tc.transform(dataset['text'])

In [7]:
import re,nltk
import os

In [8]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()

In [9]:
tokenized = dataset['clean_text'].apply(lambda row: tokenize(row))

In [10]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
stop.update(['amp', 'rt', 'cc'])
stop = stop - set(['no', 'not'])

In [11]:
def remove_stopwords(row):
    return [t for t in row if t not in stop]

In [12]:
tokenized = tokenized.apply(lambda row: remove_stopwords(row))

In [13]:
pd.set_option('display.max_colwidth', -1)
dataset.insert(3,"tokenized", tokenized)

In [14]:
from textblob import TextBlob, Word, Blobber

In [15]:
dataset['clean_text'][:10].apply(lambda x: TextBlob(x).sentiment)

0    (0.0, 0.0)                               
1    (0.0, 0.0)                               
2    (-0.3125, 0.6875)                        
3    (0.0062500000000000056, 0.35)            
4    (-0.3499999999999999, 0.3833333333333333)
5    (-0.2083333333333333, 0.6333333333333333)
6    (0.45, 0.65)                             
7    (0.2, 0.2)                               
8    (0.0, 0.0)                               
9    (0.4666666666666666, 0.6)                
Name: clean_text, dtype: object

In [16]:
dataset['sentiment_score'] = dataset['clean_text'].apply(lambda x: TextBlob(x).sentiment[0])
dataset.head()

Unnamed: 0,tweet_id,airline,text,tokenized,clean_text,sentiment_score
0,5.7e+17,Virgin America,@VirginAmerica What @dhepburn said.,[said],what said,0.0
1,5.7e+17,Virgin America,@VirginAmerica plus you've added commercials to the experience... tacky.,"[plus, youve, added, commercials, experience, tacky]",plus youve added commercials to the experience tacky,0.0
2,5.7e+17,Virgin America,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[didnt, today, must, mean, need, take, another, trip]",i didnt today must mean i need to take another trip,-0.3125
3,5.7e+17,Virgin America,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[really, aggressive, blast, obnoxious, entertainment, guests, faces, little, recourse]",its really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse,0.00625
4,5.7e+17,Virgin America,@VirginAmerica and it's a really big bad thing about it,"[really, big, bad, thing]",and its a really big bad thing about it,-0.35


In [17]:
sentiment = pd.Series([]) 
for x in range(len(dataset)):
    if dataset["sentiment_score"][x] > 0.1 :
        sentiment[x] = "positive"
        
    elif dataset["sentiment_score"][x] < (-0.1) :
        sentiment[x] = "negative"
        
    else :
        sentiment[x] = "neutral"
        
dataset.insert(6,"Sentiment", sentiment)

In [18]:
dataset.head()

Unnamed: 0,tweet_id,airline,text,tokenized,clean_text,sentiment_score,Sentiment
0,5.7e+17,Virgin America,@VirginAmerica What @dhepburn said.,[said],what said,0.0,neutral
1,5.7e+17,Virgin America,@VirginAmerica plus you've added commercials to the experience... tacky.,"[plus, youve, added, commercials, experience, tacky]",plus youve added commercials to the experience tacky,0.0,neutral
2,5.7e+17,Virgin America,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[didnt, today, must, mean, need, take, another, trip]",i didnt today must mean i need to take another trip,-0.3125,negative
3,5.7e+17,Virgin America,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[really, aggressive, blast, obnoxious, entertainment, guests, faces, little, recourse]",its really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse,0.00625,neutral
4,5.7e+17,Virgin America,@VirginAmerica and it's a really big bad thing about it,"[really, big, bad, thing]",and its a really big bad thing about it,-0.35,negative


In [19]:
def update_vocab_counter(row):
    for word in row:
        vocab_counter[word] += 1

In [20]:
vocab_counter = collections.Counter()
dataset['tokenized'].apply(update_vocab_counter);
vocab = sorted(vocab_counter, key=vocab_counter.get, reverse=True)

In [21]:
len(vocab)

12390

In [22]:
max_words = 5000

In [23]:
w2id = {w:i for i, w in enumerate(vocab[:max_words])}

In [24]:
w2id['unk'] = -1

In [25]:
def transform_to_ids(row):
    return [w2id[w] if w in w2id else w2id['unk'] for w in row]

In [26]:
dataset['tokenized_int'] = dataset['tokenized'].apply(lambda x: transform_to_ids(x))

In [27]:
lens = dataset['tokenized_int'].apply(lambda x: len(x))

In [28]:
min(lens), max(lens), np.mean(lens)

(0, 21, 8.987704918032787)

In [29]:
maxlen = 20

In [30]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dataset['target'] = le.fit_transform(dataset['Sentiment'])

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset['tokenized_int'].values, dataset['target'].values, test_size=0.25, random_state=0)

In [34]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [32]:
from keras.preprocessing.sequence import pad_sequences
x_train = pad_sequences(X_train, maxlen=maxlen, value=-1)
x_test = pad_sequences(X_test, maxlen=maxlen, value=-1)

Using TensorFlow backend.


In [33]:
x_train[:5]

array([[  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,  412,  141,
          98,  647,   -1,  129,  783,    0,   21,    5,   27],
       [  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1, 1348,
        1642,    9,    9,   49,  562,   89,   55,   38,   17],
       [  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
         510,  274,  469,   25,   48,   99,    7, 1627,    4],
       [  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
          -1,   -1,   -1,   -1,   -1,   -1,   93, 4980,   -1],
       [  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,    0,  274,
          24,  176, 3060,   21,   59,    0,   -1,   33,  318]])

In [34]:
y_train = y_train.reshape(len(y_train), 1)

In [35]:
y_train[:5]

array([[2],
       [2],
       [2],
       [1],
       [1]])

In [36]:
def test_fit_and_predict(training_preds, testing_preds):
    return np.array_equal(training_preds, testing_preds)

In [37]:
def main(X, y, depth):
    clf = DAN2Regressor(depth=depth)
    tr_pred = clf.fit(X, y)
    path = clf.name
    print(clf.coef_)
    y_pred = clf.predict(X)
    print(tr_pred, y_pred)
    print(test_fit_and_predict(tr_pred, y_pred))

In [38]:
if __name__ == '__main__':
    main(x_train,y_train, depth=10)


1 [ 1.          0.02175377  0.86114349 -0.24256921  0.24627964]
Iteration: 1  Mu: 1 MSE: 0.4929378992487241 Accuracy: 0.4914389799635701
2 [ 2.53414589e+09 -3.92661536e-03  1.16560379e+00  3.10327969e-02
  4.98088224e-02]
Iteration: 2  Mu: 2534145892.0116167 MSE: 0.49283515190771177 Accuracy: 0.49153005464480876
3 [ 1.12656189e+09  1.36822590e-02  8.46965382e-01 -6.09382356e-01
 -3.12614916e-01]
Iteration: 3  Mu: 1126561886.3222744 MSE: 0.4925332630930337 Accuracy: 0.49153005464480876
4 [3.37840729e+09 6.84786562e-04 1.18006837e+00 2.91094399e-03
 2.13094195e-02]
Iteration: 4  Mu: 3378407287.381178 MSE: 0.4925098072657395 Accuracy: 0.49153005464480876
5 [ 3.20281007e+09  3.83449622e-03  8.44284623e-01 -1.06098133e+00
 -5.43135363e-01]
Iteration: 5  Mu: 3202810068.063372 MSE: 0.4924372181826588 Accuracy: 0.49153005464480876
6 [ 3.33354055e+09  5.17789968e-03  1.17873402e+00 -1.20464728e-02
  2.30419904e-02]
Iteration: 6  Mu: 3333540549.8014803 MSE: 0.4922770689947139 Accuracy: 0.4914389