In [16]:
import pandas as pd
trump_tweets = pd.read_csv('trump.csv')

In [17]:
pd.set_option('display.max_colwidth', -1)
trump_tweets = trump_tweets.dropna()
trump_tweets = trump_tweets.head(1000)

In [18]:
sentiment_counts = trump_tweets.Sentiment.value_counts()
print(sentiment_counts)

Neutral     556
Positive    315
Negative    129
Name: Sentiment, dtype: int64


In [19]:
def sentiment2target(sentiment):
    return {
        'Negative': 0,
        'Neutral': 1,
        'Positive' : 2
    }[sentiment]
targets = trump_tweets.Sentiment.apply(sentiment2target)

In [20]:
from sklearn.model_selection import train_test_split

sentences = trump_tweets['Tweet'].values
y = targets

sentences_train, sentences_test, y_train, y_test = train_test_split(
   sentences, y, test_size=0.20, random_state=1000)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
X_train

<800x3017 sparse matrix of type '<class 'numpy.int64'>'
	with 8751 stored elements in Compressed Sparse Row format>

In [22]:
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,2))

In [23]:
vectorized_data = count_vectorizer.fit_transform(trump_tweets.Tweet)
indexed_data = hstack((np.array(range(0,vectorized_data.shape[0]))[:,None], vectorized_data))
indexed_data

<1000x9349 sparse matrix of type '<class 'numpy.int64'>'
	with 22091 stored elements in COOrdinate format>

In [24]:
from sklearn.model_selection import train_test_split
data_train, data_test, targets_train, targets_test = train_test_split(indexed_data, targets, test_size=0.4, random_state=0)

data_train_index = data_train[:,0]
data_train = data_train[:,1:]
data_test_index = data_test[:,0]
data_test = data_test[:,1:]


In [25]:
def CMatrix(CM, labels=['pay', 'default']):
    df = pd.DataFrame(data=CM, index=labels, columns=labels)
    df.index.name = 'TRUE'
    df.columns.name = 'PREDICTION'
    df.loc['Total'] = df.sum()
    df['Total'] = df.sum(axis=1)
    return df

In [26]:
from sklearn.metrics import accuracy_score

metrics = pd.DataFrame(index=['accuracy'],
                      columns=['SVM', 'LogisticalReg', 'RNN'])

In [27]:
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
clf = OneVsRestClassifier(svm.SVC(gamma=0.01, C=100., probability=True, class_weight='balanced', kernel='linear'))
clf_output = clf.fit(X_train, y_train)

In [28]:
clf.score(X_test, y_test)

0.81

In [30]:
clf.score(X_test, y_test)
y_pred_test = clf.predict(X_test)
metrics.loc['accuracy', 'SVM'] = accuracy_score(y_pred=y_pred_test, y_true = y_test)
100*metrics

Unnamed: 0,SVM,LogisticalReg,RNN
accuracy,81,,


In [31]:
from sklearn.linear_model import LogisticRegression

#n_jobs=-1 = Use all available cores in machine
#random_state = randomise data
lg = LogisticRegression(random_state=10)

#use training data
lg.fit(data_train,targets_train)

y_pred_test = lg.predict(data_test)
metrics.loc['accuracy', 'LogisticalReg'] = accuracy_score(y_pred=y_pred_test, y_true = targets_test)
100*metrics



Unnamed: 0,SVM,LogisticalReg,RNN
accuracy,81,78.25,


In [32]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1

print(sentences_train[2])
print(X_train[2])

Using TensorFlow backend.


b'today collusion cartoon torontostar donaldtrump russianasset http co lpbraptkq'
[148, 16, 50, 181, 3, 22, 1, 2, 182]


In [33]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print(X_train[0, :])

[  85   69   86 1028  179  602    1    2 1029    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [34]:
from keras.models import Sequential
from keras import layers
 
input_dim = data_train.shape[1]  # Number of features

model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [35]:
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 10)                93490     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 93,501
Trainable params: 93,501
Non-trainable params: 0
_________________________________________________________________


In [36]:
history = model.fit(data_train, targets_train,
                    epochs=100,
                    verbose=False,
                    validation_data=(data_test, targets_test),
                    batch_size=10)

In [37]:
loss, accuracy = model.evaluate(data_train, targets_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(data_test, targets_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.6883
Testing Accuracy:  0.5875


In [38]:
metrics.loc['accuracy', 'RNN'] = accuracy
100*metrics

Unnamed: 0,SVM,LogisticalReg,RNN
accuracy,81,78.25,58.75


In [43]:
from keras.models import Sequential
from keras import layers

embedding_dim = 50
maxlen = 100

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           161950    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
Total params: 162,471
Trainable params: 162,471
Non-trainable params: 0
_________________________________________________________________


In [44]:
history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.6893
Testing Accuracy:  0.5600


In [49]:
from keras.layers import LSTM
from keras.layers import Embedding, Dense

lstm = Sequential()
lstm.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
lstm.add(Dense(1, activation="sigmoid"))

In [50]:
lstm.compile(loss="binary_crossentropy",
            optimizer="adam",
            metrics=['accuracy'])

In [51]:
history2 = lstm.fit(X_train, y_train,
                    epochs=50,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)


In [52]:
loss, accuracy = lstm.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = lstm.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.5600
Testing Accuracy:  0.5440
