[Reference](https://medium.com/hackernoon/presidential-debate-sentiment-analysis-with-lstm-onevsrest-linearsvc-nlp-step-by-step-guide-b9683e2c8ed9)

In [2]:
import numpy as np 
import pandas as pd 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
import re

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import RidgeClassifier

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/Branden-Kang/Natural-Language-Processing/master/Data/Sentiment.csv")

In [4]:
df.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,name,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,I_Am_Kenzi,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,PeacefulQuest,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,PussssyCroook,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,MattFromTexas31,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,sharonDay5,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [5]:
df = df[['text','sentiment']]
df.sample(5)

Unnamed: 0,text,sentiment
10914,THIS ISNT A DEBATE!!!! This is a Q and A #GOPD...,Negative
11092,“@msgoddessrises: Damn I'm out of Peach vodka!...,Negative
13800,"Okay, @JebBush gets a point for that! #commonC...",Positive
8686,RT @BettyFckinWhite: So many great jokes on Tw...,Negative
13241,RT @mozgovaya: 10 men on stage discussing one ...,Positive


In [6]:
df.shape

(13871, 2)

In [7]:
df['sentiment'].value_counts()

Negative    8493
Neutral     3142
Positive    2236
Name: sentiment, dtype: int64

In [8]:
df.iloc[57,0]

'RT @factcheckdotorg: .@JebBush said he cut FL taxes by $19B. But that includes cuts in estate taxes mandated by federal law. #GOPDebate. ht…'

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.33, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [10]:
X_train.head()

10428    RT @SalMasekela: These self righteous hypocrit...
13671    RT @brock_a_r: I wonder which candidate is goi...
89       A few of my favorite #Twitter responses to the...
5024     @FoxNews Megyn Kelly's #GOPDebate performance ...
8572     I didn't watch the #GOPDebates tonight, so I w...
Name: text, dtype: object

In [11]:
y_train.head()

10428    Negative
13671    Negative
89        Neutral
5024     Negative
8572     Negative
Name: sentiment, dtype: object

# Text Pre-processing

In [12]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z +_]')
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(['rt', 'http']) # extend stopwords; rt means re-tweet
STOPWORDS = set(STOPWORDS)

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([word for word in text.split() if word not in STOPWORDS]) # delete stopwords from text
    text = text.strip()
    return text

In [13]:
X_train = [text_prepare(x) for x in X_train]

In [14]:
X_train[:3]

['salmasekela self righteous hypocrites trying god shameless gopdebates',
 'brock_a_r wonder candidate going first accidentally call ben carson help gopdebates',
 'favorite twitter responses gopdebate last night tco 2iqcdrcdlm tco 7tdma3vlm8']

In [15]:
X_val = [text_prepare(x) for x in X_val]

In [16]:
X_test = [text_prepare(x) for x in X_test]

In [17]:
# Dictionary of all words from train corpus with their counts.
words_counts = {}

from collections import Counter
words_counts = Counter([word for line in X_train for word in line.split(' ')])

# Sorting 
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]

# Top 10
most_common_words[:10]

[('gopdebate', 4811),
 ('gopdebates', 2827),
 ('tco', 1941),
 ('rwsurfergirl', 1072),
 ('trump', 953),
 ('fox', 714),
 ('amp', 578),
 ('debate', 569),
 ('realdonaldtrump', 560),
 ('news', 504)]

# Word Embedding with TF-IDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_val, X_test):
    """
        X_train, X_val, X_test - input text       
        return TF-IDF vectorizer for each dataset
    """
    
    # filter out too rare words (occur less than in 5 titles) and too frequent words (occur more than in 90% of the tweets)
    # ngram!!! -->  ngram_range=(1,2)
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern='(\S+)')
    
    # Fit and transform the vectorizer on the train set
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    
    # Transform the test and val sets 
    X_val_tfidf = tfidf_vectorizer.transform(X_val)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    
    return X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_
    
    
X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)

In [19]:
X_train_tfidf.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.40615562],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

# 1st Model: Logistic regression

In [22]:
logreg = LogisticRegression()
# %%time
logreg.fit(X_train_tfidf, y_train)

# Return accuracy
scores = cross_val_score(logreg, X_train_tfidf, y_train, scoring='accuracy', n_jobs=-1, cv=3)

print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Cross-validation mean accuracy 67.93%, std 0.32.


# 2nd Model: LinearSVC

In [23]:
%%time
svc = LinearSVC(dual=False)
svc.fit(X_train_tfidf, y_train)


scores = cross_val_score(svc, X_test_tfidf, y_test, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 63.81%, std 0.36.
CPU times: user 277 ms, sys: 4.03 ms, total: 281 ms
Wall time: 464 ms


# 3rd Model: OneVsRest

In [24]:
def train_classifier(X_train, y_train):
    """
      X_train, y_train — training text and sentiment
      
      return: trained classifier
    """
    
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.
    
    model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
    model.fit(X_train, y_train)
    return model
    
 
classifier_tfidf = train_classifier(X_train_tfidf, y_train)

y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)

In [25]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def evaluation_scores(y_val, predicted):
    
    print ("Accracy={}".format(accuracy_score(y_val, predicted)))
    print ("F1_macro={}".format(f1_score(y_val, predicted, average='macro')))
    print ("F1_micro={}".format(f1_score(y_val, predicted, average='micro')))
    print ("F1_wted={}".format(f1_score(y_val, predicted, average='weighted')))
    
print('Tfidf')
evaluation_scores(y_val, y_val_predicted_labels_tfidf)

Tfidf
Accracy=0.6906939214631522
F1_macro=0.5550930129562491
F1_micro=0.6906939214631522
F1_wted=0.6518652578108339


# 4th Model: LSTM with Keras

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical

In [27]:
X = [text_prepare(x) for x in df['text']]

In [28]:
X[:3]

['nancyleegrahn everyone feel climate change question last night exactly gopdebate',
 'scottwalker didnt catch full gopdebate last night scotts best lines 90 seconds walker16 tco zsff',
 'tjmshow mention tamir rice gopdebate held cleveland wow']

In [29]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X)

## Create LSTM model

In [30]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 24, 128)           256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 24, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________
None


In [31]:
Y = pd.get_dummies(df['sentiment']).values

In [32]:
Y[:3]

array([[0, 1, 0],
       [0, 0, 1],
       [0, 1, 0]], dtype=uint8)

In [33]:
# create train and test datasets
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print("Trianing ", X_train.shape,Y_train.shape)
print("Testing ",X_test.shape,Y_test.shape)

batch_size = 32
model.fit(X_train, Y_train, epochs = 20, batch_size=batch_size, verbose = 2)

Trianing  (9293, 24) (9293, 3)
Testing  (4578, 24) (4578, 3)
Epoch 1/20
291/291 - 37s - loss: 0.8316 - accuracy: 0.6423
Epoch 2/20
291/291 - 37s - loss: 0.6890 - accuracy: 0.7036
Epoch 3/20
291/291 - 37s - loss: 0.6250 - accuracy: 0.7337
Epoch 4/20
291/291 - 37s - loss: 0.5877 - accuracy: 0.7529
Epoch 5/20
291/291 - 37s - loss: 0.5568 - accuracy: 0.7652
Epoch 6/20
291/291 - 38s - loss: 0.5248 - accuracy: 0.7797
Epoch 7/20
291/291 - 37s - loss: 0.4934 - accuracy: 0.7950
Epoch 8/20
291/291 - 37s - loss: 0.4696 - accuracy: 0.8045
Epoch 9/20
291/291 - 37s - loss: 0.4457 - accuracy: 0.8162
Epoch 10/20
291/291 - 37s - loss: 0.4260 - accuracy: 0.8214
Epoch 11/20
291/291 - 37s - loss: 0.4044 - accuracy: 0.8315
Epoch 12/20
291/291 - 37s - loss: 0.3854 - accuracy: 0.8408
Epoch 13/20
291/291 - 37s - loss: 0.3777 - accuracy: 0.8445
Epoch 14/20
291/291 - 37s - loss: 0.3549 - accuracy: 0.8524
Epoch 15/20
291/291 - 37s - loss: 0.3520 - accuracy: 0.8568
Epoch 16/20
291/291 - 37s - loss: 0.3362 - accur

<tensorflow.python.keras.callbacks.History at 0x7fb40f9e4908>