# STEP 1: ADD REQUIRED LIBRARIES

In [5]:
import pandas as pd
import numpy as np
import nltk
import timeit
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import sklearn.metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score

# STEP 2: SET RANDOM SEED

In [6]:
# This is used to reproduce the same result every time if the script is kept consistent.
# Otherwise, each run will produce different results. The seed can be set to any number.
np.random.seed(500)

# STEP 3: LOAD TWITTER DATASET

In [7]:
# STEP 3: ADD CORPUS
print("Loading data...")
dataset = pd.read_csv("Datasets/Twitter.csv", encoding='latin-1', names=["label", "id", "date", "flag", "user", "text"])
dataset = dataset.drop(dataset.index[0])
print("Data successfully loaded!")
print(dataset)


# NOTE: The code below was used to reuce the source dataset (Twitter_Source.csv)
# into a reduced set of 50,000 records (25,000 positive, 25,000 negative). The
# purpose for doing so is to match the number of records in IMDB dataset.

#dataset = dataset.sample(frac=1).reset_index(drop=True)
#print(dataset)
#df_pos = dataset[(dataset.label == 0)].head(25000)
#print(df_pos)
#df_neg = dataset[(dataset.label == 4)].head(25000)
#print(df_neg)
#frames = [df_pos, df_neg]
#data_reduced = pd.concat(frames)
#print(data_reduced)
#data_reduced.to_csv("Twitter_Reduced.csv", encoding='utf-8', index=False)

Loading data...
Data successfully loaded!
      label          id                          date      flag  \
1         0  2049279662  Fri Jun 05 16:36:21 PDT 2009  NO_QUERY   
2         0  2204003722  Wed Jun 17 01:04:33 PDT 2009  NO_QUERY   
3         0  2186271634  Mon Jun 15 18:36:03 PDT 2009  NO_QUERY   
4         0  1678353782  Sat May 02 06:25:56 PDT 2009  NO_QUERY   
5         0  1695759054  Mon May 04 06:53:05 PDT 2009  NO_QUERY   
...     ...         ...                           ...       ...   
49996     4  1834175935  Mon May 18 01:55:22 PDT 2009  NO_QUERY   
49997     4  1982199141  Sun May 31 10:43:39 PDT 2009  NO_QUERY   
49998     4  1999647354  Mon Jun 01 20:34:49 PDT 2009  NO_QUERY   
49999     4  1988905306  Sun May 31 23:43:41 PDT 2009  NO_QUERY   
50000     4  1882355493  Fri May 22 06:43:20 PDT 2009  NO_QUERY   

                  user                                               text  
1             ccccourt                     My family is a bunch of hicks   
2

# STEP 4: DATA PROCESSING

In [8]:
# Step 4a: Remove blank rows if any.
print("\nStart of Data Pre-processing...")
print("- Removing blanks...")
dataset['text'].dropna(inplace=True)
print("- Blanks removed.")


Start of Data Pre-processing...
- Removing blanks...
- Blanks removed.


In [9]:
# Step 4b : Change all the text to lower case. 
print("- Converting to lowercase...")
dataset['text'] = [entry.lower() for entry in dataset['text']]
print("- Converted to lowercase.")

- Converting to lowercase...
- Converted to lowercase.


In [10]:
# Step 4c: Tokenization to divide each entry in dataset into a set of words (unigram)
print("- Starting tokenization...")
dataset['text'] = [word_tokenize(entry) for entry in dataset['text']]
print("- Tokenization is complete")
print(dataset['text'])
print("- End of data pre-processing.")
print("\n***** Current View of Corpus *****\n")
print(dataset)

- Starting tokenization...
- Tokenization is complete
1                    [my, family, is, a, bunch, of, hicks]
2                     [oh, gosh, !, what, can, i, do, now]
3        [facebook, status, :, james, ca, n't, get, int...
4        [sniffling, ,, and, lonely, ., great, ,, just,...
5        [@, bongchi, i, 'll, get, there, eventually, ,...
                               ...                        
49996    [@, loris_sl, london, is, nice, also, slovenia...
49997    [i, hope, wins, the, two, awards, that, she, i...
49998                                   [@, futureisclear]
49999               [spare, my, time, with, mom, at, home]
50000    [morning, all, ,, just, got, off, work, (, 6:3...
Name: text, Length: 50000, dtype: object
- End of data pre-processing.

***** Current View of Corpus *****

      label          id                          date      flag  \
1         0  2049279662  Fri Jun 05 16:36:21 PDT 2009  NO_QUERY   
2         0  2204003722  Wed Jun 17 01:04:33 PDT 2009  

# STEP 5: PREPARE TRAIN & TEST DATASETS

In [11]:
print("- Splitting dataset (80% train, 20% test)...")
train_x, test_x, train_y, test_y = model_selection.train_test_split(dataset['text'], dataset['label'], test_size=0.2)
print("- Data splitting is complete.")

- Splitting dataset (80% train, 20% test)...
- Data splitting is complete.


# STEP 6: DATA ENCODING

In [12]:
print("- Encoding data if necessary (i.e. categorical data converted to numerical values)...\n")
Encoder = LabelEncoder()
print("Original Train_Y: ", train_y)
train_y = Encoder.fit_transform(train_y)
print("New Train_Y:      ", train_y)
print("Original Test_Y:  ", test_y)
test_y = Encoder.fit_transform(test_y)
print("New Test_Y:       ", test_y)
print("\n- Data encoding is complete.")

- Encoding data if necessary (i.e. categorical data converted to numerical values)...

Original Train_Y:  544      0
25471    4
29799    4
28699    4
18439    0
        ..
19390    0
3791     0
41234    4
44866    4
17336    0
Name: label, Length: 40000, dtype: object
New Train_Y:       [0 1 1 ... 1 1 0]
Original Test_Y:   26125    4
8563     0
43160    4
41936    4
8868     0
        ..
37048    4
40067    4
33889    4
27836    4
1305     0
Name: label, Length: 10000, dtype: object
New Test_Y:        [1 0 1 ... 1 1 0]

- Data encoding is complete.


# STEP 7: TF-IDF Feature Vectorization

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)

tfidf.fit(dataset['text'])
tfidf.vocabulary_

{'my': 40382,
 'family': 22207,
 'is': 29723,
 'a': 5328,
 'bunch': 12432,
 'of': 42402,
 'hicks': 27338,
 'oh': 42498,
 'gosh': 25426,
 '!': 0,
 'what': 61792,
 'can': 12979,
 'i': 28413,
 'do': 19139,
 'now': 42027,
 'facebook': 22043,
 'status': 54017,
 ':': 5137,
 'james': 30249,
 'ca': 12719,
 "n't": 40541,
 'get': 24737,
 'into': 29528,
 'his': 27503,
 'email': 20804,
 'account': 5670,
 'well': 61637,
 ',': 679,
 'could': 16101,
 'with': 62329,
 'fresh': 23787,
 'start': 53961,
 '-': 687,
 'the': 56574,
 'old': 42649,
 'one': 42794,
 'was': 61247,
 'getting..': 24759,
 'sniffling': 52772,
 'and': 7349,
 'lonely': 35659,
 '.': 808,
 'great': 25662,
 'just': 32044,
 'how': 28042,
 'wanted': 61190,
 'to': 57730,
 'spend': 53490,
 'day': 17646,
 'you': 63704,
 '@': 5174,
 'bongchi': 11319,
 "'ll": 166,
 'there': 56899,
 'eventually': 21573,
 'but': 12545,
 'it': 29815,
 'still': 54280,
 'kinda': 33149,
 'sucks': 54749,
 'c4': 12709,
 'so': 52835,
 'freakin': 23688,
 "'": 5,
 'annoyin

In [14]:
# These will now contain for each row a list of unique integer number and
# its associated importance as calculated by TF-IDF.
train_x_tfidf = tfidf.transform(train_x)
test_x_tfidf = tfidf.transform(test_x)
print("Word vectorization is complete.")
print("\n- Tfidf vectorized format of training set: ")
print(train_x_tfidf)
print("\n- Tfidf vectorized format of testing set: ")
print(test_x_tfidf)

Word vectorization is complete.

- Tfidf vectorized format of training set: 
  (0, 57773)	0.26580747568220714
  (0, 57730)	0.14180698993600635
  (0, 57365)	0.5782866189330201
  (0, 41959)	0.3779166761922947
  (0, 41940)	0.2364550845116838
  (0, 26649)	0.305973011002048
  (0, 24051)	0.3194201709904071
  (0, 19139)	0.23204337669400973
  (0, 12545)	0.2226532057026917
  (0, 0)	0.2775291352807045
  (1, 54978)	0.7377416662069001
  (1, 39598)	0.49541412629591225
  (1, 25284)	0.4005182088014637
  (1, 0)	0.2233545204986727
  (2, 47454)	0.28190893380499477
  (2, 28116)	0.25724632004098474
  (2, 26649)	0.30196139648961995
  (2, 25662)	0.3070500417535348
  (2, 23836)	0.35058715756898406
  (2, 5137)	0.2268014633520632
  (2, 2159)	0.6916687984078597
  (2, 808)	0.12702474260388927
  (3, 63704)	0.17617846021852218
  (3, 56574)	0.14152510730204554
  (3, 29723)	0.1815037154639709
  :	:
  (39998, 6005)	0.4371364615907571
  (39998, 5174)	0.07537801778420658
  (39998, 817)	0.1253158577502477
  (39999, 6232

# STEP 8: TRAIN & TEST ML MODELS FOR PREDICTING SENTIMENT OF TWEETS

In [15]:
# --- STEP 8: USE ML ALGORITHMS TO PREDICT OUTCOME ---
print("\n********** ML ALGORITHMS **********\n")
print("- Classifier 1: Multinomial Naive Bayes")

# fit the training dataset on the NB classifier
print("|___ Fitting training dataset on the MNB classifier...")
model_MNB = naive_bayes.MultinomialNB()
start = timeit.default_timer()
model_MNB.fit(train_x_tfidf, train_y)
stop = timeit.default_timer()
print("|___ Training is complete. Total time: ", stop - start)

# predict the labels on validation dataset
print("|___ Making predictions...")
start = timeit.default_timer()
predictions_MNB = model_MNB.predict(test_x_tfidf)
stop = timeit.default_timer()
print("|___ Finished making predictions. Total time: ", stop - start)

# calculate effectiveness metrics
print("|___ Naive Bayes Accuracy Score -> ", accuracy_score(test_y, predictions_MNB)*100)
print(classification_report(test_y, predictions_MNB))
print("Confusion Matrix:\n", confusion_matrix(test_y, predictions_MNB))


********** ML ALGORITHMS **********

- Classifier 1: Multinomial Naive Bayes
|___ Fitting training dataset on the MNB classifier...
|___ Training is complete. Total time:  0.014184599999907732
|___ Making predictions...
|___ Finished making predictions. Total time:  0.002373400000124093
|___ Naive Bayes Accuracy Score ->  75.11
              precision    recall  f1-score   support

           0       0.72      0.82      0.77      5058
           1       0.79      0.68      0.73      4942

    accuracy                           0.75     10000
   macro avg       0.76      0.75      0.75     10000
weighted avg       0.76      0.75      0.75     10000

Confusion Matrix:
 [[4161  897]
 [1592 3350]]


In [16]:
# Classifier 2 - Algorithm - SVM
# fit the training dataset on the SVM classifier
print("\n- Classifier 2: Support Vector Machines (SVM)")
print("|___ Fitting training dataset on the SVM classifier...")
model_SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
start = timeit.default_timer()
model_SVM.fit(train_x_tfidf, train_y)
stop = timeit.default_timer()
print("|___ Training is complete. Total time: ", stop-start)
# predict the labels on validation dataset
print("|___ Making predictions...")
start = timeit.default_timer()
predictions_SVM = model_SVM.predict(test_x_tfidf)
start = timeit.default_timer()
print("|___ Finished making predictions. Total time: ", stop-start)
# Use accuracy_score function to get the accuracy
print("|___ SVM Accuracy Score -> ", accuracy_score(test_y, predictions_SVM)*100)
print(classification_report(test_y, predictions_SVM))
print("Confusion Matrix:\n", confusion_matrix(test_y, predictions_SVM))


- Classifier 2: Support Vector Machines (SVM)
|___ Fitting training dataset on the SVM classifier...
|___ Training is complete. Total time:  363.12506180000014
|___ Making predictions...
|___ Finished making predictions. Total time:  -34.777232199999844
|___ SVM Accuracy Score ->  77.64999999999999
              precision    recall  f1-score   support

           0       0.79      0.76      0.78      5058
           1       0.76      0.79      0.78      4942

    accuracy                           0.78     10000
   macro avg       0.78      0.78      0.78     10000
weighted avg       0.78      0.78      0.78     10000

Confusion Matrix:
 [[3856 1202]
 [1033 3909]]


In [17]:
model_LR = LogisticRegression(C=1, max_iter=1000)
# fit the training dataset on the Logistic Regression classifier
print("\nFitting training dataset on the Logistic Regression classifier...")
start = timeit.default_timer()
model_LR.fit(train_x_tfidf, train_y)
stop = timeit.default_timer()
print("Training is complete. Total time: ", stop - start)
# predict the labels on validation dataset (i.e. test dataset)
print("Making predictions...")
start = timeit.default_timer()
predictions_LR = model_LR.predict(test_x_tfidf)
stop = timeit.default_timer()
print("- Finished making predictions! Total Time: ", stop - start)
print("- LR Accuracy Score for C=%s: %s" % (1, accuracy_score(test_y, predictions_LR)*100))
print(classification_report(test_y, predictions_LR))
print("Confusion Matrix:\n", confusion_matrix(test_y, predictions_LR))


prec = metrics.precision_score(test_y, predictions_LR, average='macro')
rec = metrics.recall_score(test_y, predictions_LR, average='macro')
f1 = metrics.f1_score(test_y, predictions_LR, average='macro')
print(prec)
print(rec)
print(f1)


Fitting training dataset on the Logistic Regression classifier...
Training is complete. Total time:  0.8445051999999578
Making predictions...
- Finished making predictions! Total Time:  0.0010948999999982334
- LR Accuracy Score for C=1: 77.36
              precision    recall  f1-score   support

           0       0.79      0.75      0.77      5058
           1       0.76      0.79      0.78      4942

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000

Confusion Matrix:
 [[3810 1248]
 [1016 3926]]
0.7741338270298863
0.7738386877338215
0.7735695315161608
