In [98]:
import re     # Regular Expression library
import json
import numpy as np
import nltk   # Natural Language Processing library
import pandas as pd
from matplotlib import pyplot as plt
from nltk.stem import PorterStemmer
import sklearn.linear_model as sk
from sklearn.decomposition import PCA 
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
import string

In [39]:
# This function performs the basic natural language processes of removal of punctuations, tokenization,
# removal of stop words and stemming.

def tokenize(data):

    ps = PorterStemmer()
    stop_words = set(nltk.corpus.stopwords.words('english'))
    temp = data
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    temp = regex.sub(' ', temp)
    temp = "".join(b for b in temp if ord(b) < 128)
    words = nltk.word_tokenize(temp)
    no_stop_words = [w.lower() for w in words if not w in stop_words]
    stemmed = [ps.stem(item) for item in no_stop_words]

    return stemmed

In [85]:
# This function uses regular expressions to map the locations which contain any location.
def location(data):
    temp = data[(data.location.str.contains(r'[.]+ WA$'))
            | (data.location.str.contains(r'[.]+ MA$'))
            | (data.location.str.contains('Boston'))
            | (data.location.str.contains('Seattle'))
            | (data.location.str.contains(r'[.]+ Washington\s'))
            | (data.location.str.contains('Massachusetts'))]
    return temp

In [89]:
# Generates the variable 1 for any tweet having Seattle or Washington in tweet otherwise 0.
def maps(data):
    targets = []
    for location in data.location.apply(lambda x: x.encode('utf-8').strip()):
        if (b'[.]+ WA$' in location) or (b'Seattle' in location) or (b'[.]+ Washington\s' in location):
            targets.append(1)
        else:
            targets.append(0)
    return np.array(targets)

In [87]:
# It is important to balance the dataset as if one class has more examples then the model predicts everything to be of majority class.
def balance(data, targets):
    new_data = data.copy()
    if (len(targets[targets==1])) > (len(targets[targets==0])):
        points_needed = len(targets[targets==1]) - len(targets[targets==0])
        indices = np.where(targets == 0)
    else:
        points_needed = len(targets[targets==0]) - len(targets[targets==1])
        indices = np.where(targets == 1)

    np.random.shuffle(indices)
    indices = np.resize(indices, points_needed)
    new_data = new_data.append(data.iloc[indices])
    targets_to_add = targets[indices]
    new_targets = np.concatenate([targets, targets_to_add])
    return new_data, new_targets

In [43]:
filename = 'tweets_#superbowl.txt'

# Collect tweets from superbowl
tweets_ = []
with open(filename, 'r', encoding="utf8") as f:
    for row in f:
        jrow = json.loads(row)
        d = {
            'tweet': jrow['title'],
            'location': jrow['tweet']['user']['location']
        }
        tweets_.append(d)
all_data = pd.DataFrame(tweets_)


In [88]:
reduced_data = location(all_data)

In [90]:
all_targets = maps(reduced_data)

In [91]:
data, train_targets = balance(reduced_data, all_targets)

In [92]:
# Use the counter vectorizer and tfidf to transform the training data.
vectorizer = CountVectorizer(analyzer='word', stop_words='english', tokenizer=tokenize)
tfidf_transformer = TfidfTransformer()

model = vectorizer.fit(data.tweet)
train_counts=model.transform(data.tweet)
train_tfidf = tfidf_transformer.fit_transform(train_counts)

  'stop_words.' % sorted(inconsistent))


In [93]:
# Use truncated SVD for dimensionality reduction as it deals better with sparse matrix.
svd = TruncatedSVD(n_components=60, random_state=42)
svd_model = svd.fit(train_tfidf)
train_reduced=svd_model.transform(train_tfidf)

In [94]:
# Scale the data to remove any irregular scales for the features. We used min max scaler in this case because every other 
# scalar was introducing negative values which cannot be processed by a naive bayes model.
min_max_scaler = preprocessing.MinMaxScaler()
train_data = min_max_scaler.fit_transform(train_reduced)

In [95]:
# Used 10 fold cross validation with multinomial Naive Bayes to predict the location from the tweet.

n=10
kf = KFold(n_splits=n, shuffle=True, random_state=42)

accuracy = 0
for i, j in kf.split(train_data):
    X_train, X_test = train_data[i], train_data[j]
    y_train, y_test = train_targets[i], train_targets[j]

    clf = MultinomialNB(alpha=0.1).fit(X_train, y_train)
    bayes_pred = clf.predict(X_test)
    bayes_accuracy = np.mean(bayes_pred == y_test)
    accuracy += bayes_accuracy

print ("Average CV-Accuracy of Multinomial Naive Bayes: " + str(accuracy/k))
print((classification_report(y_test, bayes_pred)))
print ("Confusion Matrix: \n",confusion_matrix(y_test, bayes_pred))
bayes_pred

Average CV-Accuracy of Multinomial Naive Bayes: 0.741293977340489
              precision    recall  f1-score   support

           0       0.71      0.85      0.77      1702
           1       0.80      0.64      0.71      1652

   micro avg       0.74      0.74      0.74      3354
   macro avg       0.75      0.74      0.74      3354
weighted avg       0.75      0.74      0.74      3354

Confusion Matrix: 
 [[1441  261]
 [ 598 1054]]


array([0, 1, 1, ..., 0, 1, 1])

In [96]:
# Used 10 fold cross validation with Logistic Regression and generate the confusion matrix.
accuracy = 0
for i, j in kf.split(train_data):
    X_train, X_test = train_data[i], train_data[j]
    y_train, y_test = train_targets[i], train_targets[j]

    logit = sk.LogisticRegression(solver='lbfgs').fit(X_train, y_train)
    # Only picking the predictions where probability is greater than 0.5
    probabilities = logit.predict(X_test)
    lr_pred = (probabilities > 0.5).astype(int)
    lr_accuracy = np.mean(lr_pred == y_test)
    accuracy += lr_accuracy

print ("Average CV-Accuracy of Logistic Regression: " + str(accuracy/k))
print((classification_report(y_test, lr_pred)))
print ("Confusion Matrix: \n",confusion_matrix(y_test, lr_pred))
lr_pred



Average CV-Accuracy of Logistic Regression: 0.8133870005963029
              precision    recall  f1-score   support

           0       0.76      0.93      0.84      1702
           1       0.91      0.70      0.79      1652

   micro avg       0.82      0.82      0.82      3354
   macro avg       0.84      0.82      0.82      3354
weighted avg       0.84      0.82      0.82      3354

Confusion Matrix: 
 [[1588  114]
 [ 492 1160]]




array([0, 1, 1, ..., 0, 1, 1])

In [97]:
# Used linear svm with 10 fold cross validation.

accuracy = 0
for i, j in kf.split(train_data):
    X_train, X_test = train_data[i], train_data[j]
    y_train, y_test = train_targets[i], train_targets[j]

    linear_SVM = LinearSVC(dual=False, random_state=42).fit(X_train, y_train)
    svm_pred = linear_SVM.predict(X_test)
    svm_accuracy = np.mean(svm_pred == y_test)
    accuracy += svm_accuracy

print ("Average CV-Accuracy of Linear SVM: " + str(accuracy/k))
print((classification_report(y_test, svm_pred)))
print ("Confusion Matrix: \n",confusion_matrix(y_test, svm_pred))
svm_pred

Average CV-Accuracy of Linear SVM: 0.8131484794275492
              precision    recall  f1-score   support

           0       0.76      0.93      0.84      1702
           1       0.91      0.70      0.79      1652

   micro avg       0.82      0.82      0.82      3354
   macro avg       0.84      0.82      0.82      3354
weighted avg       0.84      0.82      0.82      3354

Confusion Matrix: 
 [[1591  111]
 [ 496 1156]]


array([0, 1, 1, ..., 0, 1, 1])

In [101]:
accuracy = 0
for i, j in kf.split(train_data):
    X_train, X_test = train_data[i], train_data[j]
    y_train, y_test = train_targets[i], train_targets[j]

    rf = RandomForestClassifier(n_estimators=50,random_state=42).fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    rf_accuracy = np.mean(svm_pred == y_test)
    accuracy += rf_accuracy

print ("Average CV-Accuracy of Random Forest Classifier: " + str(accuracy/k))
print((classification_report(y_test, rf_pred)))
print ("Confusion Matrix: \n",confusion_matrix(y_test, rf_pred))
rf_pred

Average CV-Accuracy of Linear SVM: 0.6479427549194992
              precision    recall  f1-score   support

           0       0.85      0.92      0.88      1702
           1       0.91      0.83      0.87      1652

   micro avg       0.87      0.87      0.87      3354
   macro avg       0.88      0.87      0.87      3354
weighted avg       0.88      0.87      0.87      3354

Confusion Matrix: 
 [[1564  138]
 [ 283 1369]]


array([0, 1, 1, ..., 1, 1, 1])