In [None]:

#===================================================
# - Takes the pulled data and preprocesses it
# - Creates a Skip-Gram model whose word vectors have length 100
# - Saves our model to .bin files
# - Saves various text data to pandas data frames
#===================================================

import codecs       #
import json         # JSON module: Converts python dictionaries into strings
import os           # Reads the Tumblr API credentials from environment variables
import re           # Regular Expression library
import warnings     # Lets me ignore warning messages

import gensim       # Library for Word Embedding / Word2Vec
import imblearn     # Our testing sample is umbalanced, methods to create dummy samples
import matplotlib   # PLotting library
import nltk         # (Natural Language Processing Tool Kit) tokenizes sentences for me
import numpy as np  # 
import pandas as pd # Used to create data frames with rows and columns
import pytumblr     # Tumblr API library
import sklearn      # More Machine Learning Algorithms

from gensim.models import Word2Vec
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from numpy import array
from sklearn import svm, datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels


# App's credentials to use the Tumblr API library.
# Tokens and secrets are sensitive, so they are read from environment variables instead of
# being pasted into the notebook (where they could end up in version control or shared copies).
# The masked placeholder is only a fallback that keeps this cell runnable; API calls will not
# authenticate until the four TUMBLR_* variables are set.
_MASKED_CREDENTIAL = '**************************************************'
CONSUMER_KEY = os.environ.get('TUMBLR_CONSUMER_KEY', _MASKED_CREDENTIAL)
CONSUMER_SECRET = os.environ.get('TUMBLR_CONSUMER_SECRET', _MASKED_CREDENTIAL)
OAUTH_TOKEN = os.environ.get('TUMBLR_OAUTH_TOKEN', _MASKED_CREDENTIAL)
OAUTH_SECRET = os.environ.get('TUMBLR_OAUTH_SECRET', _MASKED_CREDENTIAL)

# Authenticates the app using the Tumblr credentials
client = pytumblr.TumblrRestClient(
    CONSUMER_KEY,
    CONSUMER_SECRET,
    OAUTH_TOKEN,
    OAUTH_SECRET,
)


# Removes warning messages from output.
# NOTE: this also hides deprecation warnings from pandas / gensim / sklearn.
warnings.filterwarnings(action = 'ignore')

def print2(o):
    """Pretty-print ``o`` as JSON, indenting one space per nesting level.

    Useful for nested API responses or lists that are unreadable when printed
    on a single line. ``o`` is handed to ``json.dumps`` unchanged, so it has to
    be JSON-serialisable.
    """
    formatted = json.dumps(o, indent=1)
    print(formatted)

# Print options for data frames
pd.set_option('display.max_rows', None)       # Prints all rows
#pd.set_option('display.max_columns', None)    # Prints all columns
pd.set_option('display.width', None)          # Doesn't allow the text to wrap around underneath the bottom row
# None means "no truncation". The old spelling (-1) is deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth', None)   # Prints an entire sentence/paragraph


In [None]:
#=========================================================================================================

#****
# Pulls the content from posts and records it to a list (does not pull images, only text)
# "blog_post" is a list-of-dictionaries, each dictionary is a post; 'blog_post' is created using tagSearch()
#
# %% Returns A list of strings (sentences)
#****

def pullPost(blog_post):
    """Collect the text of every post in ``blog_post``.

    Parameters
    ----------
    blog_post : iterable
        Posts as dictionaries. Two keys are read from each post:
        ``'summary'`` (a string) and ``'trail'`` (a list of dictionaries, each
        of which may carry a ``'content_raw'`` string).

    Returns
    -------
    list
        All summaries first (in post order), followed by every
        ``'content_raw'`` value (in post order, then trail order).

    Notes
    -----
    Extraction is best-effort: items that are not dictionaries, and posts or
    trail entries that lack the expected key, are skipped rather than raising.
    Each trail entry is emitted exactly once. The previous implementation kept
    appending to a shared temporary list, so the trails of earlier posts were
    emitted again for every later post.
    """
    raw_content = []

    # Only dictionaries can carry the keys we need; anything else is ignored
    posts = [post for post in blog_post if isinstance(post, dict)]

    # Add each blog post's "summary" to the list
    for post in posts:
        if 'summary' in post:
            raw_content.append(post['summary'])

    # Add each blog post's "content_raw" entries (found inside its "trail") to the list
    for post in posts:
        trail = post.get('trail') or []
        if not isinstance(trail, (list, tuple)):
            continue
        for entry in trail:
            if isinstance(entry, dict) and 'content_raw' in entry:
                raw_content.append(entry['content_raw'])

    # A list-of-strings (sentences)
    return raw_content

#=========================================================================================================

In [None]:
#=========================================================================================================

#****
# - Takes raw data and PreProcesses it.
#   Cleans unwanted text and each sentence is represented as its own list-of-words
# - 'list_stringsX' is a list-of-strings (sentences)
#
# %% Returns a list-of-lists; each inner-list represents a single split-sentence
#****
def preProc(list_stringsX):
    """Clean raw text and split every entry into a list of words.

    Parameters
    ----------
    list_stringsX : iterable of str
        Raw sentences / posts. The input is only read, never modified.

    Returns
    -------
    list of list of str
        One inner list per input entry, in the same order, holding the
        lower-cased words that survive the clean-up. An entry that is cleaned
        down to nothing yields an empty inner list, so the output always has
        as many items as the input.
    """
    # (pattern, replacement) pairs, applied to each entry in this exact order
    cleanup_steps = (
        (r'http\S+', ' '),   # Removes URLs ("https..." is covered by this pattern too)
        (r"'", ''),          # Removes apostrophes
        (r"’", ''),          # Removes apostrophes in a different font
        (r",", ''),          # Removes commas
        (r'@\w+', ''),       # Removes strings prefixed with '@'
        (r'\s#\w+', ''),     # Removes '#' and anything connected to it (when preceded by whitespace)
        (r'\W', ' '),        # Replaces every other special character with a space
        # Removes digits only. The surrounding whitespace must be kept; removing it too
        # glued neighbouring words together ("flight 23 delayed" -> "flightdelayed").
        (r'\d', ''),
        (r'^b\s+', ''),      # Removes a prefixed 'b'
        (r'((?<=^)|(?<= )).((?=$)|(?= ))', ''),  # Removes single characters standing alone
        (r'\s+', ' '),       # Replaces multiple spaces with one space
    )

    # A list-of-lists, each inner list holds the words of its respective sentence
    list_sentence = []
    for raw_text in list_stringsX:
        text = raw_text.lower()      # Lowercases all alpha-characters
        for pattern, replacement in cleanup_steps:
            text = re.sub(pattern, replacement, text)
        list_sentence.append(text.split())

    return list_sentence

#=========================================================================================================

In [None]:
#=========================================================================================================

#****
# - Takes the list-of-words and its corresponding set of
#   word-vectors and converts each sentence into a large vector
# - 'split_sent' is a list-of-lists (sentence-words)
# - 'vectList' is the corresponding Skip Gram vector model to the passed 'split_sent'
#
# %% Returns a list-of-vectors; each vector represents a single sentence
#****
def makeMatrix(split_sent, vectList, max_words=20, dims_per_word=10):
    """Turn every tokenised sentence into one fixed-length, flat vector.

    For each sentence, the first ``dims_per_word`` components of each word's
    vector are concatenated, for at most ``max_words`` words. The result is
    cut or zero-padded to exactly ``max_words * dims_per_word`` values
    (200 with the defaults).

    Parameters
    ----------
    split_sent : sequence of sequences of str
        Sentences, each one a list of words.
    vectList : mapping-like
        Word -> vector lookup. Either something indexable by word directly
        (e.g. a dict or gensim ``KeyedVectors``) or a trained gensim
        ``Word2Vec`` model, in which case its ``.wv`` attribute is used.
    max_words : int, optional
        Number of leading words of a sentence that contribute (default 20).
    dims_per_word : int, optional
        Number of leading vector components taken per word (default 10).

    Returns
    -------
    list of list
        One vector per sentence, each of length ``max_words * dims_per_word``.

    Raises
    ------
    KeyError
        If one of the first ``max_words`` words of a sentence has no vector.
    """
    # Indexing a Word2Vec model directly (model[word]) is deprecated in gensim 3.x and
    # removed in gensim 4; the vectors live under `.wv`. Plain mappings have no `.wv`.
    vectors = getattr(vectList, 'wv', vectList)
    target_len = max_words * dims_per_word

    # This will hold all the sentence-vectors
    vectorMatrix = []

    for sentence in split_sent:
        sentence_vector = []

        # Words past `max_words` would be sliced off below anyway, so they are not looked up
        for word in sentence[:max_words]:
            sentence_vector.extend(vectors[word][0:dims_per_word])

        # Pads with 0.0s if too short; or slices the list to exactly `target_len`
        sentence_vector = sentence_vector[:target_len] + [0.0] * (target_len - len(sentence_vector))
        vectorMatrix.append(sentence_vector)

    return vectorMatrix

#=========================================================================================================

In [None]:

#****
# We're borrowing a set of predefined data from a GitHub repository
# It will be used to train our sentiment prediction model
#****

# Airline tweets hosted on GitHub, read straight from the raw CSV URL
# into the pandas data frame 'GH_data_frame'
AirlineData = "https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv"
GH_data_frame = pd.read_csv(AirlineData)

# Only two columns are kept, selected by position: column 1 holds the sentiments
# and column 10 holds the tweets; the rest of the data frame is not needed
Asentiment = GH_data_frame.iloc[:, 1].values
Atweets = GH_data_frame.iloc[:, 10].values


In [None]:

#****
# Saves more recent data to "TumData3.txt"; "TumData" and "TumData2" has old news
#****

# Holds the textual content of the pulled posts
currentText = []

# Initiates a search in Tumblr, finding the top 20 text-based blog posts for the specified topic
currentText.extend(pullPost(client.tagged('Macintosh', filter = 'text')))

# Overwrites old posts with newly pulled posts in the current text data file.
# - `with` closes the file even if a write fails.
# - Every post is terminated with a newline: the file is read back line by line later on,
#   so without a separator the end of one post would be glued to the start of the next.
with codecs.open('./txtFiles/TumData3.txt', 'w', "utf-8") as file3:
    for line in currentText:
        file3.write(line if line.endswith('\n') else line + '\n')


In [None]:

#****
# Takes data from myself and my other 2 teammates and PreProcesses them, then combines them all together
# Keeps each post intact by saving each one as its own list-of-words
#****

# This will hold all of our data as a list-of-lists; list-of-split-sentences (split-sentences = list-of-words)
all_data1 = [] # Before removing trivial words (Used to train the Skip-Gram model)
all_data2 = [] # After removing trivial words  (Used to train the RandomForest Classifier model)

# This will only hold the data from the social networks
our_data1 = [] # Before removing trivial words
our_data2 = [] # After removing trivial words

# Reads every .txt file, keeping only the lines that are not blank.
# `with` closes each file as soon as it has been read (the handles used to stay open).
with open("./txtFiles/TweetData.txt", "r", encoding="utf8") as twit:
    twit_txt = [x for x in twit if x.strip()]
with open("./txtFiles/RedData.txt", "r", encoding="utf8") as redd:
    redd_txt = [x for x in redd if x.strip()]
with open("./txtFiles/TumData.txt", "r", encoding="utf8") as tum:
    tum_txt = [x for x in tum if x.strip()]
with open("./txtFiles/TumData2.txt", "r", encoding="utf8") as tum2:
    tum2_txt = [x for x in tum2 if x.strip()]
with open("./txtFiles/TumData3.txt", "r", encoding="utf8") as tum3:
    tum3_txt = [x for x in tum3 if x.strip()]
with open("./txtFiles/TumData4.txt", "r", encoding="utf8") as tum4:
    tum4_txt = [x for x in tum4 if x.strip()]


# Takes Twitter text file and PreProcesses the data (provided by my teammate)
tweet_data = preProc(twit_txt)

# Takes Reddit text file and PreProcesses the data (provided by my other teammate)
red_data = preProc(redd_txt)

# Takes my four Tumblr text files and PreProcesses the data
tum_data = preProc(tum_txt)
tum_data2 = preProc(tum2_txt)
tum_data3 = preProc(tum3_txt)
tum_data4 = preProc(tum4_txt)


# PreProcesses the GitHub's airline data
airline_data1 = preProc(Atweets)
airline_data2 = []


# Combines all our data (excluding the data from GitHub) for sentiment analysis
our_data1 = tweet_data + red_data + tum_data + tum_data2 + tum_data3 + tum_data4

# Combines all our data (including the data from GitHub) for machine learning
all_data1 = airline_data1 + our_data1

# NOTE: the former "Removes '\n'" filters were dropped. They compared each *list of words*
# against the string '\n', which is never equal, so they removed nothing. Entries must not be
# dropped here anyway: these lists are later paired by position with the original posts.


In [None]:

#****
# Takes the extracted data and removes "stop-words"/trivial words from the textual data
#****

# List of trivial-words or "stop-words": prepositions, pronouns, articles, anything with little contextual meaning
stopWords = ['to','and', 'is', 'a', 'of', 'on', 'the', 'for', 'he','did','was',
             'in', 'with', 'about', 'this', 'are','from','his','that','have',
             'they', 'by','has', 'so', 'be', 'will', 'do','also','it', 'but',
             'been','if', 'but', 'you', '...', 'an', 'as', 'at', 'comment',
             'your', 'like', 'what', 'i', 'my','com']

# Set copy of the list: membership tests are done once per word, and a set makes each one O(1)
stopWordSet = set(stopWords)

# Each result list is rebuilt from scratch instead of being appended to. Appending into lists
# created in an earlier cell duplicated every entry whenever this cell was run a second time.

# Removes "stop-words" from all_data
all_data2 = [[x for x in item if x not in stopWordSet] for item in all_data1]

# Removes "stop-words" from our_data
our_data2 = [[x for x in item if x not in stopWordSet] for item in our_data1]

# Removes "stop-words" from airline_data
airline_data2 = [[x for x in item if x not in stopWordSet] for item in airline_data1]


In [None]:

#============================================================================
#   min_count = Ignores all words with total frequency lower than value
# vector_size = length of each word's vector (this parameter is called 'size' before gensim 4.0)
#      window = represents the distance of neighboring words to train the model
#          sg = training algorithm: CBOW (Default) or Skip Gram (1)
#============================================================================

# Skip Gram model, word vectors have a length of 100.
# gensim 4.0 renamed the 'size' parameter to 'vector_size'; passing the wrong spelling raises
# a TypeError, so the new name is tried first and the old one is used as a fallback.
_w2v_params = dict(min_count = 1, window = 5, sg = 1)
try:
    model100 = gensim.models.Word2Vec(all_data1, vector_size = 100, **_w2v_params)
except TypeError:
    model100 = gensim.models.Word2Vec(all_data1, size = 100, **_w2v_params)

# Saves Word2Vec object ("model100") to a .bin file (to be distributed elsewhere)
model100.save('model100.bin')


In [None]:

#****
# Saves all our textual data to different data frames for later usage
#****

# Holds all the unprocessed text data.
# The non-blank lines read from the .txt files earlier ('twit_txt', 'redd_txt', ...) are reused:
# re-opening the six files left six more handles unclosed and produced the very same lists.
# Concatenating builds a NEW list, so 'twit_txt' is no longer silently extended with all the
# other sources (the old 'posts_data = twit_txt' made both names point at one list).
posts_data = twit_txt + redd_txt + tum_txt + tum2_txt + tum3_txt + tum4_txt

# Collapses every run of whitespace (including the trailing '\n' of each line) into one space.
# No entry is dropped, which keeps 'posts_data' aligned by position with 'our_data2'.
posts_data = [re.sub(r'\s+', ' ', x) for x in posts_data]

# Saves all our unprocessed post data to a data frame (in reverse order)
our_og_df = pd.DataFrame(list(zip(posts_data[::-1])), columns=['Our_posts'])

# Saves all our preprocessed data to a data frame (in reverse order).
# zip() wraps every word-list in a 1-tuple so that the whole list lands in a single cell.
our_pre_df = pd.DataFrame(list(zip(our_data2[::-1])), columns=['Our_Preprocess'])

# Saves the usable GitHub data to a data frame (split and sentiments)
small_air_df = pd.DataFrame(list(zip(airline_data2, Asentiment)), columns=['Tweet', 'Sentiment'])

# We'll also save the original posts from the airline GitHub data (just the tweets)
og_air_df = pd.DataFrame(list(zip(GH_data_frame.iloc[:, 10].values)), columns=['Github_posts'])


In [None]:

#****
# Saves the processed and unprocessed texts to their own files
#****

# Written next to the notebook, without the index column:
#   our_og.csv  - the original (unprocessed) posts
#   our_pre.csv - the preprocessed posts
our_pre_df.to_csv('our_pre.csv', index=False)
our_og_df.to_csv('our_og.csv', index=False)


In [None]:

#=====================================================================================
# - Takes all of our data and converts each sentence into their vector representations
#=====================================================================================

# Airline data, taken by column name (these are numpy arrays, not lists)
small_air_posts = small_air_df['Tweet'].values   # Split airline posts
sentiment = small_air_df['Sentiment'].values     # Airline sentiments

# Our own split posts
our_pre_posts = our_pre_df['Our_Preprocess'].values
    

In [None]:

#****
# Calls makeMatrix() to convert 'small_air_posts', and 'our_pre_posts' into their vector representations  
#****

# One sentence-vector per post, for each of the two data sets; both use the same Skip-Gram model
realMatrix = makeMatrix(split_sent=our_pre_posts, vectList=model100)    # Used for evaluating our social network data
testMatrix = makeMatrix(split_sent=small_air_posts, vectList=model100)  # Used for training


In [None]:

#****
# Converts the matrices into data frames
#****

# Each row pairs one airline sentence-vector with its sentiment label
air_rows = list(zip(testMatrix, sentiment))
testMatrix_df = pd.DataFrame(air_rows, columns=['Air_vectors', 'Sentiment'])

# zip() wraps every sentence-vector in a 1-tuple so the whole vector lands in a single cell
our_rows = list(zip(realMatrix))
realMatrix_df = pd.DataFrame(our_rows, columns=['Our_vectors'])


In [None]:

#==============================================================================================
# Takes the vector matrices from the previous step and uses them to train our prediction model
# Then it will use the trained model to predict the sentiments of our data sets
#==============================================================================================

# The original social network posts, as a plain Python list
our_og_posts = our_og_df['Our_posts'].values.tolist()

# The airline sentiments, also as a plain Python list
sentiment = testMatrix_df['Sentiment'].values.tolist()


In [None]:

# Converts the sentiment Strings into integers for sklearn.
# The list is updated in place; a value that is not one of the three labels is kept as it is.
label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}
sentiment[:] = [label_to_id.get(item, item) for item in sentiment]


In [None]:

#****
# We have too many negative samples, so this creates dummies of positive and neutral samples
#****

# 'fit_resample' is the current imbalanced-learn API; the old name 'fit_sample' was removed.
# NOTE(review): oversampling happens here, BEFORE the train/test split further down, so synthetic
# samples derived from rows that end up in the test set can leak into the training set and
# inflate the reported accuracy. Consider splitting first and resampling the training part only.
smote = SMOTE(random_state=42)
testMatrix, sentiment = smote.fit_resample(testMatrix, sentiment)


In [None]:

#****
# Allocate 80% of airline data to train the classifier, and 20% for testing
# Also, set forth some parameters of the training model 
#****

#--------------------------------------------------------------
# Result of the split (test_size=0.2 -> 80% train / 20% test):
#   Vect_train : vectors used to train the classifier
#   Vect_test  : vectors held back to test the predictions
#   Sent_train : labels belonging to Vect_train
#   Sent_test  : labels belonging to Vect_test
#--------------------------------------------------------------
Vect_train, Vect_test, Sent_train, Sent_test = train_test_split(
    testMatrix,
    sentiment,
    test_size=0.2,
    random_state=0,
)

#****
# - RandomForest builds decision trees on randomly selected data samples
#   (100 trees here, see n_estimators), gets a prediction from each tree
#   and selects the best solution.
#****
text_classifier_test = RandomForestClassifier(random_state=0, n_estimators=100)

# Fits the classifier on the training set
text_classifier_test.fit(Vect_train, Sent_train)


In [None]:

#****
# Make predictions on the airline test set
# And determine accuracy of prediction model
#****

# Make predictions on the test set 'Vect_test'
testPredict = text_classifier_test.predict(Vect_test)

# Compares 'predictions' to 'Sent_test' to see how many were correct
# Down hill diagonal represents the correct amount of predictions made
cMatrix = confusion_matrix(Sent_test,testPredict)

# Converts each value in the matrix into a fraction of its row (every row then sums to 1)
cMatrix = cMatrix.astype('float') / cMatrix.sum(axis=1)[:, np.newaxis]

# Convert this list into a numpy array of integers.
# The builtin 'int' is used because the alias 'np.int' no longer exists in current NumPy.
Sent_test = np.asarray(Sent_test, dtype=int)


In [None]:

#****
# Print our test results as a confusion matrix
#****

# 
# Figure and axes that will hold the confusion-matrix heat map
boxPlot, ax = plt.subplots()

# The tick labels for the plot
sent_label = ['negative','neutral','positive']

# Convert to a numpy array so it can be indexed with an array of class ids below.
# The builtin 'str' is used because the alias 'np.str' no longer exists in current NumPy.
sent_label = np.asarray(sent_label, dtype=str)

# Keeps only the labels of the classes that occur in 'Sent_test' or 'testPredict'
sent_label = sent_label[unique_labels(Sent_test, testPredict)] 


# (cmap = plt.cm.Greens) represents the colors/shades used for the plot
im = ax.imshow(cMatrix, interpolation='nearest', cmap=plt.cm.Greens)
ax.figure.colorbar(im, ax=ax)

# Configures the labels and tick marks of the plot
ax.set(xticks=np.arange(cMatrix.shape[1]), # cMatrix.shape[1] = number of columns = x-axis
       yticks=np.arange(cMatrix.shape[0]), # cMatrix.shape[0] = number of rows = y-axis
       
       # Label the ticks with their respective sentiment
       xticklabels=sent_label, yticklabels=sent_label,
       
       # Label the sides
       title='Confusion Matrix',
       ylabel='True Sentiments',
       xlabel='Predicted Sentiments')

# Rotates the x-axis tick labels to be 45 degrees and properly aligns them with the tick mark
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# cMatrix.max() = largest value in the cMatrix
# thresh means threshold; half of max() is used to decide the text color of each cell
thresh = cMatrix.max() / 2.

# Writes every value of the confusion matrix into its cell of the plot
for i in range(cMatrix.shape[0]):
    
    for j in range(cMatrix.shape[1]):
        # Places each item of 'cMatrix' into the plot
        ax.text(j, i, format(cMatrix[i, j], '.2f'),
                # Horizontal & Vertical alignment of text is centered
                ha="center", va="center",
                
                # White text for values above 'thresh', black text otherwise,
                # so the number stays readable on its cell's shade
                color="white" if cMatrix[i, j] > thresh else "black")

# The overall accuracy of our test
print('• Overall accuracy of the prediction/classifier model: ', accuracy_score(Sent_test, testPredict))
print()

# Tightens the layout of the figure that holds the confusion matrix
boxPlot.tight_layout()
print()


In [None]:

#****
# Make predictions on 'realMatrix'; otherwise known as 'Our data'
#****

# Predicts the sentiment of our data set 'realMatrix' and turns the resulting numpy array
# into a plain list, so that its items can be swapped for values of another type
realPredict = text_classifier_test.predict(realMatrix).copy().tolist()

# Converts the integer class ids back into Strings/Sentiments;
# a value that is not one of the three ids is kept as it is
id_to_label = {0: 'negative', 1: 'neutral', 2: 'positive'}
realPredict = [id_to_label.get(item, item) for item in realPredict]

# Creates a data frame of the original posts and their predicted sentiments
prediction_rows = list(zip(our_og_posts, realPredict))
ourPredict = pd.DataFrame(prediction_rows, columns=['Original Posts', 'Predicted Sentiment'])


In [None]:

# Displays every row of the results (head() is asked for as many rows as the frame has)
ourPredict.head(len(ourPredict))
