Jupyter Notebook For ReadMyMind, A CS 125 @ Illinois MP7 Project by Isaac Park and Mihir Pandya.

Github: https://github.com/ReadMyMind-dev

Proposed method for keyword extraction:

1. Tokenize the text and tag each word with its part of speech. Keep only proper nouns, nouns, adjectives, and verbs.
2. Score the nouns and proper nouns based on the number of surrounding adjectives and verbs (more description tends to indicate importance).
3. Record frequency of each word; only keep words that occur above a certain number of times (frequency threshold). These will be our "keywords".
4. Put the list of keywords for each tweet into the 'Keywords' column of the dataframe.

Ideas for graphing the keywords/frequency/likes/retweets relationships:

1. Bar graph whose y-axis is the avg. # of likes for a given keyword and x-axis is the list of the top ~30 most occurring keywords.

2. Bar graph whose y-axis is the avg. sentiment value for a given keyword and x-axis is the list of the top ~30 most occurring keywords.

3. Simple pie chart to analyze the main content areas that the given Twitter account comments on.


Written below is the function/method oriented version of the program. Different parts of the program are organized into multiple functions.

Isaac - Twitter Data Collection & Organization, Visualizations

Mihir - Keyword Extraction & Refinement, Visualizations

In [113]:
import os
from datetime import datetime

import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
import tweepy
from textblob import TextBlob
from textblob import Word

# SECURITY: API credentials do not belong in source control. Every value is read
# from the environment first; the literal fallbacks are the credentials that were
# already published with this notebook and are kept only so it still runs as-is.
# They should be revoked/rotated and the fallbacks deleted.
plotly.tools.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', 'Parkkeo1'),
    api_key=os.environ.get('PLOTLY_API_KEY', 'qNMv1LXTLzwfGZ3EyN0U'),
)

# Twitter OAuth handshake; `api` is the module-level client used by tweet_collector().
auth = tweepy.OAuthHandler(
    os.environ.get('TWITTER_CONSUMER_KEY', '1X5fCqPl7yVvYxQjQJwkvavFD'),
    os.environ.get('TWITTER_CONSUMER_SECRET', 'NXbTDPP3HxlXOL5dWdCegEP09odLAkxUWlyRvZqXxtAtdX597G'),
)
auth.set_access_token(
    os.environ.get('TWITTER_ACCESS_TOKEN', '925495606931546112-mn3Hda41LsZhbYAKJtddL7TulRKucuj'),
    os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'lvCFqSLv5YvOGzCINH6JZ5cBI1CEkPKrRioBn5Iuec3Tt'),
)
api = tweepy.API(auth)


def tweet_collector(username, max_tweets=500):
    """Collect a user's recent original tweets into a DataFrame.

    Args:
        username: Twitter screen name whose timeline is fetched.
        max_tweets: Positive upper bound on the number of timeline statuses to
            request. Retweets count toward this bound but are discarded.

    Returns:
        An object-dtype DataFrame with the columns 'Timestamp', 'Likes',
        'Retweets', 'Text', 'Sentences', 'Sentiment_Total' and 'Keywords'.
        Only the first four are filled here; the rest are left empty (NaN).
    """
    columns = ['Timestamp', 'Likes', 'Retweets', 'Text',
               'Sentences', 'Sentiment_Total', 'Keywords']

    # A single user_timeline request returns at most 200 statuses, so asking for
    # 500 in one call silently yields fewer. Cursor pages through the timeline
    # until max_tweets statuses have been seen.
    timeline = tweepy.Cursor(
        api.user_timeline,
        screen_name=username,
        count=200,
        tweet_mode="extended",
    ).items(max_tweets)

    rows = []
    for status in timeline:
        text = status.full_text
        if text[:2] == 'RT':  # skip retweets made by the user
            continue
        # Only original tweets reach this point; the row is built and stored
        # together so a retweet can never re-add (or reference) a stale row.
        rows.append({
            'Timestamp': status.created_at,
            'Likes': status.favorite_count,
            'Retweets': status.retweet_count,
            'Text': text,
        })

    # Build the frame once; passing `columns` keeps the schema even when no
    # tweets were collected.
    tweets_df = pd.DataFrame(rows, columns=columns)
    tweets_df = tweets_df.drop_duplicates(subset='Text')  # just in case, remove any duplicate tweets

    return tweets_df.astype('object')


def _clean_tweet_text(content):
    """Return the tweet text cut off at its first link, with newlines removed."""
    link_start = content.find('http')
    if link_start != -1:
        content = content[:link_start]
    return content.replace('\n', '')


def _parenthesized_spans(text):
    """Return the lowercased text between a "(" and the next ")" for each ")" found."""
    spans = []
    search_from = 0
    close_at = text.find(")", search_from)
    while close_at >= 0:
        open_at = text.find("(", search_from)
        # A ")" with no "(" before it has nothing to contribute.
        if 0 <= open_at < close_at:
            spans.append(text[open_at:close_at].lower())
        search_from = close_at + 1
        close_at = text.find(")", search_from)
    return spans


def _key_phrases(blob):
    """Return the phrases of a tweet that are candidates for keyword extraction."""
    phrases = list(blob.noun_phrases)

    # Drop noun phrases that are contained in another, different noun phrase.
    nested = [p for p in phrases
              if any(p != other and p in other for other in phrases)]
    phrases = [p for p in phrases if p not in nested]

    # Add singular nouns that directly follow a possessive pronoun ("my dog")
    # unless the word already appears inside one of the kept phrases.
    tags = blob.tags
    for (_, previous_tag), (word, tag) in zip(tags, tags[1:]):
        if tag == 'NN' and previous_tag == 'PRP$':
            if not any(word in phrase for phrase in phrases):
                phrases.append(word)

    # Ignore phrases that only occur as parenthetical asides.
    asides = _parenthesized_spans(str(blob))
    return [p for p in phrases if not any(p in aside for aside in asides)]


def _keywords_in_phrase(phrase):
    """Return the normalized single-word keywords contained in a phrase."""
    keywords = []
    for token in TextBlob(phrase).words:
        # singularize + lemmatize so singular/plural variants share one entry
        candidate = Word(token.strip()).singularize().lemmatize()
        # Cheap filters first: this also avoids tagging strings for which the
        # tagger returns nothing, which previously raised IndexError.
        if not candidate.isalpha() or len(candidate) <= 2:
            continue
        tagged = TextBlob(candidate).tags
        # keep nouns (NN/NNS) and non-3rd-person present-tense verbs (VBP)
        if tagged and tagged[0][1] in ('NN', 'NNS', 'VBP'):
            keywords.append(candidate)
    return keywords


def keyword_data(tweets_df):
    """Extract keywords from tweets and aggregate per-keyword statistics.

    The frame is updated in place: 'Text' is replaced by the cleaned text,
    'Sentiment_Total' by the tweet's TextBlob subjectivity score, 'Sentences'
    by a {sentence: subjectivity} dict, and 'Timestamp' by a plain datetime.

    NOTE(review): the "sentiment" values are TextBlob *subjectivity*, not
    polarity -- confirm that this is the intended measure.

    Args:
        tweets_df: DataFrame with 'Text', 'Likes', 'Retweets' and 'Timestamp'
            columns, e.g. the result of tweet_collector().

    Returns:
        Dict mapping keyword -> [frequency, avg. likes, avg. retweets,
        avg. sentiment]; the two like/retweet averages are truncated to int.
    """
    keywords_dict = {}
    texts, tweet_scores, sentence_scores, timestamps = [], [], [], []

    rows = zip(tweets_df['Text'], tweets_df['Likes'],
               tweets_df['Retweets'], tweets_df['Timestamp'])
    for raw_text, likes, retweets, timestamp in rows:
        content = _clean_tweet_text(raw_text)
        blob = TextBlob(content)
        subjectivity = blob.sentiment.subjectivity

        texts.append(content)
        tweet_scores.append(subjectivity)
        sentence_scores.append(
            {str(sent): sent.sentiment.subjectivity for sent in blob.sentences})
        # pandas Timestamp -> datetime; values that are already plain are kept
        timestamps.append(timestamp.to_pydatetime()
                          if hasattr(timestamp, 'to_pydatetime') else timestamp)

        for phrase in _key_phrases(blob):
            for keyword in _keywords_in_phrase(phrase):
                stats = keywords_dict.setdefault(keyword, [0, 0, 0, 0])
                stats[0] += 1
                stats[1] += likes
                stats[2] += retweets
                stats[3] += subjectivity

    # Write whole columns back instead of `df.iloc[i][col] = value`: chained
    # row assignment may modify a temporary copy and never reach the frame.
    # dtype=object keeps dicts and datetime objects exactly as stored.
    for column, values in (('Text', texts),
                           ('Sentiment_Total', tweet_scores),
                           ('Sentences', sentence_scores),
                           ('Timestamp', timestamps)):
        tweets_df[column] = pd.Series(values, index=tweets_df.index, dtype=object)

    for stats in keywords_dict.values():  # turn the running totals into averages
        occurrences = stats[0]
        stats[1] = int(stats[1] / occurrences)
        stats[2] = int(stats[2] / occurrences)
        stats[3] = stats[3] / occurrences

    return keywords_dict


def to_dataframe(keywords_dict, top_n=30):
    """Turn the per-keyword statistics into a frequency-sorted DataFrame.

    Args:
        keywords_dict: Mapping of keyword -> [frequency, avg. likes,
            avg. retweets, avg. sentiment].
        top_n: Number of most frequent keywords to keep.

    Returns:
        DataFrame with the columns 'Keywords', 'Frequency', 'Avg. Likes',
        'Avg. Retweets' and 'Avg. Sentiment', sorted by descending frequency
        and limited to `top_n` rows. An empty mapping yields an empty frame
        that still has these columns.
    """
    stat_columns = ['Frequency', 'Avg. Likes', 'Avg. Retweets', 'Avg. Sentiment']
    # Naming the columns at construction time (instead of assigning them
    # afterwards) keeps an empty mapping from raising a length-mismatch error.
    keywords_df = pd.DataFrame.from_dict(keywords_dict, orient='index', columns=stat_columns)
    keywords_df.index.name = 'Keywords'
    keywords_df = keywords_df.reset_index()
    keywords_df = keywords_df.sort_values('Frequency', ascending=False, na_position='last')

    return keywords_df.head(top_n)


def bar_chart1(keywords_df):
    """Plot how often each keyword occurs as a Plotly bar chart.

    Args:
        keywords_df: DataFrame with 'Keywords' and 'Frequency' columns.

    Returns:
        The result of py.iplot for the generated figure.
    """
    trace = go.Bar(
        x=keywords_df['Keywords'].tolist(),
        y=keywords_df['Frequency'].tolist(),
    )

    layout = go.Layout(
        height=400,
        width=900,
        autosize=False,
        title='Frequencies Of The Top Keywords In The Last 500 Tweets Made By User',
        xaxis=dict(title='Keyword'),
        yaxis=dict(title='# of Occurrences'),
    )

    fig = go.Figure(data=[trace], layout=layout)

    # A chart-specific filename: a name shared with the other bar charts makes
    # every upload overwrite the same hosted plot.
    return py.iplot(fig, filename='keyword-frequency-bar')


def bar_chart2(keywords_df):
    """Plot the average number of likes per keyword as a Plotly bar chart.

    Args:
        keywords_df: DataFrame with 'Keywords' and 'Avg. Likes' columns.

    Returns:
        The result of py.iplot for the generated figure.
    """
    trace = go.Bar(
        x=keywords_df['Keywords'].tolist(),
        y=keywords_df['Avg. Likes'].tolist(),
    )

    layout = go.Layout(
        height=400,
        width=900,
        autosize=False,
        title='Avg. Likes For The Top 30 Keywords By User',
        xaxis=dict(title='Keyword'),
        yaxis=dict(title='Average # of Likes'),
    )

    fig = go.Figure(data=[trace], layout=layout)

    # A chart-specific filename: a name shared with the other bar charts makes
    # every upload overwrite the same hosted plot.
    return py.iplot(fig, filename='keyword-avg-likes-bar')


def bar_chart3(keywords_df):
    """Plot the average sentiment value per keyword as a Plotly bar chart.

    Args:
        keywords_df: DataFrame with 'Keywords' and 'Avg. Sentiment' columns.

    Returns:
        The result of py.iplot for the generated figure.
    """
    trace = go.Bar(
        x=keywords_df['Keywords'].tolist(),
        y=keywords_df['Avg. Sentiment'].tolist(),
    )

    layout = go.Layout(
        height=400,
        width=900,
        autosize=False,
        title='Avg. Sentiment For The Top 30 Keywords By User',
        xaxis=dict(title='Keyword'),
        yaxis=dict(title='Average Sentiment Value'),
    )

    fig = go.Figure(data=[trace], layout=layout)

    # A chart-specific filename: a name shared with the other bar charts makes
    # every upload overwrite the same hosted plot.
    return py.iplot(fig, filename='keyword-avg-sentiment-bar')

def pie_graph(keywords_df):
    """Draw a pie chart of the first ten keywords, sized by their frequency.

    Args:
        keywords_df: DataFrame with 'Keywords' and 'Frequency' columns.

    Returns:
        The result of py.iplot for the generated figure.
    """
    top_ten = keywords_df.head(10)  # first ten rows, in the frame's current order

    slice_trace = go.Pie(
        labels=top_ten['Keywords'],
        values=top_ten['Frequency'],
        hoverinfo='label+percent+name',
    )
    chart_layout = go.Layout(
        height=600,
        width=600,
        autosize=False,
        title="User's Most Covered Topics",
    )

    return py.iplot(go.Figure(data=[slice_trace], layout=chart_layout), show_link=False)

def ReadMyMind_main(username):
    """Run the whole pipeline for one account and return its keyword table.

    Tweets are collected, keywords and their statistics are extracted, and the
    statistics are converted into a DataFrame.
    """
    tweets = tweet_collector(username)
    keyword_stats = keyword_data(tweets)
    return to_dataframe(keyword_stats)



In [114]:
# Run the full pipeline once; `results` is the keyword table passed to each chart call.
username = 'realDonaldTrump' # working example: President Trump
results = ReadMyMind_main(username)

In [115]:
# Bar chart of keyword frequencies.
bar_chart1(results)

In [116]:
# Bar chart of average likes per keyword.
bar_chart2(results)

In [117]:
# Bar chart of average sentiment value per keyword.
bar_chart3(results)

In [118]:
# Pie chart of the first ten keywords by frequency.
pie_graph(results)