In [None]:
# PreSetup
!pip install numpy
!pip install networkx
!pip install matplotlib
!pip install sklearn
!pip install seaborn
!pip install pandas
!pip install nltk
!pip install wordcloud
!pip install tweepy

In [None]:
# Representing data
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# Models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import numpy as np

# Word Tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

# NLTK
import nltk

# Data Representation
import pandas as pd
import string
import csv
import json

# Twitter API
from tweepy import Stream, API, OAuthHandler
from tweepy.streaming import StreamListener

# Utils
from datetime import datetime
from dateutil import parser

import warnings
import re
import os
import ssl

import time
import traceback

In [None]:
# Go to http://apps.twitter.com and create an app.
# The consumer key and secret will be generated for you afterwards.
# Prefer exporting the credentials as environment variables so that secrets are
# never saved inside the notebook; the empty-string fallback keeps the original
# "paste the value here" workflow working.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")

# After the step above, you will be redirected to your app's page.
# Create an access token under the "Your access token" section.
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")

# Configure the maximum number of tweets to collect from the stream
max_number_of_tweets = 1000

# Topic Generator Settings
topics_to_generate = 20
words_per_topic = 7

# Format applied to every tweet's creation time. All fields are zero-padded and
# ordered from year to second, so the formatted strings sort chronologically.
date_format = '%Y-%m-%d %H:%M:%S'

In [None]:
# Keywords handed to the stream filter: only tweets matching them are collected
to_track = ["data", "artificial", "intelligence", "machine", "learning", "event", "detection", "python"]
# Those are the English words that will be omitted when a tweet is cleaned
# (standard stop words plus a few tweet/tokenizer leftovers such as "rt", "http", "amp")
default_stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now", "rt", "http", "wa", "nt", "re", "amp"]

In [None]:
# NLTK Data

# When this Python build exposes an unverified-context factory, install it as
# ssl's default HTTPS context factory before downloading the corpora.
# NOTE(review): this switches off certificate verification for HTTPS contexts
# created through that default factory for the rest of the session - confirm
# that this workaround is still wanted.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Tokenizer models first, then the WordNet data used by the lemmatizer
for _resource in ('punkt', 'wordnet'):
    nltk.download(_resource)

In [None]:
class MemoryListner(StreamListener):
    """Stream listener that buffers the raw payloads it receives in memory.

    At most ``maxNumberOfTweets`` payloads are kept. Once that many have been
    stored, ``on_data`` returns False, which asks the stream to disconnect.
    """

    def __init__(self, maxNumberOfTweets):
        """Create an empty buffer limited to ``maxNumberOfTweets`` payloads."""
        # Let the base listener set up its own state before adding ours.
        super().__init__()

        self.max_tweets = maxNumberOfTweets
        self.tweet_count = 0

        self.tweets = []

    def on_data(self, data):
        """Store one raw payload; return False when no more data is wanted."""
        # Quota already reached (also covers a quota of 0): store nothing.
        if self.tweet_count >= self.max_tweets:
            return False

        self.tweet_count += 1
        self.tweets.append(data)

        # Stop right after the last wanted payload instead of keeping the
        # connection open until one more message arrives just to discard it.
        return self.tweet_count < self.max_tweets

    def on_error(self, status):
        """Print the error status; disconnect when the API reports rate limiting."""
        print(status)

        # 420 is the streaming API's rate-limit response. Returning False stops
        # the stream so that it does not keep reconnecting and extend the
        # back-off. Other statuses keep the original behaviour (return None).
        if status == 420:
            return False

    def get_tweets(self):
        """Return the list of raw payloads collected so far."""
        return self.tweets

    def on_status(self, status):
        """Print the text of a status object.

        NOTE(review): ``on_data`` is overridden above, so this handler is
        presumably never reached through the normal dispatch - confirm.
        """
        print(status.text)

In [None]:
# Create a listener for the Stream
my_listner = MemoryListner(max_number_of_tweets)

# Generate an authenticator
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = API(auth)

# Listen to the Stream for English tweets only (the stop words and the
# lemmatizer used later are English-specific) and restrict it to the keywords
# we want to look for. Without the `languages` filter, tweets in every
# language that happen to contain a tracked keyword would be collected.
my_stream = Stream(auth=api.auth, listener=my_listner)
my_stream.filter(track=to_track, languages=["en"])

In [None]:
# We will use lemmatization. If you want to know more, please visit:
# https://blog.bitext.com/what-is-the-difference-between-stemming-and-lemmatization/
# One shared instance is created here; clean_text uses it as the default
# lemmatizer for every tweet.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# Standard ED Text Cleaning
def clean_text(text, stop_words, extra_words=None):
    """Normalise a piece of text for topic modelling.

    The text is lower-cased, every token is reduced to its ``a-z`` characters,
    the tokens are lemmatized and finally the stop words are dropped.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        stop_words: Iterable of words to remove (list, tuple, set, ...).
        extra_words: Optional iterable of additional words to remove.

    Returns:
        The remaining tokens joined by single spaces (possibly an empty string).
    """
    # Build the exclusion set once per call: membership tests become O(1) and
    # callers are no longer forced to pass two lists that can be concatenated.
    excluded = set(stop_words)
    if extra_words:
        excluded.update(extra_words)

    def tokenize_text(text):
        return [w for s in sent_tokenize(text) for w in word_tokenize(s)]

    def remove_special_characters(text):
        # Tokens made only of non-letters collapse to '' here; they vanish when
        # the joined string is tokenized again by the next step.
        return ' '.join(re.sub('[^a-z]+', '', token) for token in tokenize_text(text))

    def lemma_text(text, lemmatizer=wordnet_lemmatizer):
        return ' '.join(lemmatizer.lemmatize(token) for token in tokenize_text(text))

    def remove_stopwords(text, stop_words=excluded):
        return ' '.join(token for token in tokenize_text(text) if token not in stop_words)

    text = str(text).strip(' ')  # strip surrounding spaces
    text = text.lower()  # lowercase, so the a-z filter below keeps the letters
    text = remove_special_characters(text)  # remove punctuation, digits and symbols
    text = lemma_text(text)  # lemmatization (not stemming)
    text = remove_stopwords(text)  # remove stopwords
    text = text.strip(' ')  # defensive: the joined tokens carry no outer spaces

    return text

In [None]:
# Get all tweets and prepare the list that will hold the parsed rows.
# `data` receives one [text, date] pair per tweet in the parsing cell below;
# `all_tweets` is the list of raw payloads buffered by the listener.
data = []
all_tweets = my_listner.get_tweets()

In [None]:
# Parse the tweets and take only the 2 columns of interest
# Start from an empty list so that re-running this cell does not append the
# same tweets a second time.
data = []

for tweet in all_tweets:
    try:
        # Decoding happens inside the try so that a single malformed payload is
        # reported and skipped instead of aborting the whole loop.
        payload = json.loads(tweet)

        # NOTE(review): the stream presumably also delivers messages that are
        # not tweets and carry no 'text' field; skip them quietly instead of
        # printing a traceback for each one - confirm.
        if not isinstance(payload, dict) or 'text' not in payload:
            continue

        text = clean_text(payload['text'], default_stopwords)

        # Named `created_at` (not `datetime`) so the `datetime` class imported
        # at the top of the notebook is not overwritten.
        created_at = parser.parse(payload['created_at']).strftime(date_format)

        data.append([text, created_at])

    except Exception:
        traceback.print_exc()

In [None]:
# Report how many tweets were parsed, then wrap them in a pandas DataFrame
print('Number of parsed tweets = ', len(data), sep='')

df = pd.DataFrame(data=data, columns=['text', 'date'])
df.head()

In [None]:
def get_topics(model, count_vectorizer, n_top_words):
    """Describe every topic of a fitted model by its highest-weighted words.

    Args:
        model: Fitted topic model exposing ``components_`` (one weight row per
            topic, one column per vocabulary word).
        count_vectorizer: Fitted vectorizer that provides the vocabulary.
        n_top_words: Number of words used to describe each topic.

    Returns:
        A DataFrame with a single ``topic`` column; each value holds the top
        words of one topic joined by spaces, highest weight first.
    """
    # `get_feature_names` was removed in scikit-learn 1.2 in favour of
    # `get_feature_names_out`; fall back to the old name on old releases.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    # argsort is ascending, so the reversed tail slice yields the indices of
    # the n_top_words largest weights in descending order.
    data_labels = [
        " ".join(words[i] for i in topic.argsort()[:-n_top_words - 1:-1])
        for topic in model.components_
    ]

    return pd.DataFrame(data=data_labels, columns=['topic'])

In [None]:
def plot_most_common_10(count_data, count_vectorizer):
    """Draw a bar chart of the ten most frequent vocabulary words.

    Args:
        count_data: Document-term count matrix (rows are documents, columns
            are vocabulary words), e.g. the result of ``fit_transform``.
        count_vectorizer: Fitted vectorizer that provides the vocabulary.
    """
    # `get_feature_names` was removed in scikit-learn 1.2 in favour of
    # `get_feature_names_out`; fall back to the old name on old releases.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    # One column sum gives the per-word totals; no need to densify the matrix
    # row by row in a Python loop.
    total_counts = np.asarray(count_data.sum(axis=0)).ravel()

    top_pairs = sorted(zip(words, total_counts), key=lambda pair: pair[1], reverse=True)[:10]
    top_words = [str(word) for word, _ in top_pairs]
    top_counts = [count for _, count in top_pairs]

    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})

    plt.figure(figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    # x/y must be passed by keyword: recent seaborn releases reject positional data.
    sns.barplot(x=top_words, y=top_counts, palette='husl')
    plt.xticks(rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')

    # plt.show() works on every backend; Figure.show() only warns on the
    # non-GUI inline backend.
    plt.show()

In [None]:
def print_cloud(given_text):
    """Render a word cloud of ``given_text`` with matplotlib.

    Args:
        given_text: The text whose words are drawn in the cloud.
    """
    # Create a WordCloud object
    word_cloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue', width=1600, height=800)

    # Generate a word cloud
    cloud = word_cloud.generate(given_text)

    plt.figure(figsize=(20, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")

    # plt.show() already renders the figure; the former extra figure.show()
    # call added nothing and only warns on the non-GUI inline backend.
    plt.show()

In [None]:
def train_OLDA(topics_to_generate, words_per_topic, papers=None, random_state=42):
    """Fit an LDA topic model on the cleaned tweets and visualise the corpus.

    Args:
        topics_to_generate: Number of topics the model should learn.
        words_per_topic: Number of top words used to describe each topic.
        papers: DataFrame with a ``text`` column. Defaults to the notebook's
            global ``df`` so that existing two-argument calls keep working.
        random_state: Seed handed to the LDA model so that repeated runs give
            the same topics. Pass ``None`` for a different result on each run.

    Returns:
        The DataFrame produced by ``get_topics`` (one ``topic`` column).

    NOTE(review): the name suggests *online* LDA, but no learning method is
    passed to the model, so the library default is used - confirm the intent.
    """
    if papers is None:
        papers = df

    # Join the cleaned tweets into one string for the word cloud
    long_string = ','.join(str(x) for x in papers['text'].values)

    # Build the document-term count matrix from the cleaned tweets
    count_vectorizer = CountVectorizer()
    count_data = count_vectorizer.fit_transform(papers['text'].values.astype('str'))

    # Create and fit the LDA model
    lda = LDA(n_components=topics_to_generate, random_state=random_state)
    lda.fit(count_data)

    print('OLDA is done')

    # Print the topics found by the LDA model
    topics = get_topics(lda, count_vectorizer, words_per_topic)
    print(topics)

    # Visualise the corpus: word cloud first, then the 10 most common words
    print_cloud(long_string)
    plot_most_common_10(count_data, count_vectorizer)

    return topics

In [None]:
# Fit the topic model with the settings chosen in the configuration cell;
# the result has one row per topic with its top words in the 'topic' column.
olda_topics = train_OLDA(topics_to_generate, words_per_topic)

In [None]:
# Keep every topic's words as a set as well, ready for overlap checks
olda_topics['set'] = olda_topics['topic'].apply(lambda topic_words: set(topic_words.split()))

In [None]:
def project_topics(topic_threshold, tweets=None, topics=None):
    """Match every tweet against every topic by counting shared words.

    A tweet belongs to a topic when the two share at least ``topic_threshold``
    distinct words. The comparison used to be a strict ``>``, which made
    ``project_topics(1)`` require two shared words even though its results are
    stored as the one-word match with ``MatchSize`` 1.

    Args:
        topic_threshold: Minimum number of shared words for a match.
        tweets: Table with ``text`` and ``date`` columns. Defaults to the
            notebook's global ``df``.
        topics: Table with a ``topic`` column of space-separated words.
            Defaults to the notebook's global ``olda_topics``.

    Returns:
        A tuple ``(olda_data, fine_grained)``:
        ``olda_data`` has one ``[topic, magnitude, start, end, threshold]``
        row per topic, where magnitude is the number of matching tweets and
        start/end are the smallest/largest matching dates (``'NULL'`` when
        nothing matched);
        ``fine_grained`` has one ``[topic, date, threshold]`` row per match.
    """
    if tweets is None:
        tweets = df
    if topics is None:
        topics = olda_topics

    # Split every topic once up front instead of once per tweet. Working from
    # the 'topic' text also removes the need for a precomputed 'set' column.
    topic_words = [(topic, set(topic.split())) for topic in topics['topic']]

    magnitude = {}
    event_start = {}
    event_end = {}
    fine_grained = []

    # We will go through each tweet and try to match it to a topic
    for text, given_date in zip(tweets['text'], tweets['date']):
        # Rows without usable text (e.g. missing values) cannot match anything.
        if not isinstance(text, str):
            continue

        tweet_words = set(text.split())

        for topic, words in topic_words:
            if len(words & tweet_words) < topic_threshold:
                continue

            magnitude[topic] = magnitude.get(topic, 0) + 1
            fine_grained.append([topic, given_date, topic_threshold])

            # Track the earliest and latest matching dates of the topic.
            if topic not in event_start or given_date < event_start[topic]:
                event_start[topic] = given_date
            if topic not in event_end or given_date > event_end[topic]:
                event_end[topic] = given_date

    olda_data = [
        [topic, magnitude.get(topic, 0), event_start.get(topic, 'NULL'), event_end.get(topic, 'NULL'), topic_threshold]
        for topic, _ in topic_words
    ]

    return (olda_data, fine_grained)

In [None]:
# Project the tweets onto the topics once per threshold (1, 2 and 3)
one_word_match, fine_grained_one_word_match = project_topics(1)
two_words_match, fine_grained_two_words_match = project_topics(2)
three_words_match, fine_grained_three_words_match = project_topics(3)

In [None]:
# Stack the per-threshold results: topic summaries first, then the per-tweet matches
data = [*one_word_match, *two_words_match, *three_words_match]
fine_grained_data = [
    *fine_grained_one_word_match,
    *fine_grained_two_words_match,
    *fine_grained_three_words_match,
]

In [None]:
# One row per (topic, match size) with its magnitude and first/last matching date
olda_df = pd.DataFrame(data, columns=['Topic', 'Magnitude', 'StartDate', 'EndDate', 'MatchSize'])

# A bare last expression is rendered as a rich table, like the `df.head()` cell
olda_df.head()

In [None]:
# One row per individual tweet-to-topic match
fine_grained_df = pd.DataFrame(fine_grained_data, columns=['Topic', 'EventDate', 'MatchSize'])

# A bare last expression is rendered as a rich table, like the `df.head()` cell
fine_grained_df.head()

In [None]:
# Write both result tables as CSV files (relative paths), without the
# DataFrame index column.
olda_df.to_csv('output.csv', index=False)
fine_grained_df.to_csv('output_detailed.csv', index=False)