In [None]:
%matplotlib inline

import json
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import operator 

from collections import Counter
from nltk.corpus import stopwords
import string
import operator 
import json


from nltk import bigrams 
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize 
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

import time
import calendar
import codecs
import datetime
import sys
import gzip
import os
import numpy as np


In [None]:
#READING & COUNTING TWEETS
#
# Loads every tweet from combined_data.json and groups the tweets by the
# minute in which they were created. The result, `frequencyMap`, maps a
# datetime (seconds zeroed) to {"count": <int>, "list": [<tweet dict>, ...]}.
# Minutes between the first and last tweet that saw no activity are added
# with a zero count so the series has no gaps.

frequencyMap = {}
globalTweetCounter = 0

# Layout of the `created_at` field parsed below, e.g. "Mon Jan 01 12:00:00 +0000 2018".
timeFormat = "%a %b %d %H:%M:%S +0000 %Y"

# Not referenced by any cell visible here; kept in case other cells rely on it.
reader = codecs.getreader("utf-8")

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which can mis-decode non-ASCII tweet text.
with open('combined_data.json', 'r', encoding='utf-8') as f:
    tweet = json.load(f)

for item in tweet:
    # Deletion and withheld notices are not tweets; skip them.
    if "delete" in item or "status_withheld" in item:
        continue

    try:
        currentTime = datetime.datetime.strptime(item['created_at'], timeFormat)
    except Exception:
        # Show the offending record before propagating the error.
        print (item)
        raise

    # Truncate to the minute so all tweets of that minute share one key.
    currentTime = currentTime.replace(second=0)

    globalTweetCounter += 1

    timeMap = frequencyMap.setdefault(currentTime, {"count": 0, "list": []})
    timeMap["count"] += 1
    timeMap["list"].append(item)

timeIntervalStep = datetime.timedelta(0, 60)    # Time step in seconds

times = sorted(frequencyMap.keys())
if times:
    firstTime = times[0]
    lastTime = times[-1]
    thisTime = firstTime

    # Fill every minute without tweets with an empty bucket.
    while ( thisTime <= lastTime ):
        if ( thisTime not in frequencyMap ):
            frequencyMap[thisTime] = {"count":0, "list":[]}

        thisTime = thisTime + timeIntervalStep
else:
    # No tweets at all: nothing to fill, and no first/last time exists.
    firstTime = lastTime = thisTime = None

print ("Total Tweet count:", globalTweetCounter)
            
           

In [None]:
#TWITTER TIMELINE
#
# Plots the number of tweets per one-minute bucket over the whole
# collection period.

fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Tweet Time Series")

sortedTimes = sorted(frequencyMap.keys())

postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]

# One tick every 43200 buckets; with one-minute buckets that is every 30 days.
smallerXTicks = range(0, len(sortedTimes), 43200)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

ax.plot(range(len(sortedTimes)), postFreqList, color="black", label="Tweets")
# Pass the on/off flag positionally: the old `b=` keyword was renamed to
# `visible=` and is rejected by current matplotlib releases.
ax.grid(True, which='major')
ax.legend()

plt.show()

In [None]:
#NUMBER OF UNIQUE USERS
#
# Walks every minute bucket and records, per screen name, how many tweets
# the user posted (globalUserCounter) and the tweets themselves (globalUserMap).

globalUserCounter = {}
globalUserMap = {}

for minute in sortedTimes:
    for status in frequencyMap[minute]["list"]:
        screenName = status["user"]["screen_name"]

        globalUserCounter[screenName] = globalUserCounter.get(screenName, 0) + 1
        globalUserMap.setdefault(screenName, []).append(status)

uniqueUserTotal = len(globalUserCounter)
print ("Unique Users: " + str(uniqueUserTotal))

In [None]:
#TOP 10 MOST ACTIVE USERS
#
# Ranks users by the number of tweets counted in globalUserCounter.

sortedUsers = sorted(globalUserCounter, key=globalUserCounter.get, reverse=True)
print ("Top most active users:")
for u in sortedUsers[:10]:
    print (u, globalUserCounter[u])

#GETTING INFORMATION ON MOST ACTIVE USERS
#
# Looks up the profile description of the 50 most active users through the
# Twitter API.

import tweepy

# Credentials are read from the environment so they never have to be typed
# into the notebook; the fallback is the empty string.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.secure = True
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

print ("User + description:")
for u in sortedUsers[:50]:
    print (u, globalUserCounter[u])

    try:
        # Pass the screen name by keyword: tweepy 4 no longer accepts it
        # positionally, while tweepy 3 accepts the keyword as well.
        user = api.get_user(screen_name=u)
        print ("\tDescription:", user.description)
    except Exception as te:
        # Best effort: a failed lookup for one user is reported and the
        # loop moves on to the next user.
        print ("\tDescription Error:", te)

    print ("----------")

In [None]:
#HISTOGRAM --> NUMBER OF ALL USERS TWEETS
#
# Distribution of tweets per user: x is the number of tweets, y (log scale)
# is the number of users with that many tweets.

plt.figure(figsize=(16,8))

plt.hist(
    list(globalUserCounter.values()),
    bins=100,
    # `normed` was removed from matplotlib; `density=False` keeps raw counts.
    density=False,
    alpha=1,
    label="Tweet Count",
    log=True,
    color='black')

plt.xlabel('Tweets')
plt.ylabel('Users')
plt.title("Histogram of Unique User Tweets")
plt.grid(True)
plt.legend()

plt.show()

In [None]:
#AVERAGE NUMBER OF POSTS
#
# Mean number of tweets per user, taken over the per-user counts.

postCounts = list(globalUserCounter.values())
avgPostCount = np.mean(postCounts)
print("Average Number of Posts: " + str(avgPostCount))

In [None]:
#MOST POPULAR HASHTAGS
#
# Counts every hashtag (case-insensitively) across all tweets and prints the
# fifty most frequent ones.

hashtagCounter = {}

for minute in sortedTimes:
    for status in frequencyMap[minute]["list"]:
        for tagEntity in status["entities"]["hashtags"]:
            tag = tagEntity["text"].lower()
            hashtagCounter[tag] = hashtagCounter.get(tag, 0) + 1

print ("Unique Hashtags:", len(hashtagCounter))
sortedHashtags = sorted(hashtagCounter, key=lambda tag: hashtagCounter[tag], reverse=True)
print ("Top Fifty Hashtags:")
for ht in sortedHashtags[:50]:
    print ("\t", "#" + ht, hashtagCounter[ht])

In [None]:
#LANGUAGE DISTRIBUTION LIST
#
# Tallies the `lang` field of every tweet and prints the languages from most
# to least frequent.

languageCounter = {}

for minute in sortedTimes:
    for status in frequencyMap[minute]["list"]:
        code = status["lang"]
        languageCounter[code] = languageCounter.get(code, 0) + 1

languages = sorted(languageCounter, key=lambda code: languageCounter[code], reverse=True)

for code in languages:
    print (code, languageCounter[code])

In [None]:
#TOKENIZATION
#
# Regex-based tokenizer for tweet text. The alternatives in `regex_str` are
# tried in order, so the more specific patterns (emoticons, mentions,
# hashtags, URLs) win over the generic word / single-character fallbacks.

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>',                        # HTML tags
    r'(?:@[\w_]+)',                    # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hashtags
    # URLs. The character class used to read `&amp;` (an HTML-escaping
    # artifact); a plain `&` matches exactly the same characters.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',

    r'(?:(?:\d+,?)+(?:\.?\d+)?)',      # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",       # words containing - or '
    r'(?:[\w_]+)',                     # other words
    r'(?:\S)'                          # any other non-whitespace character

]

# VERBOSE lets emoticons_str carry whitespace and comments; IGNORECASE makes
# the [a-z] classes match upper-case letters too.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)


def tokenize(s):
    """Split the string `s` into a list of token strings.

    Whitespace is dropped; every other character ends up in some token.
    """
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    """Tokenize `s`, optionally lower-casing the tokens.

    When `lowercase` is true every token is lower-cased except tokens that
    are entirely an emoticon (so ":D" is not turned into ":d").
    Returns the list of tokens.
    """
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

# Tokenize the text of every tweet in the file. `tweets_all` ends up as a
# list with one list of lower-cased tokens per tweet.
with open('combined_data.json', 'r', encoding='utf-8') as f:
    tweet = json.load(f)

tweets_all = []
for item in tweet:
    # Deletion / withheld notices carry no text; indexing them would raise
    # KeyError, so skip them here just like the counting cell does.
    if "delete" in item or "status_withheld" in item:
        continue

    # <140 : ['text']
    # >140 : ['extended_tweet']['full_text']
    # >140 & retweet (RT) : ['retweeted_status']['extended_tweet']['full_text']
    # For a retweet the text is taken from the retweeted status.
    source = item.get('retweeted_status') or item
    extended = source.get('extended_tweet')
    text = extended['full_text'] if extended else source['text']

    tokens_all = preprocess(text.lower())
    tweets_all.append(tokens_all)
        

In [None]:
# remove stopwords & twitter specific terms
#
# `stop` lists every token that should be ignored when counting terms:
# English, Spanish and French stopwords, punctuation characters and a few
# Twitter-specific terms. Tweet text is lower-cased before it is tokenized,
# so the retweet marker has to be listed as 'rt' to ever match; the original
# capitalised entries are kept for any caller that does not lower-case.
punctuation = list(string.punctuation)
stop = list(stopwords.words('english') + stopwords.words('spanish') + stopwords.words('french') + punctuation + ['RT', 'via', 'amp', 'The', 'I', 'rt'])

In [None]:
# counting most frequent terms & n-grams
#
# `tweets_all` holds one token list per tweet. Stopwords are removed token by
# token (comparing a whole token list against `stop` never matches, so that
# would filter nothing), and bigrams are built per tweet so that no bigram
# joins the last word of one tweet with the first word of the next.

stop_lookup = set(stop)     # set membership is much faster than scanning the list

count_all = Counter()
count_all_bg = Counter()
tweets_filtered = []        # all non-stopword tokens, in tweet order
terms_bigram = []           # all within-tweet bigrams of non-stopword tokens

for tokens in tweets_all:
    kept = [term for term in tokens if term not in stop_lookup]
    tweets_filtered += kept
    if len(kept) > 1:       # a bigram needs at least two tokens
        terms_bigram += bigrams(kept)

count_all.update(tweets_filtered)
print("most frequent terms:", count_all.most_common(100))
print('________________________')

count_all_bg.update(terms_bigram)
print("most frequent bigrams:", count_all_bg.most_common(300))


In [None]:
#TERM CO-OCCURRENCE
#
# Counts how often two different non-stopword terms appear in the same tweet.
# `com[w1][w2]` holds the count for the pair, with w1 sorting before w2 so
# each unordered pair is stored once.

from collections import defaultdict
from itertools import combinations

com = defaultdict(lambda : defaultdict(int))
stop_lookup = set(stop)     # set membership is much faster than scanning the list

with open('combined_data.json', 'r', encoding='utf-8') as f:
    tweet = json.load(f)

# Note: this cell deliberately does not touch `tweets_all`; resetting it here
# would wipe the tokenized tweets produced by the tokenization cell.
for item in tweet:
    # Deletion / withheld notices carry no text; skip them.
    if "delete" in item or "status_withheld" in item:
        continue

    # For a retweet the text is taken from the retweeted status; long tweets
    # keep their full text under 'extended_tweet'.
    source = item.get('retweeted_status') or item
    extended = source.get('extended_tweet')
    text = extended['full_text'] if extended else source['text']

    tokens_all = preprocess(text.lower())
    terms_only = [term for term in tokens_all if term not in stop_lookup]

    # Every pair of positions i < j within the tweet.
    for first, second in combinations(terms_only, 2):
        if first != second:
            w1, w2 = sorted((first, second))
            com[w1][w2] += 1


# For each term keep its five most frequent partners, then rank all of those
# pairs by their count.
com_max = []
for t1 in com:
    t1_max_terms = sorted(com[t1].items(), key=operator.itemgetter(1), reverse=True)[:5]
    for t2, t2_count in t1_max_terms:
        com_max.append(((t1, t2), t2_count))
terms_max = sorted(com_max, key=operator.itemgetter(1), reverse=True)
# NOTE(review): this prints ranks 51-100 and skips the top 50 - confirm that
# is intended.
print(terms_max[50:100])


In [None]:
#co-occurrence with specific word
#
# Counts the non-stopword terms of every tweet that contains `search_word`.
# The search word itself is counted as well.
search_word = 'innovation'
count_search = Counter()
stop_lookup = set(stop)     # set membership is much faster than scanning the list

with open('combined_data.json', 'r', encoding='utf-8') as f:
    tweet = json.load(f)

# Note: this cell deliberately does not touch `tweets_all`; resetting it here
# would wipe the tokenized tweets produced by the tokenization cell.
for item in tweet:
    # Deletion / withheld notices carry no text; skip them.
    if "delete" in item or "status_withheld" in item:
        continue

    # For a retweet the text is taken from the retweeted status; long tweets
    # keep their full text under 'extended_tweet'.
    source = item.get('retweeted_status') or item
    extended = source.get('extended_tweet')
    text = extended['full_text'] if extended else source['text']

    tokens_all = preprocess(text.lower())
    terms_only = [term for term in tokens_all if term not in stop_lookup]
    if search_word in terms_only:
        count_search.update(terms_only)

print("Co-occurrence for %s:" % search_word)
print(count_search.most_common(30))

In [None]:
#keywords
#
# For every minute bucket, counts the tweets whose text contains each target
# keyword and plots those counts next to the total number of tweets.
targetKeywords = ["python"]

targetCounts = {x:[] for x in targetKeywords}
totalCount = []

for t in sortedTimes:
    timeObj = frequencyMap[t]

    localTargetCounts = {x:0 for x in targetKeywords}
    localTotalCount = 0

    for tweetObj in timeObj["list"]:
        # For a retweet the text is taken from the retweeted status; long
        # tweets keep their full text under 'extended_tweet'.
        source = tweetObj.get('retweeted_status') or tweetObj
        extended = source.get('extended_tweet')
        tweetString = (extended['full_text'] if extended else source['text']).lower()

        localTotalCount += 1

        for keyword in targetKeywords:
            # Plain substring test, so a keyword also matches inside longer words.
            if ( keyword in tweetString ):
                localTargetCounts[keyword] += 1

    totalCount.append(localTotalCount)
    for keyword in targetKeywords:
        targetCounts[keyword].append(localTargetCounts[keyword])


fig, ax = plt.subplots()
fig.set_size_inches(18.5,10.5)

plt.title("Time series with term 'Python'")
# Reuses the tick positions computed by the timeline cell.
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

ax.plot(range(len(sortedTimes)), totalCount, label="Total", color='black')

for keyword in targetKeywords:
    ax.plot(range(len(sortedTimes)), targetCounts[keyword], label=keyword, color='grey')
ax.legend()
# Pass the on/off flag positionally: the old `b=` keyword was renamed to
# `visible=` and is rejected by current matplotlib releases.
ax.grid(True, which='major')


plt.show()

In [None]:
#NUMBER OF TWEETS WITH GEO DATA
#
# Builds, per minute bucket, the list of tweets whose `coordinates` field is
# set and itself contains a "coordinates" entry, and counts them overall.

geoFrequencyMap = {}
geoCount = 0

for t in sortedTimes:
    geos = [
        candidate
        for candidate in frequencyMap[t]["list"]
        if candidate["coordinates"] is not None and "coordinates" in candidate["coordinates"]
    ]
    geoFrequencyMap[t] = {"count": len(geos), "list": geos}
    geoCount = geoCount + len(geos)

print ("Number of tweets with geodata: " + str(geoCount))

In [None]:
# USERS WITH GEO DATA
#
# Builds one record per user from the first tweet seen for that user and
# writes the users that have some location information to geo_data.json.
# The location is taken, in order of preference, from the tweet's exact
# coordinates, the tweet's place, or the free-text location of the profile.
# Only a user's first tweet decides whether (and how) the user is located;
# later tweets just increase the user's tweet count.
fname = 'combined_data.json'
with open(fname, 'r', encoding='utf-8') as f:
    line = json.load(f)

users_with_geodata = {
    "data": []
}
# user id -> record kept in users_with_geodata, or None when the user's first
# tweet had no location information. A dict gives constant-time lookups.
user_records = {}
total_tweets = 0

for tweet in line:
    # Entries without a user (e.g. deletion notices) are not tweets.
    user_info = tweet.get('user') or {}
    user_id = user_info.get('id')
    if not user_id:
        continue

    total_tweets += 1

    if user_id in user_records:
        known = user_records[user_id]
        if known is not None:
            known["features"]["tweets"] += 1
        continue

    user_data = {
        "user_id" : user_id,
        "features" : {
            "name" : user_info['name'],
            "id": user_id,
            "screen_name": user_info['screen_name'],
            "tweets" : 1,
            "location": user_info['location'],
        }
    }
    features = user_data["features"]

    coordinates = tweet.get('coordinates')
    place = tweet.get('place')
    if coordinates:
        # Read the point by key instead of by dict position. The two values
        # are swapped when formatting, as the original did - presumably the
        # pair is stored as [longitude, latitude]; confirm against the data.
        point = coordinates['coordinates']
        features["primary_geo"] = str(point[1]) + ", " + str(point[0])
        features["geo_type"] = "Tweet coordinates"
    elif place:
        features["primary_geo"] = place['full_name'] + ", " + place['country']
        features["geo_type"] = "Tweet place"
    else:
        features["primary_geo"] = user_info['location']
        features["geo_type"] = "User location"

    if features["primary_geo"]:
        users_with_geodata['data'].append(user_data)
        user_records[user_id] = user_data
    else:
        user_records[user_id] = None

# Ids of all users seen, in order of first appearance.
all_users = list(user_records)
# Tweets posted by the users that made it into the output, counted once.
geo_tweets = sum(user["features"]["tweets"] for user in users_with_geodata["data"])

print("The file included " + str(len(users_with_geodata['data'])) + " unique users who tweeted with geo data, including 'location'")

with open('geo_data.json', 'w') as fout:
    fout.write(json.dumps(users_with_geodata, indent=4))