In [1]:
import ast
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.plotly as py
import seaborn as sns
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
%matplotlib inline
init_notebook_mode(connected=True)

In [2]:
# Load the scraped tweets and treat missing engagement counters as zero.
tweets_df=pd.read_csv("dataset/javascript_top.csv")
# fillna with a per-column mapping returns a new frame; assigning it back avoids
# the chained `tweets_df.<col>.fillna(..., inplace=True)` pattern, which pandas
# deprecates because the call may act on a temporary copy of the column.
tweets_df=tweets_df.fillna({'replies':0,'retweets':0,'likes':0})

In [3]:
def get_time(date_time):
    """Return the text before the first '-' in `date_time`, without surrounding whitespace.

    If `date_time` contains no '-', the whole (stripped) string is returned.
    """
    before_dash,_,_=date_time.partition('-')
    return before_dash.strip()

def get_day(date_time):
    """Return the first space-separated token of the second '-'-separated field.

    Raises IndexError when `date_time` contains no '-'.
    """
    date_part=date_time.split('-')[1].strip()
    tokens=date_part.split(' ')
    return tokens[0].strip()

def get_month(date_time):
    """Return the second space-separated token of the second '-'-separated field.

    Tokens are split on single spaces, so consecutive spaces yield empty tokens.
    Raises IndexError when the '-' or the token is missing.
    """
    date_part=date_time.split('-')[1].strip()
    tokens=date_part.split(' ')
    return tokens[1].strip()

def get_year(date_time):
    """Return the third space-separated token of the second '-'-separated field.

    Tokens are split on single spaces, so consecutive spaces yield empty tokens.
    Raises IndexError when the '-' or the token is missing.
    """
    date_part=date_time.split('-')[1].strip()
    tokens=date_part.split(' ')
    return tokens[2].strip()



In [4]:
# Derive per-tweet features (time parts, hashtags, mentions, links) from the
# raw 'body' and 'time' columns and attach them to tweets_df as new columns.
day=[]
month=[]
year=[]
timestamp=[]
hashtags=[]
mentions=[]
links=[]
hashtags_numbers=[]
mentions_numbers=[]
# Iterating the two columns directly avoids building a full row Series for
# every tweet, which is what tweets_df.iloc[i] does.
for tweet_body,tweet_time in zip(tweets_df['body'],tweets_df['time']):
    # Detach a media link that is glued to the preceding word. Only the
    # 'pic.twitter.com' host is split off: replacing every 'pic' also split
    # words such as 'topic' or '#epic' and produced truncated hashtags.
    # NOTE(review): assumes the glued links are pic.twitter.com URLs — confirm against the dataset.
    tweet_body=tweet_body.replace('pic.twitter.com',' pic.twitter.com')
    timestamp.append(get_time(tweet_time))
    day.append(get_day(tweet_time))
    month.append(get_month(tweet_time))
    year.append(get_year(tweet_time))
    # Lower-case BEFORE de-duplicating so '#JS' and '#js' in one tweet count once;
    # sorted() makes the stored order reproducible across kernel restarts.
    hashtags.append(sorted({tag.lower() for tag in re.findall(r"#(\w+)", tweet_body)}))
    mentions.append(sorted(set(re.findall(r"@(\w+)", tweet_body))))
    hashtags_numbers.append(len(hashtags[-1]))
    mentions_numbers.append(len(mentions[-1]))
    # http and https URLs, each cut at the first whitespace, in document order.
    links.append(re.findall(r"https?://\S+", tweet_body))
tweets_df['timestamp']=timestamp
tweets_df['year']=year
tweets_df['month']=month
tweets_df['day']=day
tweets_df['hashtags']=hashtags
tweets_df['mentions']=mentions
tweets_df['hashtags_numbers']=hashtags_numbers
tweets_df['mentions_numbers']=mentions_numbers
tweets_df['links']=links
# The raw 'time' column is kept alongside the derived parts.

In [22]:
def get_frequency(df,target_column):
    """Count case-insensitive occurrences of the items stored in a list-valued column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose `target_column` holds an iterable of items per row.
    target_column : str
        Name of the column to scan.

    Returns
    -------
    dict
        Maps each item, converted with str() and lower-cased, to the number of
        times it occurs over all rows.
    """
    items_frequency={}
    for row_items in df[target_column]:
        for item in row_items:
            # Normalise once and use the same key for lookup and update; testing
            # the raw item against lower-cased keys reset mixed-case counts to 1.
            key=str(item).lower()
            items_frequency[key]=items_frequency.get(key,0)+1
    return items_frequency

In [23]:
# Count hashtag occurrences over all tweets and keep the 50 most frequent.
# get_frequency needs the tweets frame; `tweet_time` is only a string left
# over from the feature-extraction loop.
hashtags_frequency=get_frequency(tweets_df,"hashtags")
top_hashtags=pd.DataFrame(
    sorted(hashtags_frequency.items(),key=lambda pair: pair[1],reverse=True)[:50],
    columns=['hashtag','frequency'],
)

In [24]:
# Bar chart of the top hashtags (hashtag on x, frequency on y), passed to
# plotly's offline plot() with the output file name 'hashtags.html'.
hashtags_trace=go.Bar(x=top_hashtags.hashtag.values,y=top_hashtags.frequency.values)
plot([hashtags_trace],filename='hashtags.html')

'file:///home/mahmoud/sipof.ink/JupyterProjects/socialMediaAnalysis/Twitter_Analytics/hashtags.html'

## Hashtags Analysis

In [272]:
# One graph node per top hashtag: the hashtag is the node id and its
# frequency divided by 25 is the node size.
hashtag_rows=(top_hashtags.iloc[position] for position in range(len(top_hashtags)))
nodes_list=[
    {"id":row['hashtag'],"size":row['frequency']/25}
    for row in hashtag_rows
]

In [273]:
# Square co-occurrence matrix over the node ids, initialised to integer zeros.
# Building it from the scalar 0 yields an integer frame directly, instead of an
# all-NaN object frame that then has to be filled in place (a path on which
# recent pandas warns about object downcasting).
node_ids=[item['id'] for item in nodes_list]
adj_matrix=pd.DataFrame(0,index=node_ids,columns=node_ids)

In [274]:
# Accumulate hashtag co-occurrence counts: for every tweet and every node
# hashtag it contains, add 1 to the cell (node hashtag, other hashtag) for each
# of the tweet's hashtags that is a column of adj_matrix. The diagonal is
# incremented too, because a hashtag co-occurs with itself.
# The counts are added in place, so re-running this cell without rebuilding
# adj_matrix first adds every tweet a second time.
target_ids={node['id'] for node in nodes_list}
tracked_columns=set(adj_matrix.columns)
for tweet_hashtag_list in tweets_df.hashtags:
    present_targets=target_ids.intersection(tweet_hashtag_list)
    if not present_targets:
        # None of the node hashtags appear in this tweet: nothing to count.
        continue
    tracked_in_tweet=[tag for tag in tweet_hashtag_list if tag in tracked_columns]
    for target_hashtag in present_targets:
        for tweet_hashtag in tracked_in_tweet:
            adj_matrix.loc[target_hashtag,tweet_hashtag]+=1

In [278]:
# Turn the co-occurrence matrix into an edge list. Each unordered pair of
# labels is visited once (j>i); the edge value is the co-occurrence count
# divided by 10, and only pairs whose value exceeds 4.5 are kept.
nodes_ids=[item['id'] for item in nodes_list]  # built once, not per pair
labels=list(adj_matrix.columns)
edges_list=[]
for i in range(len(adj_matrix)):
    for j in range(i+1,len(adj_matrix)):
        s=labels[i]
        t=labels[j]
        # Single .loc lookup instead of the chained .loc[s][t].
        v=adj_matrix.loc[s,t]/10
        if v>4.5 and s in nodes_ids and t in nodes_ids:
            edges_list.append({"source":s,"target":t,"value":v})

In [279]:
# Write the graph as JSON with the node list under "nodes" and the edge list
# under "links". `json` is imported in the notebook's import cell.
with open("hashtagsGraph.json","w") as f:
    json.dump({"nodes":nodes_list,"links":edges_list},f)

In [280]:
# Sanity check: number of nodes and number of edges in the exported graph.
node_count=len(nodes_list)
edge_count=len(edges_list)
print(node_count,edge_count)

50 152


## Mentions Analysis

In [5]:
# Writers ranked by how many of their tweets contain at least one mention.
# After count(), 'mentions_numbers' holds the number of such tweets per writer
# (non-null rows), not the sum of the per-tweet mention counts.
top_mentioners=(
    tweets_df[tweets_df.mentions_numbers>0]
    .groupby(by="writer")
    .count()
    .sort_values(by="mentions_numbers",ascending=False)
    .reset_index()
    [['writer','mentions_numbers']]
)

In [6]:
# Keep only the writers whose 'mentions_numbers' value is greater than 2.
has_enough_mentions=top_mentioners.mentions_numbers>2
top_mentioners=top_mentioners.loc[has_enough_mentions]

In [7]:
# Display the top_mentioners frame (writer and mentions_numbers columns).
top_mentioners

Unnamed: 0,writer,mentions_numbers
0,@JavaScriptKicks,16
1,@eggheadio,11
2,@AngularInDepth,9
3,@webtech_4u,6
4,@vuejsamsterdam,5
5,@cosmic_js,4
6,@joinindorse,4
7,@Frontend_Love,4
8,@sejournal,3
9,@carsoncgibbons,3


In [13]:
# Keep only the tweets written by the top mentioners (inner join on 'writer').
# Both frames carry a 'mentions_numbers' column, so the merged frame holds them
# under pandas' default suffixes ('mentions_numbers_x' from tweets_df,
# 'mentions_numbers_y' from top_mentioners).
top_mentioners_tweets=pd.merge(tweets_df,top_mentioners,on=['writer'])

In [25]:
# Count how often each account is mentioned over all tweets (get_frequency
# lower-cases the names) and keep the 50 most frequent.
mentions_frequency=get_frequency(tweets_df,"mentions")
ranked_mentions=sorted(mentions_frequency.items(),key=lambda pair: pair[1],reverse=True)
top_mentions=pd.DataFrame(ranked_mentions[:50],columns=['mentioned','frequency'])

In [26]:
# Bar chart of the most mentioned accounts (account on x, frequency on y),
# passed to plotly's offline plot() with the output file name 'mentions.html'.
mentions_trace=go.Bar(x=top_mentions.mentioned.values,y=top_mentions.frequency.values)
plot([mentions_trace],filename='mentions.html')

'file:///home/mahmoud/sipof.ink/JupyterProjects/socialMediaAnalysis/Twitter_Analytics/mentions.html'

In [None]:
# NOTE(review): this cell has no execution count and looks unfinished.
for writer in top_mentioners.writer:
    # NOTE(review): this builds a one-element list holding a boolean mask, not
    # the writer's rows — presumably top_mentioners_tweets[mask] was intended; confirm.
    writer_tweets=[top_mentioners_tweets.writer==writer]
    # NOTE(review): this binds the get_frequency function object itself; no call
    # is made here, so no frequencies are computed — confirm the intended arguments.
    mentioned_people=get_frequency