# Twitter Data Analysis

In [7]:
import os

from neo4j.v1 import GraphDatabase, basic_auth
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go

# Enable plotly's offline notebook mode so figures can be shown inline.
plotly.offline.init_notebook_mode()

# Connect to Neo4j. Connection settings are read from the environment so that
# real credentials do not have to be stored in the notebook; the fallbacks are
# the original local-development values.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "123456")

driver = GraphDatabase.driver(NEO4J_URI, auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD))
# One session is shared by every query cell below.
session = driver.session()

# helper function
def plot_graph(result, columns, title):
    """Show a two-column query result as a table and as a bar chart.

    Parameters
    ----------
    result : iterable
        Records that can be indexed by column name (``record[name]``).
        The iterable is consumed exactly once.
    columns : list of str
        Column names. The whole list is used as the table header;
        ``columns[0]`` supplies the bar labels (x axis) and ``columns[1]``
        the bar heights (y axis).
    title : str
        Title of the bar chart.

    Returns
    -------
    None
        Both figures are displayed through ``plotly.offline.iplot``.
    """
    data_matrix = [columns]
    x = []
    y = []

    for record in result:
        label = record[columns[0]]
        value = record[columns[1]]
        data_matrix.append([label, value])
        x.append(label)
        y.append(value)

    table = ff.create_table(data_matrix)
    # Use the offline renderer here as well: the previous `py.iplot` call
    # referenced a name that is never defined and raised NameError.
    plotly.offline.iplot(table)

    data = [go.Bar(x=x, y=y)]
    layout = go.Layout(title=title)

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


## 1. Which tweet has been retweeted the most, and who posted it?

In [102]:
# Pick the single tweet with the highest retweet_count, then look up the
# user that POSTS it.
top_tweet_query = (
    "MATCH (t:Tweet) "
    "WITH t, t.retweet_count AS Retweets "
    "ORDER BY Retweets DESC "
    "LIMIT 1 "
    "MATCH (u:User)-[:POSTS]->(t) "
    "RETURN u.screen_name AS User,t.text AS Tweet,Retweets"
)
result = session.run(top_tweet_query)

# Header row followed by one row per record. The two empty trailing columns
# are kept as-is (presumably padding for the table layout -- confirm).
header = ['User', 'Retweets', 'Tweet', '', '']
data_matrix = [header]
for record in result:
    row = [record['User'], record['Retweets'], record['Tweet'], '', '']
    data_matrix.append(row)

table = ff.create_table(data_matrix)
plotly.offline.iplot(table)


## 2. What are the top 3 languages in which the tweets are written?

In [8]:
# Count User nodes per `lang` value and keep the three largest groups.
# NOTE(review): this counts users, not tweets, per language -- confirm that
# this matches the question asked in the heading.
language_query = (
    "MATCH (u:User) "
    "RETURN u.lang AS Language, count(u) as Count "
    "ORDER BY Count DESC LIMIT 3"
)
result = session.run(language_query)

columns = ['Language', 'Count']
plot_graph(result, columns, 'Top 3 Languages')

## 3. What are the top 5 countries where users tweet most frequently?

In [66]:
# Group the matched Place-[:PLACES]->Tweet pairs by the place's country and
# keep the five countries with the most matches.
country_query = (
    "MATCH (t:Tweet)<-[:PLACES]-(p:Place) "
    "RETURN p.country AS Country, count(p) as Count "
    "ORDER BY Count DESC LIMIT 5"
)
result = session.run(country_query)

columns = ['Country', 'Count']
plot_graph(result, columns, 'Top 5 Countries')

## 4. Which utility do users most often use to post tweets (e.g., iPhone, Android)?

In [65]:
# For each Source node, count its incoming USING relationships and keep the
# five sources with the highest counts.
source_query = (
    "MATCH (s:Source) "
    "WITH s, SIZE((s)<-[:USING]-()) as Count "
    "ORDER BY Count DESC LIMIT 5 "
    "RETURN s.name AS Source, Count"
)
result = session.run(source_query)

columns = ['Source', 'Count']
plot_graph(result, columns, 'Top 5 Sources')

## 5. Which are the 5 most popular hashtags?

In [64]:
# For each Hashtag node, count its outgoing TAGS relationships and keep the
# five hashtags with the highest counts.
hashtag_query = (
    "MATCH (h:Hashtag) "
    "WITH h, SIZE((h)-[:TAGS]->()) as Count "
    "ORDER BY Count DESC LIMIT 5 "
    "RETURN h.name AS Hashtag, Count"
)
result = session.run(hashtag_query)

columns = ['Hashtag', 'Count']
plot_graph(result, columns, 'Top 5 Hashtags')
