In [1]:
import tweepy
from tweepy import OAuthHandler, StreamListener
import json
import pandas as pd
import numpy as np
import networkx as nx
from scipy import stats
import matplotlib.pyplot as plt

from operator import itemgetter
# Tweet-count limit. No cell in this part of the notebook reads it
# (presumably used when the tweets were fetched -- TODO confirm or remove).
max_tweets = 20

In [3]:

# Load the saved tweets (JSON) into a DataFrame. Nested objects such as
# ``user`` and ``entities`` stay as dicts and are unpacked by the helpers below.
tweets_df = pd.read_json('#KashmirBleeds.txt', encoding='utf-8')


In [4]:
# Inspect which top-level tweet fields were loaded.
tweets_df.columns


Index(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities',
       'metadata', 'source', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
       'coordinates', 'place', 'contributors', 'retweeted_status',
       'is_quote_status', 'retweet_count', 'favorite_count', 'favorited',
       'retweeted', 'lang', 'possibly_sensitive', 'extended_entities',
       'quoted_status_id', 'quoted_status_id_str', 'quoted_status'],
      dtype='object')

In [5]:
# Empty frame whose columns define the flattened, one-row-per-tweet schema
# that the helper functions below fill in.
tweets_final = pd.DataFrame(
    columns=[
        # tweet-level fields
        "created_at",
        "id",
        # reply target
        "in_reply_to_screen_name",
        "in_reply_to_status_id",
        "in_reply_to_user_id",
        # retweeted author
        "retweeted_id",
        "retweeted_screen_name",
        # mentioned user
        "user_mentions_screen_name",
        "user_mentions_id",
        # tweet body and author
        "text",
        "user_id",
        "screen_name",
        "followers_count",
    ]
)

In [6]:
# Columns that are going to be the same
# (copied verbatim from the raw frame, no unpacking of nested objects needed).
equal_columns = ["created_at", "id", "text"]
# Assigning onto the still-empty ``tweets_final`` is what gives it its rows.
tweets_final[equal_columns] = tweets_df[equal_columns]

In [7]:
def get_basics(tweets_final):
    """Fill in the author columns of ``tweets_final``.

    The author's screen name, numeric id and follower count are read from the
    ``user`` dict of every row of the module-level ``tweets_df``.

    ``tweets_final`` is modified in place and also returned.
    """
    authors = tweets_df["user"]
    # (target column in tweets_final, key inside the ``user`` dict)
    user_fields = (
        ("screen_name", "screen_name"),
        ("user_id", "id"),
        ("followers_count", "followers_count"),
    )
    for column, key in user_fields:
        tweets_final[column] = authors.apply(itemgetter(key))
    return tweets_final

In [8]:
def get_usermentions(tweets_final):
    """Fill in the user-mention columns of ``tweets_final``.

    Only the first entry of ``entities["user_mentions"]`` is used. Its
    ``screen_name`` and ``id_str`` are stored, or NaN when the tweet mentions
    nobody. Values are read from the module-level ``tweets_df``.

    ``tweets_final`` is modified in place and also returned.
    """
    def first_mention(field):
        # Build an extractor for one field of the first mentioned user.
        def extract(entities):
            mentions = entities["user_mentions"]
            if not mentions:
                return np.nan
            return mentions[0][field]
        return extract

    entities_column = tweets_df["entities"]
    tweets_final["user_mentions_screen_name"] = entities_column.apply(first_mention("screen_name"))
    tweets_final["user_mentions_id"] = entities_column.apply(first_mention("id_str"))
    return tweets_final

In [9]:
def get_retweets(tweets_final):
    """Fill in the retweet columns of ``tweets_final``.

    For every row of the module-level ``tweets_df`` the ``retweeted_status``
    entry is inspected. When it is a dict, the ``screen_name`` and ``id_str``
    of its ``user`` are stored; otherwise (the tweet is not a retweet) NaN is
    stored.

    ``tweets_final`` is modified in place and also returned.
    """
    def retweeted_user_field(field):
        # Build an extractor for one field of the retweeted tweet's author.
        def extract(status):
            # Test for "is a dict" rather than "is not np.nan": an identity
            # check against np.nan only recognises that one object, so a
            # missing value stored as None or as another NaN float would be
            # indexed and raise a TypeError.
            if not isinstance(status, dict):
                return np.nan
            return status["user"][field]
        return extract

    retweeted = tweets_df["retweeted_status"]
    tweets_final["retweeted_screen_name"] = retweeted.apply(retweeted_user_field("screen_name"))
    tweets_final["retweeted_id"] = retweeted.apply(retweeted_user_field("id_str"))
    return tweets_final

In [10]:
def get_in_reply(tweets_final):
    """Copy the three ``in_reply_to_*`` columns from ``tweets_df`` unchanged.

    ``tweets_final`` is modified in place and also returned.
    """
    reply_columns = (
        "in_reply_to_screen_name",
        "in_reply_to_status_id",
        "in_reply_to_user_id",
    )
    for column in reply_columns:
        tweets_final[column] = tweets_df[column]
    return tweets_final

In [11]:
def fill_df(tweets_final):
    """Run every column-filling helper on ``tweets_final`` and return it.

    The helpers are applied in this order: author basics, user mentions,
    retweets, replies. Each one writes its columns into ``tweets_final``
    in place.
    """
    populate_steps = (get_basics, get_usermentions, get_retweets, get_in_reply)
    for populate in populate_steps:
        populate(tweets_final)
    return tweets_final

In [12]:
def _normalize_id(value):
    """Return ``value`` as an ``int`` user id when possible, ``None`` when missing.

    Missing means anything ``pd.isna`` reports as NA (``None``, NaN, ...).
    Digit strings and whole-number floats are converted to ``int`` so that the
    same id compares (and hashes) equal whichever column it came from. Values
    that cannot be converted are returned unchanged.
    """
    if pd.isna(value):
        return None
    if isinstance(value, float) and not value.is_integer():
        # Never truncate a fractional / infinite float into a different id.
        return value
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return value


def _clean_name(value):
    """Return ``value`` unchanged, or ``None`` when it is a missing value."""
    return None if pd.isna(value) else value


def get_interactions(row):
    """Return the author of a tweet and the users that tweet interacts with.

    Parameters
    ----------
    row : mapping (e.g. a row of ``tweets_final``)
        Must provide ``user_id``, ``screen_name`` and the id / screen-name
        pairs ``in_reply_to_user_id`` / ``in_reply_to_screen_name``,
        ``retweeted_id`` / ``retweeted_screen_name`` and
        ``user_mentions_id`` / ``user_mentions_screen_name``.

    Returns
    -------
    (user, interactions)
        ``user`` is the ``(user_id, screen_name)`` tuple of the author and
        ``interactions`` a set of ``(id, screen_name)`` tuples for the replied
        to, retweeted and mentioned users. When the author id is missing the
        result is ``((None, None), [])``.

    Notes
    -----
    The author id is stored from the numeric ``id`` field while the retweet
    and mention ids are stored from ``id_str`` strings, so all ids are
    normalised to ``int`` before being compared or returned. Without that,
    a self-retweet is never recognised as a self-interaction and the same
    account can appear under two different keys.
    """
    user_id = _normalize_id(row["user_id"])
    # Be careful if there is no user id (None or NaN).
    if user_id is None:
        return (None, None), []
    user = user_id, _clean_name(row["screen_name"])

    # Replies, retweets and user mentions, in that order.
    candidates = (
        (row["in_reply_to_user_id"], row["in_reply_to_screen_name"]),
        (row["retweeted_id"], row["retweeted_screen_name"]),
        (row["user_mentions_id"], row["user_mentions_screen_name"]),
    )

    # The interactions are a set of tuples so duplicates collapse.
    interactions = set()
    for raw_id, raw_name in candidates:
        other_id = _normalize_id(raw_id)
        # Skip interactions that do not exist and interactions with oneself.
        if other_id is None or other_id == user_id:
            continue
        interactions.add((other_id, _clean_name(raw_name)))
    return user, interactions

In [13]:
# Populate the author, mention, retweet and reply columns from tweets_df.
tweets_final = fill_df(tweets_final)

In [14]:
# Replace every missing value with None, which get_interactions tests for.
# The cast to object comes first because recent pandas versions (1.3+) keep
# NaN in float columns when ``where`` is given ``None`` as the replacement.
tweets_final = tweets_final.astype(object).where(pd.notnull(tweets_final), None)

In [15]:
# Preview up to the first 20 rows of the flattened table.
tweets_final.head(20)

Unnamed: 0,created_at,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,retweeted_id,retweeted_screen_name,user_mentions_screen_name,user_mentions_id,text,user_id,screen_name,followers_count
0,2020-07-06 18:11:15+00:00,1280202665616568320,,,,1.154719634425176e+18,aminsaki10,aminsaki10,1.154719634425176e+18,RT @aminsaki10: صدای هل من ناصر ینصرنی می اید ...,894817947704799234,khafangoo,250
1,2020-07-06 18:11:10+00:00,1280202647962877954,,,,1.1727579843065075e+18,divaneetar,divaneetar,1.1727579843065075e+18,RT @divaneetar: فردا، سه‌شنبه، ساعت ۲۲ الی ۲۴\...,1216076009235271680,hadi_j01,2004
2,2020-07-06 18:10:46+00:00,1280202544711716865,bbr314,1.28009e+18,8.55356e+17,,,bbr314,8.553559385843712e+17,@bbr314 @T_hassanpour_99 #KashmirBleeds,1246477721200480256,mohamma31337298,35
3,2020-07-06 18:09:46+00:00,1280202295796477960,,,,1.0558474173236675e+18,Ma_Mohsenzadeh,Ma_Mohsenzadeh,1.0558474173236675e+18,"RT @Ma_Mohsenzadeh: Today, we are waiting for ...",1045625681755942912,AvAbolfazl,1257
4,2020-07-06 18:09:45+00:00,1280202289874194432,,,,164508549.0,_A_political,_A_political,164508549.0,RT @_A_political: You can't just be concerned ...,164508549,_A_political,218
5,2020-07-06 18:09:22+00:00,1280202193329676293,,,,185159449.0,RadioPakistan,RadioPakistan,185159449.0,RT @RadioPakistan: Posters have appeared in #S...,1270590367927275521,kashifkhalidkh2,0
6,2020-07-06 18:08:31+00:00,1280201981152432135,,,,1.2533943750041147e+18,Mahdi_M314,Mahdi_M314,1.2533943750041147e+18,RT @Mahdi_M314: The children of Kashmir also w...,976540343889027072,MIQUEEN2019,150
7,2020-07-06 18:07:57+00:00,1280201835702300672,,,,9.090990655264237e+17,MojtabaRohizade,MojtabaRohizade,9.090990655264237e+17,RT @MojtabaRohizade: ان شاء الله باز هم صدای م...,1240584055558725632,Hamed57030573,825
8,2020-07-06 18:07:35+00:00,1280201745663213569,,,,,,,,#IranStandsWithKashmir\n#kashmirbleeds https:/...,1185737669508304896,Kobra16054434,264
9,2020-07-06 18:07:35+00:00,1280201743305814016,,,,7.324518819438836e+17,occupiedbypak,occupiedbypak,7.324518819438836e+17,RT @occupiedbypak: Locals accuse that Army rai...,1241073869181448192,Shant_Samundar,29


In [16]:
# Undirected interaction graph; the next cell adds one node per user id and
# one edge per (author, interacted-with user) pair.
graph = nx.Graph()

In [17]:
# Add one edge per (author, interacted-with user) pair found in each tweet.
for _, tweet in tweets_final.iterrows():
    (user_id, user_name), interactions = get_interactions(tweet)
    tweet_id = tweet["id"]
    for int_id, int_name in interactions:
        # The edge remembers the tweet that produced it; on a plain Graph a
        # later tweet linking the same two users overwrites this attribute.
        graph.add_edge(user_id, int_id, tweet_id=tweet_id)
        # Label both endpoints with their screen names.
        graph.nodes[user_id]["name"] = user_name
        graph.nodes[int_id]["name"] = int_name

In [18]:
# Report the size of the interaction graph.
print(f"There are {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges present in the Graph")

There are 29 nodes and 18 edges present in the Graph


In [19]:
# Degree of every node, in the graph's node order (node ids themselves are dropped).
degrees = [degree for _node, degree in graph.degree()]


In [20]:
# Largest and smallest node degree; np.max / np.min raise on an empty list,
# i.e. when the graph has no nodes.
print(f"The maximum degree of the Graph is {np.max(degrees)}")   
print(f"The minimum degree of the Graph is {np.min(degrees)}")  

The maximum degree of the Graph is 4
The minimum degree of the Graph is 1


In [21]:
# Mean and mode of the node degrees.
print(f"The average degree of the nodes in the Graph is {np.mean(degrees):.1f}")
# stats.mode(...)[0] is a length-1 array on older SciPy but a scalar on
# SciPy >= 1.11, where indexing it again with [0] raises. np.atleast_1d
# makes the same expression work on both.
print(f"The most frequent degree of the nodes found in the Graph is {np.atleast_1d(stats.mode(degrees)[0])[0]}")

The average degree of the nodes in the Graph is 1.2
The most frequent degree of the nodes found in the Graph is 1


In [22]:
# Say whether every node can be reached from every other node.
print(
    "The graph is connected"
    if nx.is_connected(graph)
    else "The graph is not connected"
)

The graph is not connected


In [23]:
# Count the connected components of the interaction graph.
print(f"There are {nx.number_connected_components(graph)} connected components in the Graph")  

There are 11 connected components in the Graph


In [24]:
# NOTE(review): same statement as the preceding cell, so it prints the same
# count again -- this cell looks like an accidental duplicate.
print(f"There are {nx.number_connected_components(graph)} connected components in the Graph")  

There are 11 connected components in the Graph


In [25]:
# Largest connected component as an independent graph.
# nx.connected_component_subgraphs was removed in NetworkX 2.4. The
# replacement is to pick the biggest node set from nx.connected_components
# and take a copy of the induced subgraph.
largest_subgraph = graph.subgraph(max(nx.connected_components(graph), key=len)).copy()

In [26]:
# Node and edge counts of the largest component (reused by the cells below).
n = largest_subgraph.number_of_nodes()
e = largest_subgraph.number_of_edges()
print(
    f"There are {n} nodes and {e} "
    "edges present in the largest component of the Graph"
)

There are 5 nodes and 4 edges present in the largest component of the Graph


In [27]:
# First random Barabasi-Albert graph with as many nodes as the largest component.
# A fixed seed makes the cell reproducible on Restart & Run All.
# NOTE(review): the second argument is the number of edges each new node
# attaches with (it must be >= 1 and < n), not a total edge count -- confirm
# that passing the component's edge count ``e`` is intended.
g1 = nx.generators.random_graphs.barabasi_albert_graph(n, e, seed=0)

In [28]:
# Second random Barabasi-Albert graph, built with the same (n, e) arguments as g1.
# A fixed seed makes the cell reproducible on Restart & Run All.
# NOTE(review): the second argument is the number of edges each new node
# attaches with (it must be >= 1 and < n), not a total edge count -- confirm
# that passing the component's edge count ``e`` is intended.
g2 = nx.generators.random_graphs.barabasi_albert_graph(n, e, seed=1)

In [29]:
# optimize_graph_edit_distance is a generator of successive approximations of
# the edit distance between g1 and g2; print each one as it is produced.
# The loop only ends once the search is exhausted.
for v in nx.optimize_graph_edit_distance(g1, g2):
    print(v)

0.0
