## Network Analysis

#### Content  
Part 0: Load data and extract features  
Part 1: Data preprocessing for creating network dataset    

In [1]:
import pandas as pd
import ast
import numpy as np
import math

#### Load data

In [12]:
path_json = "data/tweets_json.json"
df = pd.read_json(path_json)[['created_at','id_str','entities','in_reply_to_user_id_str','user','retweeted_status','quoted_status']]
df.to_csv('data/kaggle.csv',index=False)

In [15]:
df = pd.read_csv('data/kaggle.csv')

In [16]:
df.head()

Unnamed: 0,created_at,id_str,entities,in_reply_to_user_id_str,user,retweeted_status,quoted_status
0,2019-11-29 12:09:16,1200386239452585984,"{'hashtags': [], 'symbols': [], 'user_mentions...",14802766.0,"{'id': 46343365, 'id_str': '46343365', 'name':...",,
1,2019-11-29 12:07:58,1200385913928527872,"{'hashtags': [], 'symbols': [], 'user_mentions...",14802766.0,"{'id': 46343365, 'id_str': '46343365', 'name':...",,
2,2019-12-07 16:17:06,1203347712403480576,"{'hashtags': [], 'symbols': [], 'user_mentions...",454933267.0,"{'id': 2510990164, 'id_str': '2510990164', 'na...",,
3,2019-12-07 10:10:36,1203255481164283904,"{'hashtags': [], 'symbols': [], 'user_mentions...",,"{'id': 997256304296189952, 'id_str': '99725630...",{'created_at': 'Fri Dec 06 15:37:03 +0000 2019...,
4,2019-12-07 09:24:20,1203243838317445120,"{'hashtags': [], 'symbols': [], 'user_mentions...",454933267.0,"{'id': 128520495, 'id_str': '128520495', 'name...",,


#### preprocessing

In [17]:
def mention_name(row):
    user_mentions = ast.literal_eval(row['entities']).get('user_mentions')
    if user_mentions != []:
        mention_name = []
        for i in range(len(user_mentions)):
            mention_name.append(user_mentions[i].get('screen_name'))
    else:
        mention_name = math.nan
    return mention_name

def mention_id(row):
    user_mentions = ast.literal_eval(row['entities']).get('user_mentions')
    if user_mentions != []:
        mention_id = []
        for i in range(len(user_mentions)):
            mention_id.append(user_mentions[i].get('id_str'))
    else:
        mention_id = math.nan
    return mention_id

def user_id(row):
    user_id = ast.literal_eval(row['user']).get('id_str')
    return user_id

def retweet_id(row):
    if type(row['retweeted_status']) == str:
        retweet_id = ast.literal_eval(row['retweeted_status']).get('user').get('id_str')
    else:
        retweet_id = math.nan
    return retweet_id

def quote_id(row):
    if type(row['quoted_status']) == str:
        quote_id = ast.literal_eval(row['quoted_status']).get('user').get('id_str')
    else:
        quote_id = math.nan
    return quote_id

def user_name(row):
    user_name = ast.literal_eval(row['user']).get('name')
    return user_name

In [21]:
df['mention_name'] = df.apply(mention_name,axis=1)
df['mention_id'] = df.apply(mention_id,axis=1)
df['user_id'] = df.apply(user_id,axis=1)
df['retweet_id'] = df.apply(retweet_id,axis=1)
df['quote_id'] = df.apply(quote_id,axis=1)
df['user_name'] = df.apply(user_name,axis=1)

In [22]:
df = df.drop(['entities','user','retweeted_status','quoted_status'],axis=1)
df = df.drop_duplicates('id_str')
df = df.drop('id_str',axis=1)

In [25]:
def to_dummy(row):
    if math.isnan(row['in_reply_to_user_id_str']) == False:
        return str(row['in_reply_to_user_id_str'])
    elif type(row['retweet_id']) == str:
        return str(row['retweet_id'])
    elif type(row['quote_id']) == str:
        return str(row['quote_id'])
df['target'] = df.apply(to_dummy,axis=1)

def to_cat(row):
    if math.isnan(row['in_reply_to_user_id_str']) == False:
        return 'reply'
    elif type(row['retweet_id']) == str:
        return 'retweet'
    elif type(row['quote_id']) == str:
        return 'quote'
df['cat'] = df.apply(to_cat,axis=1)

In [26]:
df = df.drop(columns=['in_reply_to_user_id_str','retweet_id','quote_id'])

#### Merge Party dataset

In [27]:
party = pd.read_csv('data/mp_party.csv')
party = party.drop(columns='Account Name')
dict_party = party.set_index('ScreenName').to_dict()['Party']

In [54]:
def get_party(row):
    if type(row.mention_name) == float:
        return 'unknown'
    list_mention_name = ast.literal_eval(str(row.mention_name))
    for i in range(len(list_mention_name)):
        if list_mention_name[i] in dict_party.keys():
            return dict_party[list_mention_name[i]]
    return 'unknown'
df['party'] = df.apply(get_party,axis=1)

In [55]:
df = df.drop(columns=['mention_name','mention_id'])

In [58]:
def get_opinion(row):
    if row.party == 'the brexit party' or row.party == 'conservative party':
        return 'yes'
    elif row.party == 'unknown':
        return 'unknown'
    else:
        return 'no'
df['brexit'] = df.apply(get_opinion,axis=1)

In [61]:
df.head()

Unnamed: 0,created_at,user_id,user_name,target,cat,party,brexit
0,2019-11-29 12:09:16,46343365,Chris Jackson,14802766.0,reply,unknown,unknown
1,2019-11-29 12:07:58,46343365,Chris Jackson,14802766.0,reply,unknown,unknown
2,2019-12-07 16:17:06,2510990164,🇬🇧Mytinyisland 🔱,454933267.0,reply,labour party,no
3,2019-12-07 10:10:36,997256304296189952,Southeast Cambs for Europe #FBPE #DefendDemocracy,9.253127003469699e+17,retweet,labour party,no
4,2019-12-07 09:24:20,128520495,Binita ⚡️,454933267.0,reply,labour party,no


In [60]:
df.to_csv('data/edge.csv', index=False)