In [25]:
import os
import numpy as np
import pandas as pd
import matplotlib as plt
import collections
import operator

# Absolute path to current working directory
home = os.getcwd()

In [7]:
# Import cleaned data (if you used the "10oct Merging files"-script and want to import 10oct data, then use )
data = pd.read_json(home + '/8oct_pre_processed.json')

# Calculate simple influence score for each tweet
Simple influence score is calculated using following formula:  

$ S_{infl} = \alpha*|favorites|+\beta*|retweets| $

, where $ \alpha $ represents the weight given for favorite and $ \beta $ weight given for retweet. Here we are using values $ \alpha = 1 $ and $ \beta = 3 $, which give a very rough approximate of the amount of social influence gained by each tweet.

In [10]:
data['infl_score'] = data['favorite_count'] + (3 * data['retweet_count'])

# Top 20 lists
# 1. Retweet count by person
## Including retweets

In [12]:
# Retweets by person
data.groupby(['retweet_count', 'user_name'])['infl_score','favorite_count','user_followers_count'].sum()[-1:-20:-1]

Unnamed: 0_level_0,Unnamed: 1_level_0,infl_score,favorite_count,user_followers_count
retweet_count,user_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
86211,Irune,258633,0,9
86211,Ferran Quinto,258633,0,61
86210,Eli,258630,0,7
86207,smon.bcn,258621,0,0
86207,SandraCBH,258621,0,314
86207,Pol Roig,258621,0,5
82717,D. Justicia,248151,0,9
78483,al ☀,235449,0,434
58465,thatERguy,175395,0,5508
56994,Lluc Caminacels,170982,0,643


## Excluding retweets

In [13]:
# Retweets by person
data[data['is_retweet'] == 0].groupby(['retweet_count','user_screen_name'])['infl_score','favorite_count','user_followers_count'].sum()[-1:-20:-1]

Unnamed: 0_level_0,Unnamed: 1_level_0,infl_score,favorite_count,user_followers_count
retweet_count,user_screen_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12530,DaniMateoAgain,49459,11869,1298240
11832,SitaJurado,41000,5504,1969
11672,MACamposP,43801,8785,6539
8840,KofiAnnan,35931,9411,196645
7375,policia,35971,13846,2977466
7264,SPAINonymous,23908,2116,5386
6880,Ruptly,22611,1971,49574
6259,depera1,23814,5037,6408
6203,directe,20575,1966,40345
6106,Berlustinho,21624,3306,74843


# 2. Favorite count by person
## Including retweets

In [14]:
# Favorites by person
data.groupby(['favorite_count', 'user_screen_name'])['infl_score','retweet_count','user_followers_count'].sum()[-1:-20:-1]

Unnamed: 0_level_0,Unnamed: 1_level_0,infl_score,retweet_count,user_followers_count
favorite_count,user_screen_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13846,policia,35971,7375,2977466
11869,DaniMateoAgain,49459,12530,1298240
9993,VBaena,25779,5262,842
9411,KofiAnnan,35931,8840,196645
8785,MACamposP,43801,11672,6539
6073,JoanTarda,14185,2704,188481
5504,SitaJurado,41000,11832,1969
5054,Elaguijon_,18134,4360,43318
5037,depera1,23814,6259,6408
4070,SandraGolpeA3TV,7067,999,27468


## Excluding retweets

In [15]:
# Favorites by person
data[data['is_retweet'] == 0].groupby(['favorite_count', 'user_screen_name'])['infl_score','retweet_count','user_followers_count'].sum()[-1:-20:-1]

Unnamed: 0_level_0,Unnamed: 1_level_0,infl_score,retweet_count,user_followers_count
favorite_count,user_screen_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13846,policia,35971,7375,2977466
11869,DaniMateoAgain,49459,12530,1298240
9993,VBaena,25779,5262,842
9411,KofiAnnan,35931,8840,196645
8785,MACamposP,43801,11672,6539
6073,JoanTarda,14185,2704,188481
5504,SitaJurado,41000,11832,1969
5054,Elaguijon_,18134,4360,43318
5037,depera1,23814,6259,6408
4070,SandraGolpeA3TV,7067,999,27468


# 3. Persons with high social influence (over the time period)
## Including retweets

In [16]:
# Influence by person
data.groupby(['infl_score', 'user_screen_name'])['favorite_count','retweet_count','user_followers_count'].sum()[-1:-20:-1]

Unnamed: 0_level_0,Unnamed: 1_level_0,favorite_count,retweet_count,user_followers_count
infl_score,user_screen_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
258633,ferran_quinto03,0,86211,61
258633,Irune05,0,86211,9
258630,Eeliixx,0,86210,7
258621,polroigr,0,86207,5
258621,bakitamelbourne,0,86207,314
258621,ahorasimeveo,0,86207,0
248151,diez_justicia,0,82717,9
235449,aleana_cataluna,0,78483,434
175395,thatERguy,0,58465,5508
170982,Catalonia_Free,0,56994,643


## Excluding retweets

In [17]:
data[data['is_retweet'] == 0].groupby(['infl_score', 'user_screen_name'])['favorite_count','retweet_count','user_followers_count'].sum()[-1:-20:-1]

Unnamed: 0_level_0,Unnamed: 1_level_0,favorite_count,retweet_count,user_followers_count
infl_score,user_screen_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
49459,DaniMateoAgain,11869,12530,1298240
43801,MACamposP,8785,11672,6539
41000,SitaJurado,5504,11832,1969
35971,policia,13846,7375,2977466
35931,KofiAnnan,9411,8840,196645
25779,VBaena,9993,5262,842
23908,SPAINonymous,2116,7264,5386
23814,depera1,5037,6259,6408
22611,Ruptly,1971,6880,49574
21624,Berlustinho,3306,6106,74843


# 4. Most frequent tweeters

## Including retweets

In [27]:
counted_inc = collections.Counter(data['user_screen_name'])
tweet_freq_inc = sorted(counted_inc.items(), key=operator.itemgetter(1))

In [28]:
tweet_freq_inc[:-20:-1]

[('CatalanRobot', 276),
 ('Rojaconarte', 188),
 ('REscolaMarco', 181),
 ('SCC_Cerdanyola', 179),
 ('Belit88', 169),
 ('Anday26209982', 157),
 ('LolaLop22360512', 147),
 ('JuntsPelSi_Cat', 145),
 ('nu_damian', 140),
 ('kinbota', 135),
 ('huguet_gabriel', 129),
 ('Diequi', 125),
 ('Wittgenstein_jm', 124),
 ('asiandafrica', 123),
 ('pallaron12', 121),
 ('AngelesBN3', 120),
 ('Mazius069', 117),
 ('marisaparrilla', 116),
 ('aneshali', 116)]

## Excluding retweets

In [29]:
counted_exc = collections.Counter(data['user_screen_name'][data['is_retweet'] == 0])
tweet_freq_exc = sorted(counted_exc.items(), key=operator.itemgetter(1))

In [30]:
tweet_freq_exc[:-20:-1]

[('lextresabogados', 76),
 ('CaraotaDigital', 66),
 ('smujal', 58),
 ('JuntsPelSi_Cat', 56),
 ('falcarazfer', 47),
 ('notiven', 46),
 ('Estrellas_Siete', 45),
 ('bitMomentum', 45),
 ('larosadereus', 44),
 ('Nuria_amb_seny', 43),
 ('Sanfermin00', 43),
 ('oscargeek', 41),
 ('rogerangela', 38),
 ('fractaltrend', 38),
 ('Mela_Lombardi', 38),
 ('ElNacionalWeb', 37),
 ('Marta_catalonia', 37),
 ('maximors63', 36),
 ('BlaancaNiieves', 35)]

## Top50 tweeters with no original tweets

In [31]:
top50 = set([tupl[0] for tupl in tweet_freq_inc[:-51:-1]])
top50_no_orig = top50 - set([tupl[0] for tupl in tweet_freq_exc])

In [32]:
top50_no_orig

{'Anday26209982',
 'AndreuViu',
 'AngelesBN3',
 'CatalanRobot',
 'Fl1chy',
 'LolaLop22360512',
 'LolaRicoMoral1',
 'MarcoDaCostaFX',
 'Mazius069',
 'SBachs',
 'carmelomayorv',
 'cfrd1909cat',
 'cienfuegos66',
 'dle9',
 'huguet_gabriel',
 'jasavalena',
 'josezpt',
 'kamipony',
 'lolibejarano',
 'nu_damian',
 'pallaron12',
 'quela68',
 'roserous_rosa',
 'vidaconpasion8'}

### Likely bots?
Let's see how much these users hold influence in the social media

In [33]:
data_top_bool = [True if str(row) in top50_no_orig else False for row in data['user_screen_name']]

In [34]:
data_in_top = data[data_top_bool]

In [35]:
data_in_top.groupby(['retweet_count', 'user_id'])['favorite_count','retweet_count','user_followers_count'].sum()[:-24:-1]

Unnamed: 0_level_0,Unnamed: 1_level_0,favorite_count,retweet_count,user_followers_count
retweet_count,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
29631,887705657440444416,0,29631,161
29631,574681319,0,29631,1498
29630,3256639401,0,29630,433
18162,3256639401,0,18162,433
12531,887705657440444416,0,12531,161
12530,772670580562784256,0,12530,401
12530,3256639401,0,12530,433
11832,887705657440444416,0,11832,161
11832,772670580562784256,0,11832,401
11832,574681319,0,11832,1497


# 5. Averaged Social Influence per tweet

# Hashtags

In [37]:
all_htags = [htag for row in list(data['hashtags']) for htag in row]
all_htags = list(set(all_htags)) #Select unique ones

In [38]:
len(all_htags) # Amount of all unique hashtags

6789

# Most frequent hashtags

## Including retweets

In [39]:
counted = collections.Counter([htag for row in list(data['hashtags']) for htag in row])

In [40]:
sorted_hashtags_freq = sorted(counted.items(), key=operator.itemgetter(1))

In [41]:
sorted_hashtags_freq[:-30:-1]

[('recuperemelseny', 158901),
 ('cataluña', 21163),
 ('8oct', 19891),
 ('sanlúcar', 17175),
 ('barcelona', 14493),
 ('8octbcnespanola', 8812),
 ('catalonia', 8322),
 ('recuperem', 4911),
 ('parlemhablemos', 4554),
 ('spain', 3828),
 ('catalanreferendum', 3540),
 ('españa', 2315),
 ('objetivocataluña', 2034),
 ('envivo', 1850),
 ('catalunya', 1506),
 ('vivaespaña', 1412),
 ('8octbcnespañola', 1189),
 ('alertaultra', 1171),
 ('cataluñaallímitea3', 1164),
 ('viscacatalunya', 1127),
 ('estamosporti', 1078),
 ('estamosportodos', 1050),
 ('1o', 1038),
 ('10odeclaració', 1013),
 ('ligadelajusticia', 960),
 ('felizdomingo', 895),
 ('mayoriasilenciosa', 753),
 ('urquinaona', 753),
 ('parlem', 620)]

## Excluding retweets

In [42]:
data_wo_rt = data[data['is_retweet'] == 0]['hashtags']
counted = collections.Counter([htag for row in list(data_wo_rt) for htag in row])
sorted_hashtags_freq = sorted(counted.items(), key=operator.itemgetter(1))

In [43]:
sorted_hashtags_freq[:-50:-1] 

[('recuperemelseny', 9342),
 ('catalonia', 1190),
 ('8oct', 790),
 ('8octbcnespanola', 781),
 ('cataluña', 762),
 ('objetivocataluña', 756),
 ('barcelona', 574),
 ('catalanreferendum', 562),
 ('parlemhablemos', 516),
 ('spain', 364),
 ('españa', 339),
 ('catalunya', 319),
 ('justiceleague', 235),
 ('felizdomingo', 219),
 ('diesclautv3', 178),
 ('democracia', 177),
 ('rajoy', 141),
 ('puigdemont', 129),
 ('news', 119),
 ('mayoriasilenciosa', 112),
 ('1oct', 100),
 ('independence', 97),
 ('dui', 96),
 ('noestaissolos', 96),
 ('الجمهور_السعودي_يساند_مصر', 95),
 ('independencia', 95),
 ('parlem', 95),
 ('cataluna', 90),
 ('missyouonedirection', 87),
 ('15oct', 86),
 ('referendumcat', 85),
 ('independència', 74),
 ('mesaza', 72),
 ('justicia', 69),
 ('cataluñaallímitea3', 68),
 ('8octbcnespañola', 67),
 ('noticias', 63),
 ('catalan', 63),
 ('recuperarelseny', 61),
 ('borrell', 60),
 ('8octubre', 60),
 ('internacional', 60),
 ('vivaespaña', 59),
 ('seny', 58),
 ('venezuela', 56),
 ('vivalavi

# Picking hashtags for further analysis
Maybe we could try and categorize these to __pro, against and neutral__ categories for further analysis?

Here's a quick hand-picked list of what seems relevant:

In [44]:
rel_htags = ['catalonia','8oct','barcelona','8octbcnespanola','cataluña','catalunya',
             'democracia','recuperemelseny','objetivocataluña','recuperem','parlemhablemos',
             'catalanreferendum','puigdemont','rajoy','independence','parlem','independencia'
             'catalan','recuperarelseny','referendumcat','independència','justicia']