In [2]:
import pandas as pd
import re
import time

In [3]:
df = pd.read_csv('tweets_users_period.csv')
df.sample()

Unnamed: 0,id,text,user_id,user_name,to_date
34502,1050762007727751168,"Republicans decry Dems 'mob rule,' flipping sc...",219913524,moraro456,2018-10-12


## Detect rt

In [4]:
def detect_rt(row):
    PATTERN = 'RT @'
    
    if PATTERN in row['text']:
        initial_rt = row['text'].index(PATTERN)
        text_from_rt = row['text'][initial_rt + len(PATTERN):]
        rt_from = text_from_rt[:text_from_rt.find(':')].strip()
        row['rt_from'] = rt_from

    return row

In [5]:
df = df.apply(detect_rt, axis=1)
df.head()

Unnamed: 0,id,rt_from,text,to_date,user_id,user_name
0,1006016894560763904,,I don't know where the weekend went. I was so ...,2018-06-11,27823262,donkolenda
1,1006017578194612224,,"God our Father, as we have celebrated today th...",2018-06-11,27823262,donkolenda
2,1006120674786725888,,@netdog713 @GOP We should abolish political pa...,2018-06-11,17322758,rob_blue
3,1006125685675646976,,Bring it. https://t.co/Rf4Bsi2V0r,2018-06-11,17322758,rob_blue
4,1006130726725672960,,My God he looks like a manatee with an ill-fit...,2018-06-11,17322758,rob_blue


## Detect mentions

In [6]:
def detect_mentions(row):
    PATTERN = '@'

    tweet_text = row['text']
    tweet_text = tweet_text.replace('RT @', '')
    if PATTERN in tweet_text:

        mentions_users = []

        indexes = [m.start() for m in re.finditer(PATTERN, tweet_text)]
        for index in indexes:
            text_from_mt = tweet_text[index + len(PATTERN):]
            mt_from = text_from_mt[:text_from_mt.find(' ')].strip()
            mentions_users.append(mt_from)
        
        row['mentions'] = mentions_users

    return row

In [7]:
df = df.apply(detect_mentions, axis=1)
df.sample()

Unnamed: 0,id,mentions,rt_from,text,to_date,user_id,user_name
39175,1055640460792713216,"[nytimesphoto:, realdonaldtrump]",,From '@nytimesphoto: #LIAR #moroninchief #sexu...,2018-10-26,32552440,MichelleMazuros


## Get followers

In [8]:
import tweepy

# Go to http://apps.twitter.com and create an app.
# The consumer key and secret will be generated for you after
consumer_key = ''
consumer_secret = ''

# The access tokens can be found on your applications's Details
# page located at https://dev.twitter.com/apps (located
# under "Your access token")
access_token = ''
access_token_secret = ''

In [9]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

In [10]:
api = tweepy.API(auth)

In [11]:
user_names = df['user_name'].unique()
user_followers = []

for username in user_names:
    try:
        user = api.get_user(username)
        followers_counts = user.followers_count
        user_followers.append({'user_name':username,
                               'followers_count':followers_counts})
        time.sleep(1)
    except:
        print('User {} not found'.format(username))

User GAngeli1993 not found
User slbasketballmom not found
User robsaghafi not found
User JohnnyTSollitto not found


In [12]:
df_user_followers = pd.DataFrame(user_followers)
df_user_followers.sample()

Unnamed: 0,followers_count,user_name
108,4544,ECHOisthename


## Interactor Ratio
- RT3 Number of users who have retweeted author’s tweets.
- M4 Number of users mentioning the author
- F1 Number of followers

In [13]:
user_names = df['user_name'].unique()

### RT3

In [14]:
rt3 = []

for user in user_names:
    df_rt_users = df.dropna(subset=['rt_from'])[['rt_from','user_name']].drop_duplicates()
    n_rt = len(df_rt_users.loc[df_rt_users['rt_from'] == user.strip()])
    rt3.append({'user_name':user, 'rt3':n_rt})

In [16]:
df_rt3 = pd.DataFrame(rt3)
df_rt3.sort_values('rt3', ascending=False).sample()

Unnamed: 0,rt3,user_name
197,0,KatiaPriceless


### M4

In [17]:
df_mentions = df.dropna(subset=['mentions'])[['mentions','user_name']]
df_mentions.sample()

Unnamed: 0,mentions,user_name
30288,[],Naztitle01


In [18]:
df_mentions_tuples = df_mentions.mentions.apply(pd.Series) \
                        .merge(df_mentions, right_index = True, left_index = True) \
                        .drop(["mentions"], axis = 1) \
                        .melt(id_vars = ['user_name'], value_name = "user_mention") \
                        .drop("variable", axis = 1) \
                        .dropna().drop_duplicates()
    
df_mentions_tuples.sample()

Unnamed: 0,user_name,user_mention
14423,StaceyPaige27,patricksean333


In [20]:
m4 = []

for user in user_names:
    n_rt = len(df_mentions_tuples.loc[df_mentions_tuples['user_mention'] == user.strip()])
    m4.append({'user_name':user, 'm4':n_rt})

In [21]:
df_m4 = pd.DataFrame(m4)
df_m4.sort_values('m4', ascending=False).sample()

Unnamed: 0,m4,user_name
130,0,WJSClanton


### Interactor Ratio

In [22]:
df_ff_rt3 = pd.merge(df_user_followers, df_rt3, how='outer', on='user_name')
df_ir = pd.merge(df_ff_rt3, df_m4, how='outer', on='user_name')
df_ir.sample()

Unnamed: 0,followers_count,user_name,rt3,m4
104,1997.0,paddywhackspub,0,0


In [23]:
df_ir['interactor_ratio']  = df_ir.apply(lambda x:(x.rt3 + x.m4)/x.followers_count ,axis=1 )

In [24]:
df_ir.sort_values('interactor_ratio', ascending=False).sample()

Unnamed: 0,followers_count,user_name,rt3,m4,interactor_ratio
103,1435.0,ILLANOIZ,0,1,0.000697


## Retweet and Mention Ratio

- #tweets of i retweeted
- #tweets of i replied
- #tweets of i

In [25]:
df_user_tweets = pd.DataFrame(df['user_name'].value_counts())\
                    .reset_index().rename({'index':'user_name', 
                                           'user_name':'n_tweets'}, axis='columns')
    
df_user_tweets.sample()

Unnamed: 0,user_name,n_tweets
148,ILLANOIZ,28


In [26]:
# tweets of i replied
# start with @
tweet_i_replied = []

for user in user_names:
    n_tweets = 0
    pattern = '@' + user.strip()
    for text in df['text'].unique():
        if pattern in text:
            if text.index(pattern) == 0:
                n_tweets = n_tweets + 1
    
    tweet_i_replied.append({'user_name':user, 'tweets_replied': n_tweets})

In [27]:
tweet_i_replied = pd.DataFrame(tweet_i_replied)
tweet_i_replied.sample()

Unnamed: 0,tweets_replied,user_name
57,0,EarlPdxPearl


### RTM ratio

In [28]:
df_tweets_replied = pd.merge(df_user_tweets, tweet_i_replied, how='outer', on='user_name')
df_rtm_ratio = pd.merge(df_tweets_replied, df_rt3, how='outer', on='user_name')
df_rtm_ratio.sample()

Unnamed: 0,user_name,n_tweets,tweets_replied,rt3
241,BostonGreekgirl,2,0,0


In [29]:
df_rtm_ratio['rtm_ratio'] = df_rtm_ratio.apply(lambda x: (x.rt3 + x.tweets_replied)/x.n_tweets, axis=1)

In [30]:
df_rtm_ratio.sample()

Unnamed: 0,user_name,n_tweets,tweets_replied,rt3,rtm_ratio
255,Kanedanny09,1,0,0,0.0


### SNP

In [31]:
df_snp = pd.merge(df_ir, df_rtm_ratio, how='outer', on='user_name')
df_snp = df_snp.fillna(0)
df_snp.sample()

Unnamed: 0,followers_count,user_name,rt3_x,m4,interactor_ratio,n_tweets,tweets_replied,rt3_y,rtm_ratio
194,1215.0,KatiaPriceless,0,0,0.0,10,0,0,0.0


In [32]:
df_snp['SNP'] = (df_snp['interactor_ratio'] + df_snp['rtm_ratio'])/2

In [33]:
df_snp.sort_values('SNP', ascending=False).sample()

Unnamed: 0,followers_count,user_name,rt3_x,m4,interactor_ratio,n_tweets,tweets_replied,rt3_y,rtm_ratio,SNP
71,171.0,ModestDweeb,0,0,0.0,33,0,0,0.0,0.0
