# Twitter MBTI web scraper

<br>

<div class="alert alert-info">
    
<p><b>Introduction:</b> Using Twitter's API and tweepy (a Python wrapper for it), we looked for users who state in their Twitter bio the MBTI type they consider themselves to be. After finding these users, we used their screen names to extract up to 100 of their tweets, or as many as they had (not counting retweets). This dataset will then allow us to test our models and see how they behave.</p>
    
</div>

In [257]:
import datetime
import os
import time
import webbrowser

import pandas as pd
import tweepy
from tqdm import tqdm, tqdm_pandas

import credentials

**Note:** This notebook will not work unless you create your own `credentials.py` file defining `Consumer_Key` and `Consumer_Secret` (your Twitter API consumer key and consumer secret). This file has been left out of the repository on purpose.

In [2]:
# API credentials are read from the local credentials.py module (not in the repository).
consumer_key = credentials.Consumer_Key
consumer_secret = credentials.Consumer_Secret
# 'oob': no callback URL is used; the PIN-based flow in the cells below is used instead.
callback_uri = 'oob' # https://cf.sh/twitter/callback

In [4]:
# Build the OAuth handler used to connect to Twitter's service.
auth = tweepy.OAuthHandler(consumer_key,consumer_secret, callback_uri) 

In [6]:
# Ask the handler for the authorization URL and print it so it can be opened manually.
redirect_url = auth.get_authorization_url()
print(redirect_url)

https://api.twitter.com/oauth/authorize?oauth_token=rFAH3wAAAAABFLRyAAABdY6qF5w


In [7]:
# Open the authorization URL in the browser; the PIN shown there is needed in the next cell.
webbrowser.open(redirect_url)

True

In [None]:
# Prompt for the PIN obtained from the authorization page opened above.
user_pint_input = input("What's the pin value?")

In [None]:
# Exchange the PIN for an access token.
# These keys do not change (only if you change the consumer key or secret).
auth.get_access_token(user_pint_input)

In [10]:
# wait_on_rate_limit / wait_on_rate_limit_notify: when the rate limit is hit, tweepy waits
# and prints a "Rate limit reached. Sleeping for: N" message (visible in later outputs).
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
# Profile of the authenticated account; used below as a quick sanity check.
me = api.me()

In [None]:
# Sanity check: show the screen name and bio of our own Twitter account.
print(me.screen_name)
print(me.description)

In [13]:
user = api.get_user(" ") # put the username between the quotes

In [14]:
def extract_timeline_as_df(user, pages):
    """Collect a user's timeline into a DataFrame of scalar tweet attributes.

    Parameters
    ----------
    user : object
        Anything exposing ``timeline(page=...)`` that returns an iterable of
        status objects (here: a tweepy user).
    pages : int
        Number of timeline pages to request; pages ``0 .. pages - 1`` are asked for.
        NOTE(review): confirm whether the API treats page 0 and page 1 as the
        same page, in which case the first page would be fetched twice.

    Returns
    -------
    pandas.DataFrame
        One row per status. Columns are every status attribute whose value is
        exactly a ``str`` or an ``int`` (in first-seen order), followed by
        ``'user'`` and ``'author'`` holding the respective screen names.
    """
    # Exact type match on purpose: nested objects, dicts, lists and also bools
    # (a subclass of int) are left out, exactly as before.
    allowed_types = (str, int)

    # Fetch all requested pages before processing any of them.
    timeline_pages = [user.timeline(page=page) for page in range(pages)]

    # A list (not a set) keeps the column order deterministic between runs.
    columns = []
    tweets_data = []
    for timeline_page in timeline_pages:
        for status in timeline_page:
            single_tweet_data = {
                'user': status.user.screen_name,
                'author': status.author.screen_name,
            }
            for key, value in vars(status).items():
                if type(value) in allowed_types:
                    single_tweet_data[key] = value
                    # 'user' / 'author' are always appended at the end; never list them twice.
                    if key not in columns and key not in ('user', 'author'):
                        columns.append(key)

            tweets_data.append(single_tweet_data)

    header_cols = columns + ['user', 'author']

    return pd.DataFrame(tweets_data, columns=header_cols)

In [15]:
# Pull two timeline pages for the selected user into a DataFrame.
df_user = extract_timeline_as_df(user, pages=2)

In [None]:
# Preview the first rows of the extracted timeline.
df_user.head()

In [17]:
# Fetch the selected user's friends list through tweepy.
user_friends = user.friends()

In [18]:
# The sixteen MBTI type codes searched for in Twitter bios (upper case).
types = [
    'ISTJ', 'ISFJ', 'INFJ', 'INTJ',
    'ISTP', 'ISFP', 'INFP', 'INTP',
    'ESTP', 'ESFP', 'ENFP', 'ENTP',
    'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ',
]

In [20]:
# Print the upper-cased bio of every friend whose description mentions an MBTI type.
for friend in user_friends:
    # A profile can come back without a description; treat that as an empty bio
    # instead of crashing on None.upper().
    bio_upper = (friend.description or "").upper()
    if any(mbti_type in bio_upper for mbti_type in types):
        print(bio_upper)
        print()

INTP LII 9W8

YOUTUBE CHANNEL: THE INTROVERTED THINKER

ORIGINALLY INTP ACCOUNT (WITH HOBBY LEVEL INTEREST IN WEB DEVELOPMENT), BUT ONE DAY... EVERYTHING CHANGED WHEN I TURNED INTO THE MOON.

ENFJ | 2W3 |♌︎ ♈︎ ♋︎ | PSYCHIC BISEXUAL ENERGY | JUNGIAN PERSONALITY THEORY, INDIVIDUATION, SHADOW INTEGRATION, ASTROLOGY | BRAND STRATEGY @NOVASOLISCO

LIMINAL

INTP 4W3

INFJ | IEI-FE | 9W1 SX/SP (945) | H-EDS | ARIES ☀️ AQUARIUS 🌙 GEMINI ⤴️ | BLACK LIVES MATTER

THERE'S MORE TO TYPE THAN MEETS THE EYE. INFJ 
CERTIFIED MBTI PRACTITIONER & PHILOSOPHER OF LIFE

YOU ARE TERRIFYING AND STRANGE AND BEAUTIFUL, SOMETHING NOT EVERYONE KNOWS HOW TO LOVE • ENFP, IEE • 4W5, SX/SO, 479 • WITCH • ♓️♏️♒️



## Extracting Users with MBTI in their description

In [22]:
# We pick a Twitter account whose followers are potential candidates to have an MBTI type in their bios.
user = api.get_user("mbti_insights")

In [31]:
# Collect up to 2500 follower ids of 'mbti_insights'.
followers = []

c = tweepy.Cursor(api.followers_ids, screen_name='mbti_insights').items(2500)

cursor_exhausted = False
while not cursor_exhausted:
    try:
        follower = c.next()
        followers.append(follower)
    except StopIteration:
        # The cursor has no more ids to hand out.
        cursor_exhausted = True
    except tweepy.TweepError:
        # Wait 15 minutes, then ask the cursor again.
        time.sleep(60 * 15)


In [None]:
# Map follower id -> bio for every follower whose bio mentions an MBTI type.
followers_mbti_insights = {}
for follower in followers:
    try:
        # One lookup per follower: fetching the profile twice doubles the API calls
        # counted against the rate limit.
        bio = api.get_user(follower).description
    except tweepy.RateLimitError:
        # Rate limited: wait for the 15-minute window to reset, then move on.
        time.sleep(60 * 15)
        continue
    except tweepy.TweepError:
        # Any other API error (presumably an unavailable account): waiting will
        # not help, so just skip this follower.
        continue

    # Profiles without a description cannot match.
    if not bio:
        continue

    bio_upper = bio.upper()
    if any(mbti_type in bio_upper for mbti_type in types):
        followers_mbti_insights[follower] = bio

In [58]:
# NOTE(review): `user` is reassigned here, but the next cell still iterates the `followers`
# list that was built from 'mbti_insights' — confirm which account's followers were intended.
user = api.get_user("16Personalities")

In [59]:
# Map follower id -> bio for every follower whose bio mentions an MBTI type.
followers_with_mbti = {}
for follower in tqdm(followers):
    try:
        # One lookup per follower: fetching the profile twice doubles the API calls
        # counted against the rate limit.
        bio = api.get_user(follower).description
    except tweepy.RateLimitError:
        # Rate limited: wait for the 15-minute window to reset, then move on.
        time.sleep(60 * 15)
        continue
    except tweepy.TweepError:
        # Any other API error (presumably an unavailable account): waiting will
        # not help, so just skip this follower.
        continue

    # Profiles without a description cannot match.
    if not bio:
        continue

    bio_upper = bio.upper()
    if any(mbti_type in bio_upper for mbti_type in types):
        followers_with_mbti[follower] = bio

  3%|▎         | 64/2500 [08:09<5:29:04,  8.11s/it]Rate limit reached. Sleeping for: 375
 10%|▉         | 244/2500 [1:05:52<4:08:45,  6.62s/it] Rate limit reached. Sleeping for: 389
 12%|█▏        | 311/2500 [1:25:44<9:53:44, 16.27s/it]  Rate limit reached. Sleeping for: 101
 15%|█▌        | 375/2500 [1:41:44<5:06:14,  8.65s/it] Rate limit reached. Sleeping for: 37
 19%|█▉        | 487/2500 [2:30:14<3:01:53,  5.42s/it]   Rate limit reached. Sleeping for: 528
 22%|██▏       | 552/2500 [2:45:02<3:04:56,  5.70s/it]  Rate limit reached. Sleeping for: 546
 25%|██▍       | 615/2500 [2:59:14<3:12:45,  6.14s/it]  Rate limit reached. Sleeping for: 602
 27%|██▋       | 681/2500 [3:13:49<2:28:42,  4.91s/it]  Rate limit reached. Sleeping for: 635
 30%|██▉       | 745/2500 [3:29:17<2:03:22,  4.22s/it]  Rate limit reached. Sleeping for: 611
 32%|███▏      | 808/2500 [3:43:44<1:54:43,  4.07s/it]  Rate limit reached. Sleeping for: 649
 35%|███▍      | 874/2500 [3:58:32<1:12:47,  2.69s/it]  Rate limit 

In [74]:
# One row per matching follower: Twitter id, bio, and the MBTI codes found in the bio.
# Building the frame from the dict items also works when no follower matched
# (the transpose/reset_index approach fails on an empty dict when renaming columns).
df = pd.DataFrame(list(followers_with_mbti.items()), columns=['twitter_id', 'bio'])
# Every occurrence of a type code in the upper-cased bio, joined by spaces.
df['types'] = df["bio"].str.upper().str.findall(r"|".join(types)).apply(" ".join)

In [93]:
# Frequency of each detected label; bios naming a type more than once, or several
# types, show up as multi-code labels such as "INFJ INFJ".
df['types'].value_counts()

INFJ                                  166
INFP                                  134
INTP                                   78
ENFP                                   59
INTJ                                   59
ENTP                                   41
ENTJ                                   21
ENFJ                                   19
ISFJ                                   13
ISTP                                   13
ISTJ                                   11
ISFP                                   11
ESTP                                    9
ESFP                                    8
ESFJ                                    5
INFJ INFJ                               4
INFP INFP                               3
ESTJ                                    3
ISFJ ISFJ                               2
INFJ INFP                               1
INFP ISFP                               1
ISFJ INTP                               1
INTJ INTP                               1
INFJ ESFJ                         

In [101]:
# Collapse repeated MBTI codes inside each label, keeping the first-seen order
# (e.g. "INFJ INFJ" -> "INFJ").
corrected_types = [
    " ".join(dict.fromkeys(label.split()))
    for label in df['types']
]

In [103]:
# Replace the raw matches with the de-duplicated labels.
df['types'] = corrected_types

In [105]:
# Save the users and their detected MBTI types, stamped with today's date.
today = datetime.datetime.today().strftime('%Y_%m_%d')
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs('twitter_users', exist_ok=True)
df.to_csv('twitter_users/results_'+today+'.csv', index=False, header=True)

In [273]:
def extract_followers(user=user, number_followers=2500):
    """Return up to `number_followers` follower ids of a Twitter account.

    Parameters
    ----------
    user : str or object with a ``screen_name`` attribute
        The account whose followers are listed. The default is whatever the
        notebook-level ``user`` variable held when this cell was executed.
    number_followers : int
        Maximum number of follower ids to collect.

    Returns
    -------
    list
        The follower ids yielded by the cursor, in the order received.
    """
    # Elsewhere in the notebook `user` holds a fetched user object rather than a
    # plain string; the API call needs the screen name in either case.
    screen_name = getattr(user, 'screen_name', user)

    followers_list = []

    c = tweepy.Cursor(api.followers_ids, screen_name=screen_name).items(number_followers)

    while True:
        try:
            followers_list.append(c.next())
        except tweepy.TweepError:
            # Back off for 15 minutes, then ask the cursor again.
            # NOTE(review): a persistent, non-rate-limit error would retry forever here.
            time.sleep(60 * 15)
        except StopIteration:
            break

    return followers_list

In [281]:
def extract_descriptions(list_followers):
    """Return ``{follower_id: bio}`` for followers whose bio mentions an MBTI type.

    Parameters
    ----------
    list_followers : iterable
        Twitter user ids (or anything ``api.get_user`` accepts).

    Returns
    -------
    dict
        Maps each follower whose upper-cased description contains one of the
        codes in the notebook-level ``types`` list to the original description.
        Followers that cannot be fetched or have no description are skipped.
    """
    dict_user_description = {}

    for follower in tqdm(list_followers):
        try:
            # One lookup per follower: fetching the profile twice doubles the
            # API calls counted against the rate limit.
            bio = api.get_user(follower).description
        except tweepy.RateLimitError:
            # Rate limited: wait for the 15-minute window to reset, then move on.
            time.sleep(60 * 15)
            continue
        except tweepy.TweepError:
            # Any other API error (presumably an unavailable account): waiting
            # will not help, so just skip this follower.
            continue

        # Profiles without a description cannot match.
        if not bio:
            continue

        bio_upper = bio.upper()
        if any(mbti_type in bio_upper for mbti_type in types):
            dict_user_description[follower] = bio

    return dict_user_description

In [294]:
user = "mbtitime"

followers_list = extract_followers(user=user, number_followers=4000)
print('Total number of followers', len(followers_list))
# Build the lookup of already-processed ids once; rebuilding the list for every
# follower made this filter quadratic.
already_analyzed = set(df.twitter_id.to_list())
res = [i for i in followers_list if i not in already_analyzed]
print('Followers minus already analyzed', len(res))

Total number of followers 4000
Followers minus already analyze 3918


In [296]:
# Look up every not-yet-analyzed follower and keep the bios that mention an MBTI type.
dict_descriptions = extract_descriptions(res)

  2%|▏         | 63/3918 [04:18<3:56:01,  3.67s/it]Rate limit reached. Sleeping for: 638
  3%|▎         | 123/3918 [19:14<3:40:54,  3.49s/it]  Rate limit reached. Sleeping for: 649
  5%|▍         | 182/3918 [34:16<3:25:00,  3.29s/it]   Rate limit reached. Sleeping for: 651
  6%|▌         | 242/3918 [49:26<3:58:09,  3.89s/it]   Rate limit reached. Sleeping for: 646
  8%|▊         | 304/3918 [1:04:21<3:31:40,  3.51s/it]   Rate limit reached. Sleeping for: 657
  9%|▉         | 363/3918 [1:19:26<4:34:55,  4.64s/it]   Rate limit reached. Sleeping for: 654
 11%|█         | 422/3918 [1:35:24<4:44:29,  4.88s/it]   Rate limit reached. Sleeping for: 600
 12%|█▏        | 484/3918 [1:49:54<3:26:43,  3.61s/it]   Rate limit reached. Sleeping for: 637
 14%|█▍        | 545/3918 [2:05:02<3:44:41,  4.00s/it]   Rate limit reached. Sleeping for: 635
 16%|█▌        | 611/3918 [2:20:49<4:20:12,  4.72s/it]   Rate limit reached. Sleeping for: 594
 17%|█▋        | 671/3918 [2:35:59<5:21:01,  5.93s/it]   Rate l

In [298]:
def create_dataframe(dictionary_to_convert):
    """Convert a ``{twitter_id: bio}`` dictionary into a DataFrame.

    Parameters
    ----------
    dictionary_to_convert : dict
        Maps a Twitter user id to that user's bio text.

    Returns
    -------
    pandas.DataFrame
        Columns ``'twitter_id'``, ``'bio'`` and ``'types'``; ``'types'`` holds
        every occurrence of a code from the notebook-level ``types`` list found
        in the upper-cased bio, joined by single spaces. An empty dictionary
        yields an empty frame with the same three columns.
    """
    # Building from items() keeps one row per entry and, unlike the
    # transpose/reset_index approach, also works for an empty dictionary.
    df1 = pd.DataFrame(list(dictionary_to_convert.items()), columns=['twitter_id', 'bio'])
    df1['types'] = df1["bio"].str.upper().str.findall(r"|".join(types)).apply(" ".join)
    return df1

In [303]:
def clean_mbti_types(dataframe, column):
    """Collapse repeated MBTI codes in each cell of `column`.

    Some Twitter descriptions repeat the same type; each space-separated code is
    kept once, in the order it first appears. Returns a list with one cleaned
    string per row of `dataframe[column]`.
    """
    def _unique_in_order(label):
        # dict keys preserve insertion order, so this drops repeats only.
        return " ".join(dict.fromkeys(label.split()))

    return [_unique_in_order(label) for label in dataframe[column]]

In [299]:
# Turn the collected bios into a DataFrame, then collapse repeated MBTI codes per user.
df_mbtitime = create_dataframe(dict_descriptions)
df_mbtitime['types'] = clean_mbti_types(dataframe=df_mbtitime, column="types")

In [306]:
# Save the 'mbtitime' users and their detected MBTI types, stamped with today's date.
# NOTE(review): this uses the same file name pattern as the earlier results export, so
# running both on the same day overwrites the first file — confirm this is intended.
today = datetime.datetime.today().strftime('%Y_%m_%d')
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs('twitter_users', exist_ok=True)
df_mbtitime.to_csv('twitter_users/results_'+today+'.csv', index=False, header=True)

## Extracting tweets from selected users

In [249]:
# Empty frame (one row per user: id, screen name, joined tweets) filled by the loop below.
data = pd.DataFrame(columns=['user','name','tweets'])

In [250]:
def extract_tweets(twitter_user, max_tweets=100):
    """Fetch a user's most recent tweets and return them as one record.

    Parameters
    ----------
    twitter_user : int or str
        Identifier passed to ``api.get_user`` (the notebook passes Twitter ids).
    max_tweets : int, optional
        Maximum number of tweets to collect (retweets are excluded). Defaults to
        100, the value that was previously hard-coded.

    Returns
    -------
    dict
        ``{'user': <id>, 'name': <screen name>, 'tweets': <texts joined by ' ||| '>}``.
        ``'tweets'`` is an empty string when the user has no tweets to return.

    Notes
    -----
    NOTE(review): ``status.text`` may be a truncated version of long tweets unless
    the API is asked for the extended form — confirm before relying on full texts.
    """
    user = api.get_user(twitter_user)
    row = {'user': user.id, 'name': user.screen_name}

    # The cursor yields individual statuses (not pages), capped at `max_tweets`.
    statuses = tweepy.Cursor(
        api.user_timeline, screen_name=row['name'], include_rts=False
    ).items(max_tweets)

    row['tweets'] = ' ||| '.join(status.text for status in statuses)

    return row

In [251]:
# Collect the records in a plain list and build the frame once at the end:
# DataFrame.append copied the whole frame on every call and is no longer
# available in pandas >= 2.0. `tweet_rows` also keeps partial results if the
# loop is interrupted.
tweet_rows = []
for i in tqdm(df['twitter_id'].to_list()):
    try:
        tweet_rows.append(extract_tweets(i))

    except tweepy.RateLimitError:
        # Rate limited: wait for the 15-minute window to reset, then move on.
        time.sleep(60 * 15)
        continue

    except tweepy.TweepError:
        # Any other API error (presumably a protected or unavailable account):
        # waiting will not help, so just skip this user.
        continue

new_tweet_rows = pd.DataFrame(tweet_rows, columns=data.columns)
# Keep whatever `data` already held and add the new rows after it.
data = new_tweet_rows if data.empty else pd.concat([data, new_tweet_rows], ignore_index=True)

100%|██████████| 687/687 [23:22:46<00:00, 122.51s/it]    


In [264]:
# Save the collected tweets, stamped with today's date.
today = datetime.datetime.today().strftime('%Y_%m_%d')
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs('tweets_from_users', exist_ok=True)
data.to_csv('tweets_from_users/tweets_'+today+'.csv', index=False, header=True)

In [307]:
# Collect the records in a plain list and build the frame once at the end:
# DataFrame.append copied the whole frame on every call and is no longer
# available in pandas >= 2.0. `mbtitime_tweet_rows` also keeps partial results
# if the loop is interrupted.
mbtitime_tweet_rows = []

for i in tqdm(df_mbtitime['twitter_id'].to_list()):
    try:
        mbtitime_tweet_rows.append(extract_tweets(i))

    except tweepy.RateLimitError:
        # Rate limited: wait for the 15-minute window to reset, then move on.
        time.sleep(60 * 15)
        continue

    except tweepy.TweepError:
        # Any other API error (presumably a protected or unavailable account):
        # waiting will not help, so just skip this user.
        continue

df_mbtitime_tweets = pd.DataFrame(mbtitime_tweet_rows, columns=['user','name','tweets'])

100%|██████████| 550/550 [39:17:29<00:00, 257.18s/it]      


In [310]:
# Save the tweets of the 'mbtitime' users, stamped with today's date.
# NOTE(review): this uses the same file name pattern as the earlier tweets export, so
# running both on the same day overwrites the first file — confirm this is intended.
today = datetime.datetime.today().strftime('%Y_%m_%d')
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs('tweets_from_users', exist_ok=True)
df_mbtitime_tweets.to_csv('tweets_from_users/tweets_'+today+'.csv', index=False, header=True)