In [64]:
import pandas as pd
import os
import numpy as np
import csv
import pytz
import tweepy
import time
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from datetime import datetime
# A package which could be used to check whether an account is a bot
import botometer

from collections import Counter

In [2]:
tweet_2018_path_raw = r"XXXXX"
tweet_2017_path_raw = r"XXXXX"
tweet_2016_path_raw = r"XXXXX"
tweet_2017_path = r"XXXXX"
tweet_combined_path = r"XXXXX"
check_file_path = r"XXXXX"
desktop = r"XXXXX"
saving_path = r"XXXXX"
tweet_filtering_path = r"XXXXX"

In [3]:
# Hong Kong and Shanghai share the same time zone.
# Hence, we transform the utc time in our dataset into Shanghai time
time_zone_hk = pytz.timezone('Asia/Shanghai')

In [4]:
# Set some account information
# For more details, please go to: https://github.com/IUNetSci/botometer-python
mashape_key = "XXXXX"
twitter_app_auth = {
    'consumer_key': "XXXXX",
    'consumer_secret': "XXXXX",
    'access_token': "XXXXX",
    'access_token_secret': "XXXXX",
  }
bom = botometer.Botometer(wait_on_ratelimit=True,
                          mashape_key=mashape_key,
                          **twitter_app_auth)

auth = OAuthHandler("XXXXX", "XXXXX")

auth.set_access_token("XXXXX", "XXXXX")

api = tweepy.API(auth)

# Construct the API instance
api = tweepy.API(auth)

## 1. Firstly, define some functions here

In [5]:
# read multiple csv files from a local directory
def read_text_from_multi_csvs(path):
    all_csv_files = os.listdir(path)
    dataframes = []
    for file in all_csv_files:
        dataframe = pd.read_csv(os.path.join(path, file), encoding='latin-1', dtype='str', 
                                quoting=csv.QUOTE_NONNUMERIC)
        dataframes.append(dataframe)
    combined_dataframes = pd.concat(dataframes, sort=True)
    return combined_dataframes

# Function used to output a pandas dataframe for each user based on the user account number
def derive_dataframe_for_each_user(df, all_users):
    dataframes = []
    for user in all_users:
        dataframes.append(df.loc[df['user_id_str']==user])
    return dataframes


# Based on the dataframe for each user, compute the time range between his or her first tweet and last tweet
def compute_time_range_for_one_user(df):
    user_id_str = list(df['user_id_str'])[0]
    first_row_time_object = list(df.head(1)['hk_time'])[0]
    end_row_time_object = list(df.tail(1)['hk_time'])[0]
    time_range = end_row_time_object - first_row_time_object
    return (user_id_str, time_range.days)

# Add a new colume named hk_time
def get_hk_time(df):
    changed_time_list = []
    for _, row in df.iterrows():
        time_to_change = datetime.strptime(row['created_at'], '%a %b %d %H:%M:%S %z %Y')
        # get the hk time
        changed_time = time_to_change.astimezone(time_zone_hk)
        changed_time_list.append(changed_time)
    df['hk_time'] = changed_time_list
    return df

def check_id_diff(tweet_id_set, bot_id_set):
    wrong_id_list = []
    for index in tweet_id_set:
        if index in bot_id_set:
            pass
        else:
            wrong_id_list.append(index)
    return wrong_id_list

In [6]:
def delete_bots_have_same_geoinformation(df, prop_threshold=0.70):
    users = set(list(df['user_id_str']))
    bot_account = []
    for user in users:
        dataframe = df.loc[df['user_id_str']==user]
        lat_counter = Counter(dataframe['lat'])
        lon_counter = Counter(dataframe['lon'])
        decide = (compute_the_highest_proportion_from_counter(lat_counter, prop_threshold)) or (compute_the_highest_proportion_from_counter(lon_counter, prop_threshold))
        # If only one unqiue geoinformation is found and more than 10 tweets are posted, we regard this account as bot
        if decide:
            bot_account.append(user)
        else:
            pass
    cleaned_df = df.loc[~df['user_id_str'].isin(bot_account)]
#     cleaned_df.to_pickle(os.path.join(saving_path, file_name))
    return cleaned_df

def compute_the_highest_proportion_from_counter(counter_dict, prop_threshold):
    total_count = sum(counter_dict.values())
    result = False
    for latitude in list(counter_dict.keys()):
        if counter_dict[latitude]/total_count > prop_threshold:
            result = True
            return result
        else:
            pass
    return result

def number_of_tweet_user(df):
    user_num = len(set(df['user_id_str']))
    tweet_num = df.shape[0]
    print('Total number of tweet is: {}; Total number of user is {}'.format(tweet_num, user_num))

In [7]:
def check_bot(id_str):
    result = bom.check_account(id_str)
    return result['cap']['universal']

## 2. Read raw tweets files

### 2.1 Load the tweet 2018 raw datasets

In [12]:
%%time
combined_dataframe_2018 = read_text_from_multi_csvs(path=tweet_2018_path_raw)

Wall time: 2min 46s


In [13]:
number_of_tweet_user(combined_dataframe_2018)

Total number of tweet is: 1323013; Total number of user is 95832


### 2.2 Load the tweet 2017 raw datasets

In [17]:
%%time
combined_dataframe_2017 = read_text_from_multi_csvs(path=tweet_2017_path_raw)

Wall time: 3min 36s


In [18]:
number_of_tweet_user(combined_dataframe_2017)

Total number of tweet is: 1602627; Total number of user is 115846


### 2.2 Load the tweet 2016 raw datasets

In [22]:
%%time
combined_dataframe_2016 = read_text_from_multi_csvs(path=tweet_2016_path_raw)

Wall time: 4min 2s


In [23]:
number_of_tweet_user(combined_dataframe_2016)

Total number of tweet is: 1281281; Total number of user is 84029


### 2.3 Combine the 2016 and 2017 datasets

In [26]:
selected_name_list = ['created_at', 'id_str', 'lang', 'lat', 
       'lon', 'place_id', 'place_lat', 'place_lon', 'place_name', 'text', 'time_zone', 'truncated', 'url',
       'user_created_at', 'user_id_str', 'user_lang', 'user_url',
       'verified']
len(selected_name_list)

18

In [27]:
combined_dataframe_2018 = combined_dataframe_2018[selected_name_list]
combined_dataframe_2017 = combined_dataframe_2017[selected_name_list]
combined_dataframe_2016 = combined_dataframe_2016[selected_name_list]

In [28]:
total_dataframe = pd.concat([combined_dataframe_2018, combined_dataframe_2017, combined_dataframe_2016], sort=True)

In [29]:
number_of_tweet_user(total_dataframe)

Total number of tweet is: 4206921; Total number of user is 249912


In [31]:
total_dataframe.to_csv(os.path.join(tweet_combined_path, 'tweet_combined.csv'), encoding='utf-8', 
                       quoting=csv.QUOTE_NONNUMERIC)

Remove memory

In [32]:
del combined_dataframe_2018
del combined_dataframe_2017
del combined_dataframe_2016

## 3. Some initial steps

- Only consider the English and Chinese tweets
- Delete the verified accounts
- Remove tweets which don't have latitude and longitude

In [33]:
total_dataframe_zh_en = total_dataframe.loc[total_dataframe['lang'].isin(['zh', 'en'])]
total_dataframe_zh_en.shape

(2467054, 18)

In [34]:
Counter(total_dataframe_zh_en['verified'])

Counter({'FALSE': 2392527, 'TRUE': 74527})

In [35]:
total_dataframe_without_verified = total_dataframe_zh_en.loc[total_dataframe_zh_en['verified'].isin(['FALSE'])]
total_dataframe_without_verified.shape

(2392527, 18)

In [36]:
total_dataframe_with_geo = total_dataframe_without_verified.dropna(axis=0, subset=['lat'])
total_dataframe_with_geo.shape

(762646, 18)

Then we could save the file to a local directory by:

```Python
total_dataframe_with_geo.to_csv(os.path.join(desktop, 'dataframe.csv'), encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)
```

In [38]:
total_dataframe_with_geo.to_csv(os.path.join(tweet_combined_path, 'total_dataframe_with_geo.csv'), encoding='utf-8', 
                                quoting=csv.QUOTE_NONNUMERIC)

## 4. Create timestamps for each tweet

Get the hk time of each tweet

In [39]:
total_dataframe_with_geo_copy = total_dataframe_with_geo.copy()
total_dataframe_with_hk_time = get_hk_time(total_dataframe_with_geo_copy)

In [40]:
total_dataframe_with_hk_time['year'] = total_dataframe_with_hk_time.apply(lambda row: int(row['hk_time'].year), axis=1)
total_dataframe_with_hk_time['month'] = total_dataframe_with_hk_time.apply(lambda row: int(row['hk_time'].month), axis=1)
total_dataframe_with_hk_time['month_plus_year'] = total_dataframe_with_hk_time.apply(
    lambda row: str(row['year'])+'_'+str(row['month']), axis=1)

In [42]:
total_dataframe_with_hk_time['day'] = total_dataframe_with_hk_time.apply(lambda row: int(row['hk_time'].day), axis=1)

In [43]:
total_dataframe_with_hk_time.head(10)[['hk_time', 'created_at', 'year', 'month', 'day', 'month_plus_year']]

Unnamed: 0,hk_time,created_at,year,month,day,month_plus_year
1,2018-01-01 07:00:08+08:00,Sun Dec 31 23:00:08 +0000 2017,2018,1,1,2018_1
5,2018-01-01 07:02:03+08:00,Sun Dec 31 23:02:03 +0000 2017,2018,1,1,2018_1
9,2018-01-01 07:03:06+08:00,Sun Dec 31 23:03:06 +0000 2017,2018,1,1,2018_1
10,2018-01-01 07:04:05+08:00,Sun Dec 31 23:04:05 +0000 2017,2018,1,1,2018_1
11,2018-01-01 07:05:03+08:00,Sun Dec 31 23:05:03 +0000 2017,2018,1,1,2018_1
13,2018-01-01 07:05:57+08:00,Sun Dec 31 23:05:57 +0000 2017,2018,1,1,2018_1
25,2018-01-01 07:10:17+08:00,Sun Dec 31 23:10:17 +0000 2017,2018,1,1,2018_1
29,2018-01-01 07:13:25+08:00,Sun Dec 31 23:13:25 +0000 2017,2018,1,1,2018_1
32,2018-01-01 07:14:25+08:00,Sun Dec 31 23:14:25 +0000 2017,2018,1,1,2018_1
35,2018-01-01 07:15:51+08:00,Sun Dec 31 23:15:51 +0000 2017,2018,1,1,2018_1


**Then**, sort the dataset based on the hk time column

In [44]:
total_dataframe_with_hk_time_sorted = total_dataframe_with_hk_time.sort_values(by='hk_time')

In [47]:
total_dataframe_with_hk_time_sorted.tail(10)[['hk_time', 'created_at', 'year', 'month', 'day', 'month_plus_year']]

Unnamed: 0,hk_time,created_at,year,month,day,month_plus_year
7,2018-12-19 05:21:18+08:00,Tue Dec 18 21:21:18 +0000 2018,2018,12,19,2018_12
13,2018-12-19 05:47:11+08:00,Tue Dec 18 21:47:11 +0000 2018,2018,12,19,2018_12
15,2018-12-19 05:54:21+08:00,Tue Dec 18 21:54:21 +0000 2018,2018,12,19,2018_12
0,2018-12-19 06:00:35+08:00,Tue Dec 18 22:00:35 +0000 2018,2018,12,19,2018_12
13,2018-12-19 06:32:24+08:00,Tue Dec 18 22:32:24 +0000 2018,2018,12,19,2018_12
20,2018-12-19 06:43:05+08:00,Tue Dec 18 22:43:05 +0000 2018,2018,12,19,2018_12
24,2018-12-19 06:49:55+08:00,Tue Dec 18 22:49:55 +0000 2018,2018,12,19,2018_12
25,2018-12-19 06:51:54+08:00,Tue Dec 18 22:51:54 +0000 2018,2018,12,19,2018_12
27,2018-12-19 06:54:09+08:00,Tue Dec 18 22:54:09 +0000 2018,2018,12,19,2018_12
31,2018-12-19 06:57:02+08:00,Tue Dec 18 22:57:02 +0000 2018,2018,12,19,2018_12


In [48]:
total_dataframe_with_hk_time_sorted.to_csv(os.path.join(tweet_combined_path, 'tweet_geocoded.csv'), encoding='utf-8', 
                                          quoting=csv.QUOTE_NONNUMERIC)

In [44]:
total_dataframe_with_hk_time_sorted = pd.read_csv(os.path.join(tweet_combined_path, 'tweet_geocoded.csv'), encoding='utf-8', 
                                          quoting=csv.QUOTE_NONNUMERIC, dtype='str', index_col=0)

In [45]:
total_dataframe_with_hk_time_sorted.shape

(762646, 23)

### Get the latitude & longitude of total dataframe

In [49]:
total_dataframe_with_hk_time_sorted_geo = total_dataframe_with_hk_time_sorted[['lat', 'lon']]

In [52]:
total_dataframe_with_hk_time_sorted_geo_copy = total_dataframe_with_hk_time_sorted_geo.copy()
total_dataframe_with_hk_time_sorted_geo_copy['index_num'] = list(range(0, total_dataframe_with_hk_time_sorted_geo_copy.shape[0]))

In [54]:
total_dataframe_with_hk_time_sorted_geo_copy.head()

Unnamed: 0,lat,lon,index_num
12,22.2788499,114.18462,0
19,22.31530176,113.9348316,1
20,22.27680815,113.9161873,2
21,22.27564274,114.1711743,3
23,22.270978,113.576678,4


In [55]:
total_dataframe_with_hk_time_sorted_geo_copy.to_csv(os.path.join(tweet_combined_path, 'tweet_combined_geoinfo.csv'), 
                                                   encoding='utf-8')

In [56]:
total_dataframe_with_hk_time_sorted_geo_copy.shape

(762646, 3)

release some memory

In [66]:
del total_dataframe_with_geo_copy
del total_dataframe_with_hk_time
del total_dataframe_with_hk_time_sorted

## 5. Use ArcMap to find all the tweets posted in Hong Kong

Load the spatial join result

In [22]:
longitudinal_path = os.path.join(tweet_combined_path, 'shapefiles', 'longitudinal')
cross_sectional_path = os.path.join(tweet_combined_path, 'shapefiles', 'cross_sectional')

In [72]:
longitudinal_result_dataframe = pd.DataFrame(columns=['lat', 'lon', 'TPU', 'index_num'])
with open(os.path.join(longitudinal_path, 'longitudinal_spatial_join_result.txt'), encoding='utf-8') as longitudinal_data:
    header = longitudinal_data.readline()
    tweet_lines = longitudinal_data.readlines()
    lat_list = []
    lon_list = []
    longitudinal_tpu_list = []
    index_num_list = []
    for line in tweet_lines:
        line_list = line.split(',')
        lat_list.append(line_list[3])
        lon_list.append(line_list[4])
        longitudinal_tpu_list.append(line_list[7])
        index_num_list.append(line_list[5])
    longitudinal_result_dataframe['lat'] = lat_list
    longitudinal_result_dataframe['lon'] = lon_list
    longitudinal_result_dataframe['TPU'] = longitudinal_tpu_list
    longitudinal_result_dataframe['index_num'] = index_num_list

In [23]:
cross_sectional_result_dataframe = pd.DataFrame(columns=['lat', 'lon', 'TPU', 'index_num'])
with open(os.path.join(cross_sectional_path, 'cross_sectional_tpu_info.txt'), 'r', encoding='utf-8') as cross_sectional_data:
    header = cross_sectional_data.readline()
    tweet_lines = cross_sectional_data.readlines()
    lat_list = []
    lon_list = []
    cross_sectional_tpu_list = []
    index_num_list = []
    for line in tweet_lines:
        line_list = line.split(',')
        lat_list.append(line_list[3])
        lon_list.append(line_list[4])
        cross_sectional_tpu_list.append(line_list[7][:-1])
        index_num_list.append(line_list[5])
    cross_sectional_result_dataframe['lat'] = lat_list
    cross_sectional_result_dataframe['lon'] = lon_list
    cross_sectional_result_dataframe['TPU'] = cross_sectional_tpu_list
    cross_sectional_result_dataframe['index_num'] = index_num_list

In [24]:
tpu_info_longitudinal_path = os.path.join(tweet_combined_path, 'tpu_info_dataframes', 'longitudinal')
tpu_info_cross_sectional_path = os.path.join(tweet_combined_path, 'tpu_info_dataframes', 'cross_sectional')

In [25]:
cross_sectional_result_dataframe['TPU'] = cross_sectional_result_dataframe['TPU'].astype(str)

In [101]:
longitudinal_result_dataframe.to_csv(os.path.join(tpu_info_longitudinal_path, 'tpu_info_longitudinal.csv'), encoding='utf-8')

In [26]:
tpu_name_match_dict = {'121': '121 and 123 - 124',
 '146': '146 - 147',
 '156': '156 and 158',
 '164': '164 - 165',
 '175': '175 - 176',
 '181': '181 - 182',
 '183': '183 - 184',
 '193': '193, 195 and 198',
 '194': '190, 192 and 194',
 '216': '213 and 215 - 216',
 '251': '251 and 256',
 '255': '255 and 269',
 '288': '288 - 289',
 '293': '293 and 296',
 '310': '310 and 321',
 '320': '320, 324 and 329',
 '340': '331 - 334, 336 and 340',
 '411': '411 - 416 and 427',
 '421': '421 - 422',
 '423': '423 and 428',
 '431': '431 - 434',
 '543': '543 and 546',
 '610': '610, 621 and 632',
 '620': '620, 622 and 641',
 '631': '631 and 633',
 '651': '651 - 653',
 '711': '711 - 712, 721 and 728',
 '722': '722 and 727',
 '731': '731, 733 and 754',
 '732': '732, 751 and 753',
 '741': '741 - 744',
 '756': '756 and 761 - 762',
 '811': '811 - 815',
 '824': '824 and 829',
 '826': '826 and 828',
 '832': '832 and 834',
 '911': '911 - 913',
 '931': '931 and 933',
 '932': '932 and 934',
 '941': '941 - 943',
 '950': '950 - 951',
 '961': '961 - 963',
 '971': '971 - 974'}

In [27]:
cross_sectional_result_dataframe_social_demographic_tpu = cross_sectional_result_dataframe.replace({'TPU':tpu_name_match_dict})

In [28]:
cross_sectional_result_dataframe_social_demographic_tpu.to_csv(os.path.join(tpu_info_cross_sectional_path, 
                                                                            'tpu_cross_sectional_info.csv'), encoding='utf-8')

In [29]:
cross_sectional_result_dataframe_social_demographic_tpu = pd.read_csv(os.path.join(tpu_info_cross_sectional_path, 
                                                                            'tpu_cross_sectional_info.csv'), encoding='utf-8')
longitudinal_result_dataframe = pd.read_csv(os.path.join(tpu_info_longitudinal_path, 'tpu_info_longitudinal.csv'), encoding='utf-8')

In [30]:
cross_sectional_result_dataframe_social_demographic_tpu_sorted = cross_sectional_result_dataframe_social_demographic_tpu.sort_values(by='index_num')
longitudinal_result_dataframe_tpu_sorted = longitudinal_result_dataframe.sort_values(by='index_num')

In [31]:
longitudinal_result_dataframe_tpu_sorted.shape

(762646, 5)

In [32]:
cross_sectional_result_dataframe_social_demographic_tpu_sorted.shape

(762646, 5)

In [46]:
total_dataframe_with_hk_time_sorted['TPU_longitudinal'] = list(longitudinal_result_dataframe_tpu_sorted['TPU'])
total_dataframe_with_hk_time_sorted['TPU_cross_sectional'] = list(cross_sectional_result_dataframe_social_demographic_tpu_sorted['TPU'])

In [47]:
total_dataframe_with_hk_time_sorted.head()

Unnamed: 0,created_at,id_str,lang,lat,lon,place_id,place_lat,place_lon,place_name,text,...,user_lang,user_url,verified,hk_time,year,month,month_plus_year,day,TPU_longitudinal,TPU_cross_sectional
12,Sat May 07 06:18:59 +0000 2016,7.28831e+17,en,22.2788499,114.18462,,22.271674,114.185178,,#Working #Saturday #Afternoon! #Final #Touch i...,...,en,http://www.facebook.com/derekhysteric525,False,2016-05-07 14:18:59+08:00,2016,5,2016_5,7,146,146 - 147
19,Sat May 07 07:02:19 +0000 2016,7.28842e+17,en,22.31530176,113.9348316,,22.2465325,114.064237,,I'm at Hong Kong International Airport <U+9999...,...,en,http://manishmaurya89.blogspot.com/?m=1,False,2016-05-07 15:02:19+08:00,2016,5,2016_5,7,951,950 - 951
20,Sat May 07 07:02:34 +0000 2016,7.28842e+17,en,22.27680815,113.9161873,,22.2465325,114.064237,,The cable car ride... #cablecar #mountain #360...,...,en,http://kotakitam.wordpress.com,False,2016-05-07 15:02:34+08:00,2016,5,2016_5,7,943,941 - 943
21,Sat May 07 07:03:11 +0000 2016,7.28843e+17,en,22.27564274,114.1711743,,22.271674,114.185178,,Love Roses! <ed><U+00A0><U+00BD><ed><U+00B8><U...,...,en,http://www.denkipenki.vsco.co,False,2016-05-07 15:03:11+08:00,2016,5,2016_5,7,131,131
23,Sat May 07 07:03:28 +0000 2016,7.28843e+17,en,22.270978,113.576678,,22.869936,113.4197245,,We're #hiring! Read about our latest #job open...,...,en,http://www.careerarc.com/job-seeker,False,2016-05-07 15:03:28+08:00,2016,5,2016_5,7,0,0


In [48]:
total_dataframe_with_hk_time_sorted.to_csv(os.path.join(tweet_combined_path, 'total_dataframe_with_tpuinfo.csv'), 
                                          encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

In [49]:
total_dataframe_in_hk = total_dataframe_with_hk_time_sorted.loc[total_dataframe_with_hk_time_sorted['TPU_longitudinal'] != 0]

In [50]:
total_dataframe_in_hk.to_csv(os.path.join(tweet_combined_path, 'total_dataframe_in_hk.csv'), 
                                          encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

In [55]:
number_of_tweet_user(total_dataframe_in_hk)

Total number of tweet is: 562578; Total number of user is 62820


## 6. Delete bots

### 6.1 Pass through my own function

In [51]:
%%time
total_dataframe_without_bot_step1 = delete_bots_have_same_geoinformation(total_dataframe_in_hk)

Wall time: 1h 12min 22s


In [52]:
number_of_tweet_user(total_dataframe_without_bot_step1)

Total number of tweet is: 445972; Total number of user is 31679


In [53]:
Counter(total_dataframe_without_bot_step1['lang'])

Counter({'en': 332848, 'zh': 113124})

In [56]:
number_of_tweet_user(total_dataframe_without_bot_step1)

Total number of tweet is: 445972; Total number of user is 31679


In [54]:
total_dataframe_without_bot_step1.to_csv(os.path.join(tweet_combined_path, 'tweet_without_bot_step1.csv'), encoding='utf-8', 
                                        quoting=csv.QUOTE_NONNUMERIC)

In [108]:
total_dataframe_without_bot_step1 = pd.read_csv(os.path.join(tweet_combined_path, 'tweet_without_bot_step1.csv'), encoding='utf-8', 
                                        quoting=csv.QUOTE_NONNUMERIC, dtype='str')

In [109]:
total_dataframe_without_bot_step1.shape

(445972, 26)

### 6.2 Use botometer API

In [57]:
user_id_list = list(set(total_dataframe_without_bot_step1['user_id_str']))

In [61]:
print('Check bots starts....')
bot_result_list = []
processed_account_list = []
account_with_error = []
# The input of the check bot function should be integers
account_integers = [int(number) for number in user_id_list]
# Get a set of unique users and transform it to list
account_integer_set_list = list(set(account_integers))
account_integer_set_string_list = [str(num) for num in account_integer_set_list]

Check bots starts....


Then we run the following codes to use bototmeter API to do the tweet filtering:

```Python
botometer_check_path = r'F:\CityU\Datasets\Hong Kong Tweets Combined\botometer_check'

assert len(bot_result_list) == 0
assert len(processed_account_list) == 0
assert len(account_with_error) == 0

for index, user in enumerate(account_integer_set_list):
    print('-----------------------------------------------')
    print("Coping with the ", index+1, 'th user: ', user)
    try:
        bot_likelihood = check_bot(int(user))
        print('The botlikelihood score is: {}'.format(bot_likelihood))
        bot_result_list.append(bot_likelihood)
        processed_account_list.append(user)
        print('This account could be processed...')
        print('-----------------------------------------------')
    except Exception as e:
        # In this case, the api shows that this page is not authorized or does not exit
        # We record these accouts as Not Authorized in the bot_result_list
        bot_result_list.append('Not Authorized')
        processed_account_list.append(user)
        account_with_error.append(user)
        print('This account has some problem...')
        print('-----------------------------------------------')
    assert len(processed_account_list) == len(bot_result_list)
    if ((index+1) % 1000 == 0):
        print('The first {} users have been processed'.format(index+1))
        print('Have a break!')
        # You **MUST** specify the dtype='str' here  
        check_bot_dataframe = pd.DataFrame({'account':processed_account_list, 'bot_score': bot_result_list}, dtype='str')
        check_bot_dataframe.to_csv(os.path.join(botometer_check_path, 'tweet_combined_check_first_{}.csv'.format(index+1)), 
                                   encoding='utf-8')
        time.sleep(30) # sleep for 30 seconds
        print('I am OK now!')
    else:
        pass
    
final_bot_likelihood_second_dataframe = pd.DataFrame({'account':processed_account_list, 'bot_score': bot_result_list}, dtype='str')
final_bot_likelihood_second_dataframe.to_csv(os.path.join(botometer_check_path, 'tweet_combined_botometer_check.csv'), encoding='utf-8')
```

Filter out the bots

In [89]:
bot_likelihood_file = pd.read_csv(os.path.join(botometer_check_path, 'tweet_combined_botometer_check.csv'), encoding='utf-8', 
                                 index_col=0, quoting=csv.QUOTE_NONNUMERIC)

In [93]:
bot_likelihood_without_duplicates = bot_likelihood_file.drop_duplicates(subset='account', keep='first')

In [94]:
bot_likelihood_without_duplicates.shape

(31679, 2)

In [95]:
# Here we see the not authorized accounts as the common users, not the bots
decision1 = (bot_likelihood_without_duplicates['bot_score'] != 'Not Authorized')
bot_likelihood_without_not_authorized = bot_likelihood_without_duplicates.loc[decision1]
bot_likelihood_without_not_authorized_copy = bot_likelihood_without_not_authorized.copy()
bot_likelihood_without_not_authorized_copy['bot_score'] = bot_likelihood_without_not_authorized_copy['bot_score'].astype(float)
# Then the loglikelihood score of bots should be greater than 0.4
decision2 = (bot_likelihood_without_not_authorized_copy['bot_score'] > 0.4)
bot_accounts_dataframe = bot_likelihood_without_not_authorized_copy[decision2]

In [96]:
bot_account_list = list(bot_accounts_dataframe['account'])

In [111]:
bot_account_int_list = [np.int64(account) for account in bot_account_list]

In [112]:
total_dataframe_without_bot_step1_copy = total_dataframe_without_bot_step1.copy()
total_dataframe_without_bot_step1_copy['user_id_str'] = total_dataframe_without_bot_step1_copy['user_id_str'].astype(np.int64)

In [113]:
total_dataframe_without_bot_step2 = total_dataframe_without_bot_step1_copy.loc[~total_dataframe_without_bot_step1_copy['user_id_str'].isin(bot_account_int_list)]

In [114]:
number_of_tweet_user(total_dataframe_without_bot_step2)

Total number of tweet is: 431074; Total number of user is 30836


In [116]:
total_dataframe_without_bot_step2.to_csv(os.path.join(tweet_combined_path, 'tweet_combined_in_hk_withoutbot_step2.csv'), 
                                         encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

In [117]:
Counter(total_dataframe_without_bot_step2['lang'])

Counter({'en': 324167, 'zh': 106907})