## Import all modules

In [10]:
import yaml
import json
import sqlite3
from datetime import datetime
import pandas as pd
import sklearn
from json_flatten import flatten
from sklearn import tree
import numpy as np
from sklearn.cluster import KMeans

from dateutil.parser import parse as date_parse
import calendar


In [9]:
pip install python-dateutil

Note: you may need to restart the kernel to use updated packages.


## Retrieve all bots from SQL database

In [3]:
# Load every user_info row whose bot_rating exceeds the threshold into a DataFrame.
MIN_BOT_RATING = 3.5

conn = sqlite3.connect('goodbot_table.db')
try:
    cur = conn.cursor()

    # The threshold is bound as a query parameter instead of being written into
    # the SQL text. To check how many users there are in total, swap the WHERE
    # clause for "bot_rating IS NOT NULL".
    cur.execute("SELECT * FROM user_info WHERE bot_rating > ?", (MIN_BOT_RATING,))

    user_info_all_result = cur.fetchall()
    # Each entry of cursor.description describes one result column; item 0 is its name.
    user_info_column_names = [description[0] for description in cur.description]
finally:
    # Release the database handle even when the query or fetch raises.
    conn.close()

user_info_bot_dataframe = pd.DataFrame(user_info_all_result,
                                       columns=user_info_column_names)
print(f"No of user info stored: {len(user_info_all_result)}\n")

No of user info stored: 101



## Load JSON attributes for all cohorts (GoodBot, TBT, BTC, BTS, Covid) into a dataframe

In [54]:
# Collects one flattened record per qualifying account; filled by the loading loop.
clean_arr = []

# Result files grouped by the cohort they belong to. The cohort lists are the
# single source of truth; `files` is derived from them so the two can never drift.
bot_files = [
    './GoodBot_parameters/Bot_001_019.json',
    './GoodBot_parameters/Bot_020_059.json',
    './GoodBot_parameters/Bot_060_088.json',
    './GoodBot_parameters/Bot_088_132.json',
]

# TBT file names step in blocks of 50: TBT_001_050, TBT_051_100, ..., TBT_601_650.
tbt_files = [f'./TBT_parameters/TBT_{start:03d}_{start + 49:03d}.json'
             for start in range(1, 651, 50)]

# The remaining cohorts are numbered consecutively from 1.
btc_files = [f'./BTC_parameters/botometer_cohort_{n}.json' for n in range(1, 9)]

bts_files = [f'./BTS_parameters/botometer_cohort_{n}.json' for n in range(1, 20)]

covid_files = [f'./Covid_parameters/botometer_cohort_{n}.json' for n in range(1, 5)]

# Every file to load, in cohort order: GoodBot, TBT, BTC, BTS, Covid.
files = bot_files + tbt_files + btc_files + bts_files + covid_files

# Records are kept only when their 'score' is strictly above this value.
MIN_SCORE = 0.7

# Resolve each file's hashtag label once, up front. Later groups overwrite
# earlier ones, which matches the precedence of a chain of independent `if` tests.
label_by_file = {}
for group_label, group_files in (("#bot", bot_files),
                                 ("#tbt", tbt_files),
                                 ("#btc", btc_files),
                                 ("#bts", bts_files),
                                 ("#covid", covid_files)):
    for path in group_files:
        label_by_file[path] = group_label

for file in files:
    # Fail loudly rather than silently reusing the previous file's label.
    if file not in label_by_file:
        raise ValueError(f"{file} is not assigned to any cohort list")
    label = label_by_file[file]

    # The context manager closes the file even if the JSON fails to parse.
    with open(file) as f:
        arr = json.load(f)

    for elem in arr['result']:
        if elem['score'] > MIN_SCORE:
            user = elem['user']
            categories = elem['categories']
            analysis = elem['analysis']

            # Key order fixes the column order of the resulting DataFrame.
            clean_elem = {
                'user_screen_name': user['screen_name'],
                'lang': user['lang'],  # one-hot encoded in a later cell
                'hashtag': label,
                'user_id': user['id_str'],
                'score': elem['score'],
                'astroturf': categories['astroturf'],
                'fake_follower': categories['fake_follower'],
                'financial': categories['financial'],
                'other': categories['other'],
                'overall': categories['overall'],
                'self_declared': categories['self_declared'],
                'spammer': categories['spammer'],
                # Parsed timestamp -> integer seconds; timegm reads the time tuple as UTC.
                # NOTE(review): timetuple() does not apply a UTC offset from the source
                # string -- confirm the inputs are already expressed in UTC.
                'most_recent_post_time': calendar.timegm(
                    date_parse(analysis['timeOfMostRecentPost']).timetuple()),
                # The fields below are min-max normalized in a later cell.
                'recent_tweets_per_week': float(analysis['recentTweetsPerWeek']),
                'number_of_tweets': analysis['numberOfTweets'],
                'following': analysis['numberOfFollowees'],
                'followers': analysis['numberOfFollowers'],
                'numberOfLikes': analysis['numberOfLikes'],
            }

            clean_arr.append(clean_elem)

df = pd.DataFrame(clean_arr)


## Data Exploration

In [55]:
# Preview the first rows of the frame built from the JSON records.
df.head()

Unnamed: 0,user_screen_name,lang,hashtag,user_id,score,astroturf,fake_follower,financial,other,overall,self_declared,spammer,most_recent_post_time,recent_tweets_per_week,number_of_tweets,following,followers,numberOfLikes
0,TnNguynNgc12,vi,#bot,1406983013095776259,0.92,0.14,0.92,0.34,0.74,0.92,0.13,0.21,1639170119,560.0,15081,3282,64,148
1,PienaNashetania,in,#bot,4586856613,1.0,0.01,0.45,0.125125,1.0,1.0,0.99,0.49,1639170072,27.0,8298,2,54,0
2,inthreek,en,#bot,1516297345,0.83,0.02,0.68,0.16,0.75,0.83,0.83,0.51,1639152641,27.0,12333,46,54,0
3,a_kitsumaru,ja,#bot,2158721414,0.97,0.0,0.61,0.07,0.91,0.97,0.97,0.7,1639175007,100.0,34086,10,12,209
4,cephalopodluke2,en,#bot,983177740965773312,0.8,0.71,0.7,0.54,0.8,0.8,0.3,0.19,1639176037,670.0,128366,4744,1794,0


In [56]:
# Print the value ranges of the raw features. lang_list, min_date and date_delta
# are also defined here because the encoding/normalization cells read them.
post_times = df["most_recent_post_time"]
newest_post = max(post_times)
oldest_post = min(post_times)

print(newest_post)  # recent

print(oldest_post)  # older

# Range of time in seconds; ~73 days on the recorded run (flagged as a bit strange).
print(newest_post - oldest_post)

languages = df["lang"].unique()
print(languages)

print(len(df))

lang_list = list(languages)
min_date = oldest_post
date_delta = newest_post - oldest_post

divider = "---------------------"

# Tweets-per-week additionally reports its dtype.
weekly_tweets = df["recent_tweets_per_week"]
print(divider)
print(max(weekly_tweets))
print(min(weekly_tweets))
print(weekly_tweets.dtype)

# The remaining count columns each print max then min under a divider.
for column in ("number_of_tweets", "following", "followers", "numberOfLikes"):
    print(divider)
    print(max(df[column]))
    print(min(df[column]))

1639473219
1633132239
6340980
['vi' 'in' 'en' 'ja' 'und' 'zh' 'ko' 'es' 'pt' 'tr' 'ht' 'de' 'fr' 'fa'
 'ar' 'th' 'nl' 'it' 'bn' 'hi']
518
---------------------
42000.0
0.067
float64
---------------------
783384
1
---------------------
26302
0
---------------------
483963
0
---------------------
820099
0


## Normalize data

In [57]:
# Give each language a column
for l in lang_list:
    df[l] = [0] * 518
    


In [58]:
# Preview the frame again; the per-language columns should now be present.
df.head()

Unnamed: 0,user_screen_name,lang,hashtag,user_id,score,astroturf,fake_follower,financial,other,overall,...,ht,de,fr,fa,ar,th,nl,it,bn,hi
0,TnNguynNgc12,vi,#bot,1406983013095776259,0.92,0.14,0.92,0.34,0.74,0.92,...,0,0,0,0,0,0,0,0,0,0
1,PienaNashetania,in,#bot,4586856613,1.0,0.01,0.45,0.125125,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,inthreek,en,#bot,1516297345,0.83,0.02,0.68,0.16,0.75,0.83,...,0,0,0,0,0,0,0,0,0,0
3,a_kitsumaru,ja,#bot,2158721414,0.97,0.0,0.61,0.07,0.91,0.97,...,0,0,0,0,0,0,0,0,0,0
4,cephalopodluke2,en,#bot,983177740965773312,0.8,0.71,0.7,0.54,0.8,0.8,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# Columns that are min-max scaled into [0, 1].
NORMALIZED_COLUMNS = [
    'most_recent_post_time',
    'recent_tweets_per_week',
    'number_of_tweets',
    'following',
    'followers',
    'numberOfLikes',
]


def normalize_data(row, bounds=None):
    """One-hot encode the row's language and min-max scale its numeric features.

    Args:
        row: one record (a DataFrame row) holding a 'lang' entry, a column named
            after that language code, and every column listed in ``bounds``.
        bounds: mapping of column name -> (minimum, maximum) used for scaling.
            When omitted, the fixed ranges recorded from the original data set
            are used, with the post-time range taken from the notebook-level
            ``min_date`` and ``date_delta``.

    Returns:
        The same ``row`` object, after it has been modified.
    """
    if bounds is None:
        # Legacy behaviour: ranges copied from the exploration output.
        bounds = {
            'most_recent_post_time': (min_date, min_date + date_delta),
            'recent_tweets_per_week': (0.067, 42000.0),
            'number_of_tweets': (1, 783384),
            'following': (0, 26302),
            'followers': (0, 483963),
            'numberOfLikes': (0, 820099),
        }

    # Set language encoding
    row[row['lang']] = 1

    for column, (low, high) in bounds.items():
        span = high - low
        # A column with no spread cannot be scaled; map it to 0 instead of dividing by zero.
        row[column] = (float(row[column]) - low) / span if span else 0.0

    return row


# Take the ranges from the data being normalized rather than from constants pasted
# out of an earlier run. On data that already spans [0, 1] the bounds become (0, 1),
# so re-running this cell leaves the values unchanged.
column_bounds = {column: (df[column].min(), df[column].max())
                 for column in NORMALIZED_COLUMNS}

df = df.apply(normalize_data, axis=1, bounds=column_bounds)

In [60]:
# Double checking
df['most_recent_post_time'].head()
print(max(df['most_recent_post_time']))
print(min(df['most_recent_post_time']))

print("-------------------------------")

df['recent_tweets_per_week'].head()
print(max(df['recent_tweets_per_week']))
print(min(df['recent_tweets_per_week']))

print("-------------------------------")

df['number_of_tweets'].head()
print(max(df['number_of_tweets']))
print(min(df['number_of_tweets']))

print("-------------------------------")

df['following'].head()
print(max(df['following']))
print(min(df['following']))

print("-------------------------------")

df['followers'].head()
print(max(df['followers']))
print(min(df['followers']))

print("-------------------------------")

df['numberOfLikes'].head()
print(max(df['numberOfLikes']))
print(min(df['numberOfLikes']))

print("-------------------------------")

df['lang'].head()
print(max(df['lang']))
print(min(df['lang']))

1.0
0.0
-------------------------------
1.0
0.0
-------------------------------
1.0
0.0
-------------------------------
1.0
0.0
-------------------------------
1.0
0.0
-------------------------------
1.0
0.0
-------------------------------
zh
ar


## Save dataframe as csv

In [61]:
# Write the frame to disk without the index column, then preview it.
csv_path = 'testdata.csv'
df.to_csv(csv_path, index=False)

df.head()

Unnamed: 0,user_screen_name,lang,hashtag,user_id,score,astroturf,fake_follower,financial,other,overall,...,ht,de,fr,fa,ar,th,nl,it,bn,hi
0,TnNguynNgc12,vi,#bot,1406983013095776259,0.92,0.14,0.92,0.34,0.74,0.92,...,0,0,0,0,0,0,0,0,0,0
1,PienaNashetania,in,#bot,4586856613,1.0,0.01,0.45,0.125125,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,inthreek,en,#bot,1516297345,0.83,0.02,0.68,0.16,0.75,0.83,...,0,0,0,0,0,0,0,0,0,0
3,a_kitsumaru,ja,#bot,2158721414,0.97,0.0,0.61,0.07,0.91,0.97,...,0,0,0,0,0,0,0,0,0,0
4,cephalopodluke2,en,#bot,983177740965773312,0.8,0.71,0.7,0.54,0.8,0.8,...,0,0,0,0,0,0,0,0,0,0
