## Import all modules

In [2]:
import yaml
import json
import sqlite3
from datetime import datetime
import pandas as pd
import sklearn
from json_flatten import flatten
from sklearn import tree
import numpy as np
from sklearn.cluster import KMeans


## Retrieve users with bot_rating > 3.5 from the SQLite database

In [3]:
# Open the local SQLite database file that holds the user_info table.
conn = sqlite3.connect('goodbot_table.db')
try:
    cur = conn.cursor()

    # Keep only the rows whose bot_rating is strictly greater than 3.5.
    # (To look at every rated user instead, query
    #  "SELECT * FROM user_info WHERE bot_rating IS NOT NULL".)
    cur.execute("SELECT * FROM user_info WHERE bot_rating > 3.5")

    user_info_all_result = cur.fetchall()
    # Each entry of cursor.description describes one result column;
    # its first item is the column name.
    user_info_column_names = [description[0] for description in cur.description]
finally:
    # Release the database handle even when the query above raises,
    # so a failed run does not leave an open connection in the kernel.
    conn.close()

# One DataFrame row per selected user_info row, columns named after the table columns.
user_info_bot_dataframe = pd.DataFrame(user_info_all_result,
                                       columns=user_info_column_names)
print(f"No of user info stored: {len(user_info_all_result)}\n")

No of user info stored: 101



## Load JSON attributes for all bot groups (GoodBot, TBT, BTC, BTS) into a dataframe

In [67]:
# Accumulator for the cleaned per-account records gathered from every input file.
clean_arr = []

# GoodBot exports: the id ranges are irregular, so the paths are spelled out.
bot_files = [
    './GoodBot_parameters/Bot_001_019.json',
    './GoodBot_parameters/Bot_020_059.json',
    './GoodBot_parameters/Bot_060_088.json',
    './GoodBot_parameters/Bot_088_132.json',
]

# TBT exports: consecutive 50-id windows, TBT_001_050 ... TBT_601_650.
tbt_files = [
    f'./TBT_parameters/TBT_{first:03d}_{first + 49:03d}.json'
    for first in range(1, 650, 50)
]

# BTC exports: botometer_cohort_1 ... botometer_cohort_8.
btc_files = [
    f'./BTC_parameters/botometer_cohort_{cohort}.json'
    for cohort in range(1, 9)
]

# BTS exports: botometer_cohort_1 ... botometer_cohort_19.
bts_files = [
    f'./BTS_parameters/botometer_cohort_{cohort}.json'
    for cohort in range(1, 20)
]

# Every input file, in processing order: GoodBot, then TBT, then BTC, then BTS.
files = bot_files + tbt_files + btc_files + bts_files

# Accounts are kept only when their 'score' is strictly above this threshold.
MIN_SCORE = 0.7

# Hashtag label for each input file, taken from the group list it belongs to.
# Groups are applied in the same order as the original membership checks, so a
# path listed in several groups keeps the label of the last group.
label_by_file = {}
for group_files, group_label in ((bot_files, "#bot"),
                                 (tbt_files, "#tbt"),
                                 (btc_files, "#btc"),
                                 (bts_files, "#bts")):
    for path in group_files:
        label_by_file[path] = group_label


def _clean_record(elem, label):
    """Flatten one entry of a file's 'result' list into a single-level dict.

    Args:
        elem: One item of the JSON 'result' list; must provide the 'user',
            'score', 'categories' and 'analysis' keys read below.
        label: Hashtag label of the file the entry came from.

    Returns:
        A dict whose key order defines the column order of the final DataFrame.

    Raises:
        KeyError: If an expected key is missing from ``elem``.
    """
    user = elem['user']
    categories = elem['categories']
    analysis = elem['analysis']

    record = {
        'user_screen_name': user['screen_name'],
        'lang': user['lang'],
        'hashtag': label,
        'user_id': user['id_str'],
        'score': elem['score'],
    }

    # Category values are copied under their original names.
    for category in ('astroturf', 'fake_follower', 'financial', 'other',
                     'overall', 'self_declared', 'spammer'):
        record[category] = categories[category]

    # Raw activity values; the original author marked all of these as
    # still needing normalization.
    record['most_recent_post_time'] = analysis['timeOfMostRecentPost']
    record['recent_tweets_per_week'] = analysis['recentTweetsPerWeek']
    record['number_of_tweets'] = analysis['numberOfTweets']
    record['following'] = analysis['numberOfFollowees']
    record['followers'] = analysis['numberOfFollowers']
    record['numberOfLikes'] = analysis['numberOfLikes']

    return record


for file in files:
    # Fail loudly instead of silently reusing the previous file's label when a
    # path is missing from every group list.
    if file not in label_by_file:
        raise ValueError(f"No hashtag label defined for input file: {file}")
    label = label_by_file[file]

    # The context manager closes the handle even if parsing fails; JSON is
    # read as UTF-8 rather than the platform default encoding.
    with open(file, encoding="utf-8") as f:
        arr = json.load(f)

    for elem in arr['result']:
        if elem['score'] > MIN_SCORE:
            clean_arr.append(_clean_record(elem, label))

# One row per retained account, written to disk and echoed to the cell output.
df = pd.DataFrame(clean_arr)

df.to_csv('testdata.csv', index=False)

print(df)


    user_screen_name lang hashtag              user_id  score  astroturf  \
0       TnNguynNgc12   vi    #bot  1406983013095776259   0.92       0.14   
1    PienaNashetania   in    #bot           4586856613   1.00       0.01   
2           inthreek   en    #bot           1516297345   0.83       0.02   
3        a_kitsumaru   ja    #bot           2158721414   0.97       0.00   
4    cephalopodluke2   en    #bot   983177740965773312   0.80       0.71   
..               ...  ...     ...                  ...    ...        ...   
408        MaeGamoza   en    #bts  1302198200862793730   0.80       0.10   
409        OcaRafita   en    #bts  1439213752247721988   0.79       0.07   
410       Erika_3697   en    #bts   855094963436236801   0.85       0.22   
411         chi_1698   en    #bts   805002303350505472   0.86       0.11   
412   mori_min_9597_   en    #bts           3012391237   0.88       0.42   

     fake_follower  financial  other  overall  self_declared  spammer  \
0             