## Import all modules

In [2]:
import yaml
import json
import sqlite3
from datetime import datetime
import pandas as pd
import sklearn
from json_flatten import flatten
from sklearn import tree
import numpy as np
from sklearn.cluster import KMeans


## Retrieve all users with a bot rating above 3.5 from the SQLite database

In [3]:
# Read every user_info row whose bot_rating is above 3.5 from the SQLite
# file and turn the rows into a DataFrame whose columns are named after the
# columns reported by the cursor.
conn = sqlite3.connect('goodbot_table.db')
try:
    cur = conn.cursor()

    cur.execute("SELECT * FROM user_info WHERE bot_rating > 3.5")

    #Check how many users in total
    #cur.execute("SELECT * FROM user_info WHERE bot_rating IS NOT NULL")

    user_info_all_result = cur.fetchall()
    # The first item of each cur.description entry is the column name.
    user_info_column_names = [description[0] for description in cur.description]
finally:
    # Close the connection even when the query or the fetch raises, so the
    # database handle is never left open by a failed run of this cell.
    conn.close()

user_info_bot_dataframe = pd.DataFrame(user_info_all_result,
                                       columns=user_info_column_names)
print(f"No of user info stored: {len(user_info_all_result)}\n")

No of user info stored: 101



## Load JSON attributes for the GoodBot (#bot) and TBT (#tbt) accounts into a dataframe

In [48]:
# Build one flat record per account whose score is above MIN_BOT_SCORE,
# tag it with the hashtag of the file it came from, collect the records in a
# DataFrame, write them to testdata.csv and print the frame.

# Accounts are kept only when their 'score' is strictly greater than this.
MIN_BOT_SCORE = 0.7

# Keys copied unchanged from each element's 'categories' mapping, in the
# order in which they appear as DataFrame columns.
CATEGORY_KEYS = ['astroturf', 'fake_follower', 'financial', 'other',
                 'overall', 'self_declared', 'spammer']

# NOTE(review): 'Bot_060_088' and 'Bot_088_132' both mention 088 -- confirm
# that no account ends up in both files.
bot_files = ['./GoodBot_parameters/Bot_001_019.json',
             './GoodBot_parameters/Bot_020_059.json',
             './GoodBot_parameters/Bot_060_088.json',
             './GoodBot_parameters/Bot_088_132.json']

tbt_files = ['./TBT_parameters/TBT_001_050.json',
             './TBT_parameters/TBT_051_100.json',
             './TBT_parameters/TBT_101_150.json',
             './TBT_parameters/TBT_151_200.json',
             './TBT_parameters/TBT_201_250.json',
             './TBT_parameters/TBT_251_300.json',
             './TBT_parameters/TBT_301_350.json',
             './TBT_parameters/TBT_351_400.json',
             './TBT_parameters/TBT_401_450.json',
             './TBT_parameters/TBT_451_500.json',
             './TBT_parameters/TBT_501_550.json',
             './TBT_parameters/TBT_551_600.json',
             './TBT_parameters/TBT_601_650.json',
            ]

# Every input file, GoodBot files first and TBT files second. Deriving the
# list from the two groups keeps it from drifting out of sync with them.
files = bot_files + tbt_files

# Hashtag attached to the records of each file. Building the mapping up
# front means a file without a known group fails loudly with a KeyError
# instead of silently reusing the label of the previous file.
label_by_file = {path: "#bot" for path in bot_files}
label_by_file.update({path: "#tbt" for path in tbt_files})

clean_arr = []

for file in files:
    label = label_by_file[file]

    # The context manager closes the file even if parsing fails.
    with open(file) as f:
        arr = json.load(f)

    for elem in arr['result']:
        if elem['score'] > MIN_BOT_SCORE:
            user = elem['user']
            categories = elem['categories']

            clean_elem = {
                'user_screen_name': user['screen_name'],
                'lang': user['lang'],
                'hashtag': label,
                'user_id': user['id_str'],
                'score': elem['score'],
            }
            for key in CATEGORY_KEYS:
                clean_elem[key] = categories[key]

            # The counters under elem['analysis'] (recentTweetsPerWeek,
            # numberOfTweets, numberOfFollowees, numberOfFollowers,
            # numberOfLikes) are currently left out; earlier notes mark them
            # as needing normalization before they can be used.

            clean_arr.append(clean_elem)

df = pd.DataFrame(clean_arr)

df.to_csv('testdata.csv',index=False)

print(df)

    user_screen_name lang hashtag              user_id  score  astroturf  \
0       TnNguynNgc12   vi    #bot  1406983013095776259   0.92       0.14   
1    PienaNashetania   in    #bot           4586856613   1.00       0.01   
2           inthreek   en    #bot           1516297345   0.83       0.02   
3        a_kitsumaru   ja    #bot           2158721414   0.97       0.00   
4    cephalopodluke2   en    #bot   983177740965773312   0.80       0.71   
..               ...  ...     ...                  ...    ...        ...   
201      chelsiwonda   en    #tbt            414806837   0.83       0.41   
202  ThelmaBartlet14   en    #tbt  1454861618572140548   0.81       0.29   
203  windychicago123   en    #tbt  1264280222532341761   0.94       0.27   
204         atlxpomx   en    #tbt  1422584353159323649   0.76       0.07   
205        BBiitchhh  und    #tbt  1105688148464459777   0.97       0.10   

     fake_follower  financial  other  overall  self_declared  spammer  
0             0