In [30]:
import torch
import numpy as np
import pandas as pd
import json
import os
from transformers import pipeline
from datetime import datetime as dt
from torch.utils.data import Dataset
from tqdm import tqdm

In [32]:
class Twibot22(Dataset):
    def __init__(self,root='./Data/',device='cpu',process=True,save=True):
        self.root = root
        self.device = device
        if process:
            print('Loading train.json')
            df_train=pd.read_json('./twibot20.json')
            self.df_data_labeled=df_train
            self.df_data=df_train
            self.df_data=self.df_data
            self.df_data_labeled=self.df_data_labeled
            self.save=save
        
    def load_labels(self):
        print('Loading labels...',end='   ')
        path=self.root+'label.pt'
        if not os.path.exists(path):
            labels=torch.LongTensor(self.df_data_labeled['label']).to(self.device)
            if self.save:
                torch.save(labels,'./Data/label.pt')
        else:
            labels=torch.load(self.root+"label.pt").to(self.device)
        print('Finished')
        
        return labels
    
    def Des_Preprocess(self):
        print('Loading raw feature1...',end='   ')
        path=self.root+'description.npy'
        if not os.path.exists(path):
            description=[]
            for i in range (self.df_data.shape[0]):
                if self.df_data['profile'][i] is None or self.df_data['profile'][i]['description'] is None:
                    description.append('None')
                else:
                    description.append(self.df_data['profile'][i]['description'])
            description=np.array(description)
            if self.save:
                np.save(path,description)
        else:
            description=np.load(path,allow_pickle=True)
        print('Finished')
        return description

    def Des_embbeding(self):
        print('Running feature1 embedding')
        path=self.root+"des_tensor.pt"
        if not os.path.exists(path):
            description=np.load(self.root+'description.npy',allow_pickle=True)
            print('Loading RoBerta')
            feature_extraction = pipeline('feature-extraction', model="distilroberta-base", tokenizer="distilroberta-base",device=0)
            des_vec=[]
            #for (j,each) in tqdm(enumerate(description)):
            for each in tqdm(description):
                feature=torch.Tensor(feature_extraction(each))
                for (i,tensor) in enumerate(feature[0]):
                    if i==0:
                        feature_tensor=tensor
                    else:
                        feature_tensor+=tensor
                feature_tensor/=feature.shape[1]
                des_vec.append(feature_tensor)
                #if (j%1000==0):
                    #print('[{:>6d}/229580]'.format(j+1))
            des_tensor=torch.stack(des_vec,0).to(self.device)
            if self.save:
                torch.save(des_tensor,'./Data/des_tensor.pt')
        else:
            des_tensor=torch.load(self.root+"des_tensor.pt").to(self.device)
        print('Finished')
        return des_tensor
    
    def tweets_preprocess(self):
        print('Loading raw feature2...',end='   ')
        path=self.root+'tweets.npy'
        if not os.path.exists(path):
            tweets=[]
            for i in range (self.df_data.shape[0]):
                one_usr_tweets=[]
                if self.df_data['tweet'][i] is None:
                    one_usr_tweets.append('')
                else:
                    for each in self.df_data['tweet'][i]:
                        one_usr_tweets.append(each)
                tweets.append(one_usr_tweets)
            tweets=np.array(tweets)
            if self.save:
                np.save(path,tweets)
        else:
            tweets=np.load(path,allow_pickle=True)
        print('Finished')
        return tweets
    
    def tweets_embedding(self):
        print('Running feature2 embedding')
        path=self.root+"tweets_tensor.pt"
        if not os.path.exists(path):
            tweets=np.load("./Data/tweets.npy",allow_pickle=True)
            print('Loading RoBerta')
            feature_extract=pipeline('feature-extraction',model='roberta-base',tokenizer='roberta-base',device=0,padding=True, truncation=True,max_length=500, add_special_tokens = True)
            tweets_list=[]
            for each_person_tweets in tqdm(tweets):
                for j,each_tweet in enumerate(each_person_tweets):
                    each_tweet_tensor=torch.tensor(feature_extract(each_tweet))
                    for k,each_word_tensor in enumerate(each_tweet_tensor[0]):
                        if k==0:
                            total_word_tensor=each_word_tensor
                        else:
                            total_word_tensor+=each_word_tensor
                    total_word_tensor/=each_tweet_tensor.shape[1]
                    if j==0:
                        total_each_person_tweets=total_word_tensor
                    else:
                        total_each_person_tweets+=total_word_tensor
                total_each_person_tweets/=len(each_person_tweets)
                tweets_list.append(total_each_person_tweets)
                #if (i%500==0):
                    #print('[{:>6d}/229580]'.format(i+1))
            tweet_tensor=torch.stack(tweets_list).to(self.device)
            if self.save:
                torch.save(tweet_tensor,path)
        else:
            tweets_tensor=torch.load(self.root+"tweets_tensor.pt").to(self.device)
        print('Finished')
        return tweets_tensor
    
    def num_prop_preprocess(self):
        print('Processing feature3...',end='   ')
        path0=self.root+'num_properties_tensor.pt'
        if not os.path.exists(path0):
            path=self.root
            if not os.path.exists(path+"followers_count.pt"):
                followers_count=[]
                for i in range (self.df_data.shape[0]):
                    if self.df_data['profile'][i] is None or self.df_data['profile'][i]['followers_count'] is None:
                        followers_count.append(0)
                    else:
                        followers_count.append(self.df_data['profile'][i]['followers_count'])
                followers_count=torch.tensor(np.array(followers_count,dtype=np.float32)).to(self.device)
                if self.save:
                    torch.save(followers_count,path+"followers_count.pt")
            
                friends_count=[]
                for i in range (self.df_data.shape[0]):
                    if self.df_data['profile'][i] is None or self.df_data['profile'][i]['friends_count'] is None:
                        friends_count.append(0)
                    else:
                        friends_count.append(self.df_data['profile'][i]['friends_count'])
                friends_count=torch.tensor(np.array(friends_count,dtype=np.float32)).to(self.device)
                if self.save:
                    torch.save(friends_count,path+'friends_count.pt')
            
                screen_name_length=[]
                for i in range (self.df_data.shape[0]):
                    if self.df_data['profile'][i] is None or self.df_data['profile'][i]['screen_name'] is None:
                        screen_name_length.append(0)
                    else:
                        screen_name_length.append(len(self.df_data['profile'][i]['screen_name']))
                screen_name_length=torch.tensor(np.array(screen_name_length,dtype=np.float32)).to(self.device)
                if self.save:
                    torch.save(screen_name_length,path+'screen_name_length.pt')
            
                favourites_count=[]
                for i in range (self.df_data.shape[0]):
                    if self.df_data['profile'][i] is None or self.df_data['profile'][i]['favourites_count'] is None:
                        favourites_count.append(0)
                    else:
                        favourites_count.append(self.df_data['profile'][i]['favourites_count'])
                favourites_count=torch.tensor(np.array(favourites_count,dtype=np.float32)).to(self.device)
                if self.save:
                    torch.save(favourites_count,path+'favourites_count.pt')
                
                active_days=[]
                date0=dt.strptime('Tue Sep 1 00:00:00 +0000 2020 ','%a %b %d %X %z %Y ')
                for i in range (self.df_data.shape[0]):
                    if self.df_data['profile'][i] is None or self.df_data['profile'][i]['created_at'] is None:
                        active_days.append(0)
                    else:
                        date=dt.strptime(self.df_data['profile'][i]['created_at'],'%a %b %d %X %z %Y ')
                        active_days.append((date0-date).days)
                active_days=torch.tensor(np.array(active_days,dtype=np.float32)).to(self.device)
                if self.save:
                    torch.save(active_days,path+'active_days.pt')
                
                statuses_count=[]
                for i in range (self.df_data.shape[0]):
                    if self.df_data['profile'][i] is None or self.df_data['profile'][i]['statuses_count'] is None:
                        statuses_count.append(0)
                    else:
                        statuses_count.append(int(self.df_data['profile'][i]['statuses_count']))
                statuses_count=torch.tensor(np.array(statuses_count,dtype=np.float32)).to(self.device)
                if self.save:
                    torch.save(statuses_count,path+'statuses_count.pt')
                
            else:
                active_days=torch.load(path+"active_days.pt")
                screen_name_length=torch.load(path+"screen_name_length.pt")
                favourites_count=torch.load(path+"favourites_count.pt")
                followers_count=torch.load(path+"followers_count.pt")
                friends_count=torch.load(path+"friends_count.pt")
                statuses_count=torch.load(path+"statuses_count.pt")
            
            active_days=pd.Series(active_days.to('cpu').detach().numpy())
            active_days=(active_days-active_days.mean())/active_days.std()
            active_days=torch.tensor(np.array(active_days))

            screen_name_length=pd.Series(screen_name_length.to('cpu').detach().numpy())
            screen_name_length_days=(screen_name_length-screen_name_length.mean())/screen_name_length.std()
            screen_name_length_days=torch.tensor(np.array(screen_name_length_days))

            favourites_count=pd.Series(favourites_count.to('cpu').detach().numpy())
            favourites_count=(favourites_count-favourites_count.mean())/favourites_count.std()
            favourites_count=torch.tensor(np.array(favourites_count))

            followers_count=pd.Series(followers_count.to('cpu').detach().numpy())
            followers_count=(followers_count-followers_count.mean())/followers_count.std()
            followers_count=torch.tensor(np.array(followers_count))

            friends_count=pd.Series(friends_count.to('cpu').detach().numpy())
            friends_count=(friends_count-friends_count.mean())/friends_count.std()
            friends_count=torch.tensor(np.array(friends_count))

            statuses_count=pd.Series(statuses_count.to('cpu').detach().numpy())
            statuses_count=(statuses_count-statuses_count.mean())/statuses_count.std()
            statuses_count=torch.tensor(np.array(statuses_count))

            num_prop=torch.cat((followers_count.reshape([229580,1]),friends_count.reshape([229580,1]),favourites_count.reshape([229580,1]),statuses_count.reshape([229580,1]),screen_name_length_days.reshape([229580,1]),active_days.reshape([229580,1])),1).to(self.device)

            if self.save:
                torch.save(num_prop,"./Data/num_prop.pt")
            
        else:
            num_prop=torch.load(self.root+"num_properties_tensor.pt").to(self.device)
        print('Finished')
        return num_prop
    
    def cat_prop_preprocess(self):
        print('Processing feature4...',end='   ')
        path=self.root+'cat_properties_tensor.pt'
        if not os.path.exists(path):
            category_properties=[]
            properties=['protected','geo_enabled','verified','contributors_enabled','is_translator','is_translation_enabled','profile_background_tile','profile_use_background_image','has_extended_profile','default_profile','default_profile_image']
            for i in range (self.df_data.shape[0]):
                prop=[]
                if self.df_data['profile'][i] is None:
                    for i in range(11):
                        prop.append(0)
                else:
                    for each in properties:
                        if self.df_data['profile'][i][each] is None:
                            prop.append(0)
                        else:
                            if self.df_data['profile'][i][each] == "True ":
                                prop.append(1)
                            else:
                                prop.append(0)
                prop=np.array(prop)
                category_properties.append(prop)
            category_properties=torch.tensor(np.array(category_properties,dtype=np.float32)).to(self.device)
            if self.save:
                torch.save(category_properties,self.root+'category_properties.pt')
        else:
            category_properties=torch.load(self.root+"cat_properties_tensor.pt").to(self.device)
        print('Finished')
        return category_properties
    
    def Build_Graph(self):
        print('Building graph',end='   ')
        path=self.root+'edge_index.pt'
        if not os.path.exists(path):
            id2index_dict={id:index for index,id in enumerate(self.df_data['ID'])}
            edge_index=[]
            edge_type=[]
            for i,relation in enumerate(self.df_data['neighbor']):
                if relation is not None:
                    for each_id in relation['following']:
                        try:
                            target_id=id2index_dict[int(each_id)]
                        except KeyError:
                            continue
                        else:
                            edge_index.append([i,target_id])
                        edge_type.append(0)
                    for each_id in relation['follower']:
                        try:
                            target_id=id2index_dict[int(each_id)]
                        except KeyError:
                            continue
                        else:
                            edge_index.append([i,target_id])
                        edge_type.append(1)
                else:
                    continue
            edge_index=torch.tensor(edge_index,dtype=torch.long).t().contiguous().to(self.device)
            edge_type=torch.tensor(edge_type,dtype=torch.long).to(self.device)
            if self.save:
                torch.save(edge_index,self.root+"edge_index.pt")
                torch.save(edge_type,self.root+"edge_type.pt")
        else:
            edge_index=torch.load(self.root+"edge_index.pt").to(self.device)
            edge_type=torch.load(self.root+"edge_type.pt").to(self.device)
            print('Finished')
        return edge_index,edge_type
    
    def train_val_test_mask(self):
        if self.root=='./Data/':
            train_idx=range(8278)
            val_idx=range(8278,8278+2365)
            test_idx=range(8278+2365,8278+2365+1183)
        else:
            train_idx=torch.load(self.root+'train_idx.pt')
            val_idx=torch.load(self.root+'val_idx.pt')
            test_idx=torch.load(self.root+'test_idx.pt')
            
        return train_idx,val_idx,test_idx
        
        
    def dataloader(self):
        labels=self.load_labels()
        #self.Des_Preprocess()
        des_tensor=self.Des_embbeding()
        #self.tweets_preprocess()
        tweets_tensor=self.tweets_embedding()
        num_prop=self.num_prop_preprocess()
        category_prop=self.cat_prop_preprocess()
        edge_index,edge_type=self.Build_Graph()
        train_idx,val_idx,test_idx=self.train_val_test_mask()
        return des_tensor,tweets_tensor,num_prop,category_prop,edge_index,edge_type,labels,train_idx,val_idx,test_idx

In [36]:
dataset=Twibot22(root='./Data20/',device='cpu',process=True,save=True)
#des_tensor,tweets_tensor,num_prop,category_prop,edge_index,edge_type,labels,train_idx,val_idx,test_idx=dataset.dataloader()

Loading train.json


In [38]:
import datetime
from pytz import timezone

bot_accounts = pd.concat([pd.read_csv('data/social_spambots_1.csv'), pd.read_csv('data/social_spambots_2.csv'), pd.read_csv('data/social_spambots_3.csv')]).reset_index(drop=True)
clean_accounts = pd.read_csv('data/geniune_accounts.csv')

requiredColumns = ['screen_name', 'created_at', 'updated', 'location', 'verified', 'statuses_count', 'friends_count','followers_count', 'favourites_count', 'default_profile_image', 'profile_use_background_image', 'protected', 'default_profile']
bot_accounts = bot_accounts#[requiredColumns]
clean_accounts = clean_accounts#[requiredColumns]

def clean_df(df):
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['updated'] = pd.to_datetime(df['updated'])
    #print(df['updated'])
    #print()
    #print(df['created_at'])
    df['age'] = ((df['updated'].dt.tz_localize('UTC') - df['created_at']) / np.timedelta64(1, 'D')).astype('int') #modified this line from original repo code.
    #df['age'] = (df['updated'].dt.tz_localize('UTC') - df['created_at']).astype('timedelta64[D]').astype(int)
    #print(df['age'])
    df['has_location'] = df['location'].apply(lambda x: 0 if x==x else 1)
    df['has_avatar'] = df['default_profile_image'].apply(lambda x: 1 if x==x else 0)
    df['has_background'] = df['profile_use_background_image'].apply(lambda x: 1 if x==x else 0)
    df['is_verified']=df['verified'].apply(lambda x: 1 if x==x else 0)
    df['is_protected']=df['protected'].apply(lambda x: 1 if x==x else 0)
    df['profile_modified'] = df['default_profile'].apply(lambda x: 0 if x==x else 1)

    #properties=['protected','geo_enabled','verified','contributors_enabled','is_translator','is_translation_enabled','profile_background_tile','profile_use_background_image','has_extended_profile','default_profile','default_profile_image']
    
    return df[['screen_name', 'age', 'has_location', 'is_verified', 'statuses_count', 'friends_count', 'followers_count', 'favourites_count', 'has_avatar', 'has_background', 'is_protected', 'profile_modified', 'protected', 'geo_enabled','verified','contributors_enabled','is_translator','profile_background_tile','profile_use_background_image','default_profile','default_profile_image', 'description']]

bot_accounts = clean_df(bot_accounts)
clean_accounts = clean_df(clean_accounts)

bot_accounts['BotOrNot'] = 1
clean_accounts['BotOrNot'] = 0

combined_df = pd.concat([bot_accounts, clean_accounts], ignore_index=True)

  df['created_at'] = pd.to_datetime(df['created_at'])
  df['created_at'] = pd.to_datetime(df['created_at'])


In [40]:
bot_accounts.head()

Unnamed: 0,screen_name,age,has_location,is_verified,statuses_count,friends_count,followers_count,favourites_count,has_avatar,has_background,...,geo_enabled,verified,contributors_enabled,is_translator,profile_background_tile,profile_use_background_image,default_profile,default_profile_image,description,BotOrNot
0,davideb66,2555,1,0,1299,40,22,1,1,1,...,1.0,,,,,1.0,1.0,1.0,,1
1,ElisaDospina,2521,0,0,18665,3442,12561,16358,0,1,...,1.0,,,,1.0,1.0,,,Autrice del libro #unavitatuttacurve dal 9 apr...,1
2,Vladimir65,2497,0,0,22987,755,600,14,0,1,...,,,,,1.0,1.0,,,[Live Long and Prosper],1
3,RafielaMorales,2435,0,0,7975,350,398,11,0,1,...,,,,,1.0,1.0,,,"Cuasi Odontologa*♥,#Bipolar, #Sarcastica & Som...",1
4,FabrizioC_c,2413,0,0,20218,405,413,162,0,1,...,1.0,,,,,1.0,,,"I shall rise from my own death, to avenge hers...",1


In [42]:
clean_accounts.head()

Unnamed: 0,screen_name,age,has_location,is_verified,statuses_count,friends_count,followers_count,favourites_count,has_avatar,has_background,...,geo_enabled,verified,contributors_enabled,is_translator,profile_background_tile,profile_use_background_image,default_profile,default_profile_image,description,BotOrNot
0,0918Bask,1008,0,0,2177,332,208,265,0,0,...,1.0,,,,,,,,15years ago X.Lines24,0
1,1120Roll,672,0,0,2660,485,330,3972,0,1,...,1.0,,,,,1.0,1.0,,保守見習い地元大好き人間。 経済学、電工、仏教を勉強中、ちなDeではいかんのか？ (*^◯^*),0
2,14KBBrown,1776,1,0,1254,177,166,1185,0,1,...,,,,,1.0,1.0,,,Let me see what your best move is!,0
3,wadespeters,2006,0,0,202968,981,2248,60304,0,1,...,1.0,,,,,1.0,,,20. menna: #farida #nyc and the 80s actually y...,0
4,191a5bd05da04dc,403,0,0,82,79,21,5,0,1,...,,,,,,1.0,1.0,,Cosmetologist,0


In [44]:
combined_df.shape[0]

8386

In [46]:
combined_df["followers_count"][0]

22

In [48]:
def load_labels1(df, root='./Data/', save=True, device='cpu'):
    print('Loading labels...',end='   ')
    path=root+'label.pt'
    if not os.path.exists(path):
        labels=torch.LongTensor(df['BotOrNot']).to(device)
        if save:
            torch.save(labels,'./Data/label.pt')
    else:
        labels=torch.load(root+"label.pt").to(device)
    print('Finished')
    
    return labels

In [50]:
def Des_Preprocess1(df, root='./Data/', save=True, device='cpu'):
    print('Loading raw feature1...',end='   ')
    path=root+'description.npy'
    if not os.path.exists(path):
        description=[]
        for i in range (df.shape[0]):
            if df['description'][i] is None:
                description.append('None')
            else:
                description.append(df['description'][i])
        description=np.array(description)
        if save:
            np.save(path,description)
    else:
        description=np.load(path,allow_pickle=True)
    print('Finished')
    return description

def Des_embbeding1(root='./Data/', save=True, device='cpu'):
    print('Running feature1 embedding')
    path=root+"des_tensor.pt"
    if not os.path.exists(path):
        description=np.load(root+'description.npy',allow_pickle=True)
        print('Loading RoBerta')
        feature_extraction = pipeline('feature-extraction', model="distilroberta-base", tokenizer="distilroberta-base",device=0)
        des_vec=[]
        #for (j,each) in tqdm(enumerate(description)):
        for each in tqdm(description):
            feature=torch.Tensor(feature_extraction(each))
            for (i,tensor) in enumerate(feature[0]):
                if i==0:
                    feature_tensor=tensor
                else:
                    feature_tensor+=tensor
            feature_tensor/=feature.shape[1]
            des_vec.append(feature_tensor)
            #if (j%1000==0):
                #print('[{:>6d}/229580]'.format(j+1))
        des_tensor=torch.stack(des_vec,0).to(device)
        if save:
            torch.save(des_tensor,'./Data/des_tensor.pt')
    else:
        des_tensor=torch.load(root+"des_tensor.pt").to(device)
    print('Finished')
    return des_tensor

In [52]:
def tweets_preprocess1(df, root='./Data/', save=True, device='cpu'):
    print('Loading raw feature2...',end='   ')
    path=root+'tweets.npy'
    if not os.path.exists(path):
        tweets=[]
        for i in range(df.shape[0]):
            one_usr_tweets=[]
            one_usr_tweets.append('')
            tweets.append(one_usr_tweets)
        tweets=np.array(tweets)
        if save:
            np.save(path,tweets)
    else:
        tweets=np.load(path,allow_pickle=True)
    print('Finished')
    return tweets

In [54]:
def tweets_embedding1(root='./Data/', save=True, device='cpu'):
    print('Running feature2 embedding')
    path=root+"tweets_tensor.pt"
    if not os.path.exists(path):
        tweets=np.load("./Data/tweets.npy",allow_pickle=True)
        print('Loading RoBerta')
        feature_extract=pipeline('feature-extraction',model='roberta-base',tokenizer='roberta-base',device=0,padding=True, truncation=True,max_length=500, add_special_tokens = True)
        print("pipeline Done")
        tweets_list=[]
        for each_person_tweets in tqdm(tweets):
            for j,each_tweet in enumerate(each_person_tweets):
                print("j:", j)
                each_tweet_tensor=torch.tensor(feature_extract(each_tweet))
                for k,each_word_tensor in enumerate(each_tweet_tensor[0]):
                    if k==0:
                        total_word_tensor=each_word_tensor
                    else:
                        total_word_tensor+=each_word_tensor
                total_word_tensor/=each_tweet_tensor.shape[1]
                if j==0:
                    total_each_person_tweets=total_word_tensor
                else:
                    total_each_person_tweets+=total_word_tensor
            total_each_person_tweets/=len(each_person_tweets)
            tweets_list.append(total_each_person_tweets)
            #if (i%500==0):
                #print('[{:>6d}/229580]'.format(i+1))
        tweet_tensor=torch.stack(tweets_list).to(device)
        if save:
            torch.save(tweet_tensor,path)
    else:
        tweets_tensor=torch.load(root+"tweets_tensor.pt").to(device)
    print('Finished')
    return tweets_tensor

In [56]:
def num_prop_preprocess1(df, root='./Data/', save=True, device='cpu'):
    print('Processing feature3...',end='   ')
    path0=root+'num_properties_tensor.pt'
    if not os.path.exists(path0):
        path=root
        if not os.path.exists(path+"followers_count.pt"):
            followers_count=[]
            for i in range(df.shape[0]):
                if  df['followers_count'][i] is None:
                    followers_count.append(0)
                else:
                    followers_count.append(df['followers_count'][i])
            followers_count=torch.tensor(np.array(followers_count,dtype=np.float32)).to(device)
            if save:
                torch.save(followers_count,path+"followers_count.pt")
        
            friends_count=[]
            for i in range (df.shape[0]):
                if  df['friends_count'][i] is None:
                    friends_count.append(0)
                else:
                    friends_count.append(df['friends_count'][i])
            friends_count=torch.tensor(np.array(friends_count,dtype=np.float32)).to(device)
            if save:
                torch.save(friends_count,path+'friends_count.pt')
        
            screen_name_length=[]
            for i in range (df.shape[0]):
                if  df['screen_name'][i] is None:
                    screen_name_length.append(0)
                else:
                    screen_name_length.append(len(df['screen_name'][i]))
            screen_name_length=torch.tensor(np.array(screen_name_length,dtype=np.float32)).to(device)
            if save:
                torch.save(screen_name_length,path+'screen_name_length.pt')
        
            favourites_count=[]
            for i in range (df.shape[0]):
                if  df['favourites_count'][i] is None:
                    favourites_count.append(0)
                else:
                    favourites_count.append(df['favourites_count'][i])
            favourites_count=torch.tensor(np.array(favourites_count,dtype=np.float32)).to(device)
            if save:
                torch.save(favourites_count,path+'favourites_count.pt')
            
            active_days=[]
            for i in range (df.shape[0]):
                if  df['age'][i] is None:
                    active_days.append(0)
                else:
                    active_days.append(df['age'][i])
            active_days=torch.tensor(np.array(active_days,dtype=np.float32)).to(device)
            if save:
                torch.save(active_days,path+'active_days.pt')
            
            statuses_count=[]
            for i in range (df.shape[0]):
                if  df['statuses_count'][i] is None:
                    statuses_count.append(0)
                else:
                    statuses_count.append(int(df['statuses_count'][i]))
            statuses_count=torch.tensor(np.array(statuses_count,dtype=np.float32)).to(device)
            if save:
                torch.save(statuses_count,path+'statuses_count.pt')
            
        else:
            active_days=torch.load(path+"active_days.pt")
            screen_name_length=torch.load(path+"screen_name_length.pt")
            favourites_count=torch.load(path+"favourites_count.pt")
            followers_count=torch.load(path+"followers_count.pt")
            friends_count=torch.load(path+"friends_count.pt")
            statuses_count=torch.load(path+"statuses_count.pt")
        
        active_days=pd.Series(active_days.to('cpu').detach().numpy())
        active_days=(active_days-active_days.mean())/active_days.std()
        active_days=torch.tensor(np.array(active_days))

        screen_name_length=pd.Series(screen_name_length.to('cpu').detach().numpy())
        screen_name_length_days=(screen_name_length-screen_name_length.mean())/screen_name_length.std()
        screen_name_length_days=torch.tensor(np.array(screen_name_length_days))

        favourites_count=pd.Series(favourites_count.to('cpu').detach().numpy())
        favourites_count=(favourites_count-favourites_count.mean())/favourites_count.std()
        favourites_count=torch.tensor(np.array(favourites_count))

        followers_count=pd.Series(followers_count.to('cpu').detach().numpy())
        followers_count=(followers_count-followers_count.mean())/followers_count.std()
        followers_count=torch.tensor(np.array(followers_count))

        friends_count=pd.Series(friends_count.to('cpu').detach().numpy())
        friends_count=(friends_count-friends_count.mean())/friends_count.std()
        friends_count=torch.tensor(np.array(friends_count))

        statuses_count=pd.Series(statuses_count.to('cpu').detach().numpy())
        statuses_count=(statuses_count-statuses_count.mean())/statuses_count.std()
        statuses_count=torch.tensor(np.array(statuses_count))

        #print("followers_count shape:", followers_count.shape)
        #print("friends_count shape:", friends_count.shape)
        #print("favourites_count shape:", favourites_count.shape)
        #print("statuses_count shape:", statuses_count.shape)
        #print("screen_name_length_days shape:", screen_name_length_days.shape)
        #print("active_days shape:", active_days.shape)
        #num_properties_tensor=torch.cat([followers_count,active_days,screen_name_length,following_count,statues],dim=1)
        num_prop=torch.cat((followers_count.reshape([df.shape[0],1]),
                            active_days.reshape([df.shape[0],1]),
                            screen_name_length_days.reshape([df.shape[0],1]),
                            friends_count.reshape([df.shape[0],1]),
                            statuses_count.reshape([df.shape[0],1])),1).to(device)

        
        #num_prop=torch.cat((followers_count.reshape([df.shape[0],1]),friends_count.reshape([df.shape[0],1]),favourites_count.reshape([df.shape[0],1]),statuses_count.reshape([df.shape[0],1]),screen_name_length_days.reshape([df.shape[0],1]),active_days.reshape([df.shape[0],1])),1).to(device)

        if save:
            torch.save(num_prop,"./Data/num_prop.pt")
        
    else:
        num_prop=torch.load(root+"num_properties_tensor.pt").to(device)
    print('Finished')
    return num_prop

In [58]:
def cat_prop_preprocess1(df, root='./Data/', save=True, device='cpu'):
    print('Processing feature4...',end='   ')
    path=root+'cat_properties_tensor.pt'
    if not os.path.exists(path):
        category_properties=[]
        #properties=['protected','geo_enabled','verified','contributors_enabled','is_translator','is_translation_enabled','profile_background_tile','profile_use_background_image','has_extended_profile','default_profile','default_profile_image']
        properties=['default_profile_image']
        for i in range (df.shape[0]):
            prop=[]
            for each in properties:
                if each == 'is_translation_enabled' or each == 'has_extended_profile':
                    prop.append(0)
                else:
                    if df[each][i] is None:
                        prop.append(0)
                    else:
                        if df[each][i] == 1:
                            prop.append(1)
                        else:
                            prop.append(0)
            prop=np.array(prop)
            category_properties.append(prop)
        category_properties=torch.tensor(np.array(category_properties,dtype=np.float32)).reshape([df.shape[0],1]).to(device)
        if save:
            torch.save(category_properties,root+'category_properties.pt')
    else:
        category_properties=torch.load(root+"cat_properties_tensor.pt").to(device)
    print('Finished')
    return category_properties

In [60]:
def Build_Graph1(root='./Data/', save=True, device='cpu'):
        print('Building graph',end='   ')
        path=root+'edge_index0.pt'
        if not os.path.exists(path):
            edge_index=torch.empty(2, 0)
            edge_type=[]
            #edge_index.append(0,0)
            #edge_index.append([[],[]])
            edge_index=torch.tensor(edge_index,dtype=torch.long).t().contiguous().to(device)
            edge_type=torch.tensor(edge_type,dtype=torch.long).to(device)
            if save:
                torch.save(edge_index,root+"edge_index.pt")
                torch.save(edge_type,root+"edge_type.pt")
        else:
            edge_index=torch.load(root+"edge_index.pt").to(device)
            edge_type=torch.load(root+"edge_type.pt").to(device)
            print('Finished')
        return edge_index,edge_type

In [62]:
labels=load_labels1(combined_df)
print(labels.shape)

Loading labels...   Finished
torch.Size([8386])


In [64]:
Des_Preprocess1(combined_df)
des_tensor=Des_embbeding1()

Loading raw feature1...   Finished
Running feature1 embedding
Finished


In [66]:
tweets_preprocess1(combined_df)
tweets_tensor=tweets_embedding1()

Loading raw feature2...   Finished
Running feature2 embedding
Finished


In [68]:
num_prop=num_prop_preprocess1(combined_df)
print(num_prop.shape)

#num_prop1=num_prop_preprocess1(combined_df)
#print(num_prop1.shape)

Processing feature3...   Finished
torch.Size([8386, 5])


NameError: name 'num_prop1' is not defined

In [None]:
category_prop=cat_prop_preprocess1(combined_df)

In [None]:
edge_index,edge_type=Build_Graph1()
edge_index=torch.empty(2, 0).type(torch.int)
print(edge_index)
print(edge_type)

In [None]:
edge_index1,edge_type1=Build_Graph1()
print(edge_index1)
print(edge_type1)
unique_values, counts = torch.unique(edge_index1, return_counts=True)
print(counts)

In [None]:
train_idx1=torch.load('./Data/'+'train_idx.pt')
val_idx1=torch.load('./Data/'+'val_idx.pt')
test_idx1=torch.load('./Data/'+'test_idx.pt')

index = []
for i in range(num_prop.shape[0]):
    #print(i)
    index.append(i)

index_tens = torch.tensor(index) 
print(index_tens.shape)
print(index_tens)


In [None]:
import torch
from torch import nn
from torch_geometric.nn import RGCNConv,FastRGCNConv,GCNConv,GATConv
import torch.nn.functional as F

class BotRGCN(nn.Module):
    def __init__(self,des_size=768,tweet_size=768,num_prop_size=5,cat_prop_size=3,embedding_dimension=128,dropout=0.3):
        super(BotRGCN, self).__init__()
        self.dropout = dropout
        self.linear_relu_des=nn.Sequential(
            nn.Linear(des_size,int(embedding_dimension/4)),
            nn.LeakyReLU()
        )
        self.linear_relu_tweet=nn.Sequential(
            nn.Linear(tweet_size,int(embedding_dimension/4)),
            nn.LeakyReLU()
        )
        self.linear_relu_num_prop=nn.Sequential(
            nn.Linear(num_prop_size,int(embedding_dimension/4)),
            nn.LeakyReLU()
        )
        self.linear_relu_cat_prop=nn.Sequential(
            nn.Linear(cat_prop_size,int(embedding_dimension/4)),
            nn.LeakyReLU()
        )
        
        self.linear_relu_input=nn.Sequential(
            nn.Linear(embedding_dimension,embedding_dimension),
            nn.LeakyReLU()
        )
        
        self.rgcn=RGCNConv(embedding_dimension,embedding_dimension,num_relations=2)
        
        self.linear_relu_output1=nn.Sequential(
            nn.Linear(embedding_dimension,embedding_dimension),
            nn.LeakyReLU()
        )
        self.linear_output2=nn.Linear(embedding_dimension,2)
        
        
        
    def forward(self,des,tweet,num_prop,cat_prop,edge_index,edge_type):
        d=self.linear_relu_des(des)
        t=self.linear_relu_tweet(tweet)
        n=self.linear_relu_num_prop(num_prop)
        c=self.linear_relu_cat_prop(cat_prop)
        x=torch.cat((d,t,n,c),dim=1)
        
        x=self.linear_relu_input(x)
        x=self.rgcn(x,edge_index,edge_type)
        x=F.dropout(x,p=self.dropout,training=self.training)
        x=self.rgcn(x,edge_index,edge_type)
        x=self.linear_relu_output1(x)
        x=self.linear_output2(x)
            
        return x
            

In [None]:
embedding_size,dropout,lr,weight_decay=32,0.1,1e-2,5e-2

model=BotRGCN(cat_prop_size=1,embedding_dimension=embedding_size).to('cpu')
loss=nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(),
                    lr=lr,weight_decay=weight_decay)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve,auc

def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

def init_weights(m):
    if type(m)==nn.Linear:
        nn.init.kaiming_uniform_(m.weight)


def test(test_idx):
    model.eval()
    output = model(des_tensor,tweets_tensor,num_prop,category_prop,edge_index,edge_type)
    loss_test = loss(output[test_idx], labels[test_idx])
    acc_test = accuracy(output[test_idx], labels[test_idx])
    output=output.max(1)[1].to('cpu').detach().numpy()
    label=labels.to('cpu').detach().numpy()
    f1=f1_score(label[test_idx],output[test_idx])
    #mcc=matthews_corrcoef(label[test_idx], output[test_idx])
    precision=precision_score(label[test_idx],output[test_idx])
    recall=recall_score(label[test_idx],output[test_idx])
    fpr, tpr, thresholds = roc_curve(label[test_idx], output[test_idx], pos_label=1)
    Auc=auc(fpr, tpr)
    print("Test set results:",
            "test_loss= {:.4f}".format(loss_test.item()),
            "test_accuracy= {:.4f}".format(acc_test.item()),
            "precision= {:.4f}".format(precision.item()),
            "recall= {:.4f}".format(recall.item()),
            "f1_score= {:.4f}".format(f1.item()),
            #"mcc= {:.4f}".format(mcc.item()),
            "auc= {:.4f}".format(Auc.item()),
            )

In [None]:
model.load_state_dict(torch.load('BotRGCN_weight.pth'))

In [None]:
test(index_tens)