In [1]:
# import random
import pandas as pd
import gc
from pathlib import Path
import os
from collections import defaultdict
from fastparquet import ParquetFile

verbosity = 5

twibot_path = r"/dataset/twibot22"
twibot_user = r"/dataset/twibot22/user.json"
twibot_label = r"/dataset/twibot22/label.csv"

# Some tasks might be multithreadable. Set the max number of workers here.
concurrent_max_workers = 2

# Files in the path specified by twibot_path, that begin with %twibot_node_identifier_str%, will be assumed as node files and converted if needed.
twibot_node_identifier_str = "tweet_" 

generated_data_output = r"/dataset/twibot22/generated_data" # output is saved in this directory
ls_userdata_output = rf"{generated_data_output}/userdata.jsonl" # the desired filename of bot detail output

sample_set_size_per_label = 50000 # per label, sample this many users
sample_set_stratification = True # If at any point during selection our set becomes unbalanced, should we stratify?
sample_set_strategy = (1,0) # (1over/0under, 1major/0minor)
sample_randomization_before_selection = True # Should we shuffle the samples after selecting and before constraining the set?
sample_randomization_after_constraints = True # Should we shuffle the samples after selecting, constraints, and stratification of the set?

graph_sampling_depth = 3

def debug_print(m, level=5, r=None):
    if level <= verbosity:
        print(m)
        if r:
            raise r

def is_data(name, _dir=generated_data_output):
    file_path = os.path.join(_dir, f"{name}.parquet")
    return os.path.exists(file_path)
    
    
def get_data(name, _dir=generated_data_output,pqargs={},**kwargs):
    if is_data(name, _dir):
        file_path = os.path.join(_dir, f"{name}.parquet")
        print(f"Loading existing data from {file_path}")
        pf = ParquetFile(file_path, **pqargs)
        return pf.to_pandas(**kwargs)
    return False
        
def save_data(name, df, _dir=generated_data_output, **kwargs):
    file_path = os.path.join(_dir, f"{name}.parquet")
    debug_print(f"Saving data to {file_path}", 3)
    os.makedirs(_dir, exist_ok=True)
    fastparquet.write(file_path, df, **kwargs)
    #df.to_parquet(file_path, **kwargs)
    return df        

def _shuffle(df):
    return df.sample(frac = 1)
    
shuffle_method = _shuffle

# To quietly stop cell execution
class StopExecution(Exception):
    def _render_traceback_(self):
        return []


import json
def get_post_counts():
    tweetNodeFilesParquet = list(filter(lambda fileName: twibot_node_identifier_str in fileName, 
                                        [child.name for child in Path(generated_data_output).iterdir()]))
    post_count_dict = defaultdict(int)
    debug_print(f"Called: get_post_counts", 5)
    for targetFile in tweetNodeFilesParquet:
        targetInput = Path(f"{generated_data_output}/{targetFile}")
        try:
            debug_print("Looking in " + targetInput.__str__(), 5)
            pf = ParquetFile(targetInput)
            df = pf.to_pandas(columns=['author_id'])
            for uid in df['author_id']:
                post_count_dict[uid] = post_count_dict[uid] + 1
            del pf, df
            gc.collect()
        except Exception as e:
            debug_print(f"Failed to load node parquet: {e}", 5)
            raise RuntimeError("Error processing Parquet files.")
    debug_print(f"Completed: get_post_counts", 5)
    return post_count_dict
    
NODE_FILE_LIST = list(filter(lambda fileName: twibot_node_identifier_str in fileName, 
                                        [child.name for child in Path(generated_data_output).iterdir()]))

## Lockstep, the full recipe, Part 1: Pre-processing Stage 2
### Step 5: Prepare additional feature columns

In [2]:
import pandas as pd
def ensure_required_columns(target_dataframe):
    new_columns = {
        
    # Phase 1
    'following_followers_ratio' : 'float', 
    'tweet_followers_ratio' : 'float', 
    'tweet_following_ratio' : 'float', 
    'sampled_post_count': 'int32',

    'profile_desc_len': 'uint32', 
    'profile_username_len': 'int16', 
    'profile_has_location': 'boolean', 

    # TODO: Significance debatable
    'profile_desc_mentions_count': 'int16', 
    'profile_desc_hashtag_count' : 'int16', 
    'profile_desc_url_count' : 'int16', 

    # Phase 2
    'tweet_has_media_ratio': 'float', 
    'tweet_has_geo_ratio': 'float', 


    # Phase 3
    'total_rt': 'uint32', 
    'total_likes': 'uint32', 
    'total_quotes': 'uint32',
    'average_rt': 'float', 
    'average_likes': 'float', 
    'average_quotes': 'float', 

    'likes_chi': 'float', 
    'rts_chi': 'float', 

    'likes_zero_ratio': 'float', 
    'rts_zero_ratio' : 'float', 


     # Phase 4
    'entropy_between_post_times': 'float', 
    'entropy_between_post_hours': 'float', 
    'entropy_between_post_days': 'float', 
    'entropy_between_post_weekdays': 'float', 



     # Phase 5
    'tweet_has_hashtags_ratio': 'float',
    'tweet_has_urls_ratio': 'float', 
    'tweet_urls_total': 'uint32', 
    'tweet_hashtags_total': 'uint32', 
    'avg_hashtags_in_tweet': 'float',
    'avg_urls_in_tweet': 'float', 
    'tweet_urls_top_x': 'object', 
    'tweet_hashtags_top_x': 'object', 


    # What happened to these? A.K.A TODO, with a possibility of won't. They just aren't strong enough factors for the computation required.
    'tweet_has_hashtag_weekday_entropy': 'float', 
    'tweet_has_hashtag_hour_entropy': 'float', 
    'tweet_has_url_weekday_entropy': 'float', 
    'tweet_has_url_hour_entropy': 'float', 
    }
    
    for column, dtype in new_columns.items(): 
        if column not in target_dataframe.columns:
            target_dataframe[column] = pd.Series(dtype=dtype) 
        else:
            target_dataframe[column] = pd.astype(dtype)

## Step 6: Populate additional feature columns
### Warning! This step takes -time-. Up to ten minutes+. 

There's only so much you can do on limited hardware when you are already using concurrency to your benefit and vectorized methods as much as possible. Ten minutes is an okay ask to be able to pre-process a million users and seventy million posts on a variety of stats, right?

In [3]:

import pandas as pd
from collections import defaultdict, Counter, OrderedDict
from pathlib import Path
from fastparquet import ParquetFile
import json

from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from scipy.stats import entropy
from urllib.parse import urlparse
import hashlib
import math
from scipy.stats import chisquare
from decimal import Decimal
import re
import shutil
import datetime as dt
import scipy
import numpy as np
import pickle
import datetime
import time
import threading

# Create a lock to use for updating the user details dataframe, to prevent race conditions.
update_lock = threading.Lock()

# Create a lock to use for reading data from parquet into memory
read_lock = threading.Lock()

chunk_size = 100000
user_detail_data = {}

# Get all of the initial data that we have
debug_print(f"Loading data from users parquet at {generated_data_output}",1)
try:
    user_detail_data = get_data("users")
    user_detail_data['id'] = user_detail_data['id'].astype('UInt64')
    user_detail_data.set_index('id',inplace=True)
    user_detail_data['posts'] = [[] for _ in range(len(user_detail_data))]
except Exception as e:
    debug_print(f"Failed to load user parquet. {e}",5)  
    raise StopExecution

debug_print("Loaded users parquet.",4)
debug_print(f"Shape: {user_detail_data.shape}", 5)
debug_print(f"Types: {user_detail_data.dtypes}", 5)
debug_print(f"Index: {user_detail_data.index.dtype}", 5)

# Unused. At one point, was used to calculate the entropy of intervals between posts out of a set of posts. But then, I realized it was worthless unless you had -all the posts in sequence-. We don't.
def calculate_time_interval_entropy(time_intervals, num_bins='auto'):
    if len(time_intervals) <= 1:
        return 0  # Not enough data to calculate meaningful entropy
    # Create a histogram of time intervals to obtain frequency distribution
    hist, bin_edges = np.histogram(time_intervals, bins=num_bins, density=True)
    hist = hist[hist > 0]
    probabilities = hist / hist.sum()
    return entropy(probabilities)

def get_post_chunks(cols = '*', index="author_id", pqargs={}, pdkwargs={}, margs={}):
    # Result: Dataframe, index with one or more features.
    # Index: from users
    global NODE_FILE_LIST, read_lock
    result_builder = None
    with read_lock:
        for targetFile in NODE_FILE_LIST:
            targetInput = Path(f"{generated_data_output}/{targetFile}")
            debug_print(f"Extracting from {targetInput.__str__()}...", 5)
            
            if cols != '*':
                # Set the columns to pull from the parquet, either through pqargs directly or here, through cols
                pdkwargs['columns'] = cols
                
            pdkwargs['index'] = index       
            try:
                pfinput = ParquetFile(targetInput, **pqargs)    
                process_group = pfinput.to_pandas(**pdkwargs)  
                result_builder = pd.concat([result_builder, process_group])         
            except Exception as e:
                debug_print(f"Failed to load node parquet: {e}", 5)
                raise RuntimeError("Error processing Parquet files.")
    return result_builder

def append_stats_phase_1(target_dataframe):
    global update_lock
    print("Running phase_1")
    with update_lock:
        post_counts = get_post_counts()
        target_dataframe['sampled_post_count'] = target_dataframe.index.map(lambda x: post_counts.get(int(str(x).strip('ut')),0))

        target_dataframe['profile_desc_len'] = target_dataframe['description'].apply(len).fillna(0)
        target_dataframe['profile_username_len'] = target_dataframe['username'].apply(len).fillna(0)
        target_dataframe['profile_has_location'] = target_dataframe['location'].apply(lambda v: len(v)>1 if v is not None else False)
        
        target_dataframe['following_followers_ratio'] = target_dataframe['following_count'].div(target_dataframe['followers_count'], fill_value=0).fillna(0)
        target_dataframe['tweet_followers_ratio'] = target_dataframe['tweet_count'].div(target_dataframe['followers_count'], fill_value=0).fillna(0)
        target_dataframe['tweet_following_ratio'] = target_dataframe['tweet_count'].div(target_dataframe['following_count'], fill_value=0).fillna(0)
        #['following_followers_ratio', 'tweet_followers_ratio', 'tweet_following_ratio']
        #todo:
        #target_dataframe['profile_desc_mentions_count'] = target_dataframe['tweet_count']
        #target_dataframe['profile_desc_hashtag_count']
        #target_dataframe['profile_desc_url_count']
    debug_print(f"Appended new features to dataframe. New shape: {target_dataframe.shape}", 5)
    del post_counts

def append_stats_phase_2e(target_dataframe):
    global update_lock
    print("Running phase_2")
    
    def filt(x):
        """Filter function to check valid entries."""
        return not (x is None or pd.isna(x) or x in ['None', 'nan', '<NA>', []])
    
    columns_to_process = ['geo', 'media']
    post_chunks_ = get_post_chunks(cols=['geo', 'media'])
    
    aggs = {
        f'tweet_has_{col}_ratio': pd.NamedAgg(
            column=col, 
            aggfunc=lambda s: s.apply(filt).mean()
        ) 
        for col in columns_to_process
    }
    
    grpFrame = post_chunks_.groupby("author_id", sort=False)
    countFrame = grpFrame.agg(**aggs)
   
    with update_lock:       
        target_dataframe.update(countFrame)

def append_stats_phase_3(target_dataframe):
    print("Running phase_3")
    def extract_leading_digits(series):
        series = series.dropna()
        series = series[series > 0] 
        if series.empty:
            return None
        leading_digits = (series.astype(int) // 10 ** (np.floor(np.log10(series)).astype(int))).astype(int)
        return leading_digits
    
    def calculate_chi_for_group(series, benford_probs):
        leading_digits = extract_leading_digits(series)
        if leading_digits is None:
            return np.nan
        observed_counts = np.bincount(leading_digits, minlength=10)[1:10]  # Skip 0 (invalid leading digit)
        total_observed = observed_counts.sum()
        if total_observed == 0:
            return np.nan
        chi_squared = np.sum((observed_counts - total_observed * benford_probs) ** 2 / (total_observed * benford_probs))
        return chi_squared    
    
    
    def calculate_chi_vectorized(series, benford_probs):
        """
        Vectorized calculation of the chi-squared distance for a column.
        """
        series = series.dropna()
        series = series[series > 0]

        if series.empty:
            return np.nan

        first_digits = (series.astype(int) // 10 ** (np.floor(np.log10(series)).astype(int))).astype(int)
        observed_counts = np.bincount(first_digits, minlength=10)[1:10]
        total_observed = observed_counts.sum()
        if total_observed == 0:
            return np.nan

        return np.sum((observed_counts - total_observed * benford_probs) ** 2 / (total_observed * benford_probs))

    def chunk_3_optimized(target_dataframe, columns):
        global update_lock
        post_chunks_ = get_post_chunks(cols=columns if isinstance(columns, list) else [columns])
        aggs = {
            'total_rt': ('retweet_count', 'sum'),
            'total_likes': ('like_count', 'sum'),
            'total_quotes': ('quote_count', 'sum'),
            
            'average_rt': ('retweet_count', 'mean'),
            'average_likes': ('like_count', 'mean'),
            'average_quotes': ('quote_count', 'mean'),
        }

        grouped = post_chunks_.groupby('author_id', sort=False)
        agg_results = grouped.agg(**aggs)
        
        # Compute chi-squared for Benford's Law
        benford_probs = np.log10(1 + 1 / np.arange(1, 10))
        
        def calculate_chi(grouped_data, column_name):
            results = {}
            for author_id, group in grouped_data:
                chi_value = calculate_chi_for_group(group[column_name], benford_probs)
                results[author_id] = chi_value
            return pd.Series(results)
        
        likes_chi = calculate_chi(grouped, 'like_count')
        rts_chi = calculate_chi(grouped, 'retweet_count')

        agg_results['likes_chi'] = likes_chi
        agg_results['rts_chi'] = rts_chi
        
        with update_lock:
            target_dataframe.update(agg_results)
        
    chunk_3_optimized(target_dataframe, ['retweet_count', 'like_count', 'quote_count'])


def append_stats_phase_4(target_dataframe):
    print("Running phase_4")

    def calculate_entropy_from_probs(probabilities):
        """Calculate entropy given pre-computed probabilities."""
        return -np.sum(probabilities * np.log2(probabilities + 1e-9))  # Avoid log2(0)

    def chunk_4_(target_dataframe, column):
        # Extract post chunks
        global update_lock
        post_chunks_ = get_post_chunks(cols=[column])
        post_chunks_['hour'] = post_chunks_[column].dt.hour
        post_chunks_['day'] = post_chunks_[column].dt.day
        post_chunks_['weekday'] = post_chunks_[column].dt.weekday

        # Pre-compute value counts and normalize for each group
        grouped = post_chunks_.groupby('author_id', sort=False)

        # Normalize probabilities for each column
        hour_probs = grouped['hour'].value_counts(normalize=True).unstack(fill_value=0)
        day_probs = grouped['day'].value_counts(normalize=True).unstack(fill_value=0)
        weekday_probs = grouped['weekday'].value_counts(normalize=True).unstack(fill_value=0)

        # Calculate entropy for each group
        entropy_hour = -np.sum(hour_probs * np.log2(hour_probs + 1e-9), axis=1)
        entropy_day = -np.sum(day_probs * np.log2(day_probs + 1e-9), axis=1)
        entropy_weekday = -np.sum(weekday_probs * np.log2(weekday_probs + 1e-9), axis=1)

        aggregated = pd.DataFrame({
            'entropy_between_post_hours': entropy_hour,
            'entropy_between_post_days': entropy_day,
            'entropy_between_post_weekdays': entropy_weekday
        })

        # Update the target_dataframe within a locked context
        with update_lock:
            target_dataframe.update(aggregated)

    chunk_4_(target_dataframe, 'created_at')

def append_stats_phase_5(target_dataframe):

    print("Running phase_5")
    def get_bag(series):
        """
        Extracts domain names from the 'expanded_url' or retrieves the 'tag' properties in the series.
        Each entry in the series is a list of dictionaries.
        """
        bag = []
        for item in series.dropna():  # Skip NaN entries
            if item not in ['<NA>', 'nan', '[]']:  # Ignore invalid entries
                try:
                    item = json.loads(item) 
                    for subitem in item:
                        if isinstance(subitem, dict):
                            if 'expanded_url' in subitem:
                                url = subitem['expanded_url']
                                domain = url.split('/')[2] if '//' in url else url
                                bag.append(domain)
                            elif 'tag' in subitem:
                                bag.append(subitem['tag'])
                except (json.JSONDecodeError, TypeError):
                    continue
        return bag

    def get_top_ten(series):
        """
        Counts occurrences of each URL or tag from get_bag() output
        and returns the top ten as a dictionary with counts.
        """
        bag = get_bag(series)
        return dict(Counter(bag).most_common(10))
        
    def filt_to_notna_mask(x):
        if x is None or pd.isna(x):
            return False    
        if x in ['<NA>','[]']:
            return False
        return True

    def chunk_5_(target_dataframe, columns):
        """
            Process a group of columns: urls, hashtags
        """
        post_chunks_ = get_post_chunks(cols=columns)  # Retrieve relevant chunks

        # Define aggregations
        aggs = {}
        for column in columns:
            # tweet_has_x_ratio: number of tweets in which there is at least 1 x as a ratio of all
            # avg_x_in_tweet: average number of x per tweet
            # tweet_x_total: total x in all tweets
            # tweet_x_top_x: the top 10, in frequency of occurence, x among all tweets
            aggs.update({
                f'tweet_has_{column}_ratio': pd.NamedAgg(column=column, aggfunc=lambda x: x.apply(filt_to_notna_mask).mean()),
                f'avg_{column}_in_tweet': pd.NamedAgg(column=column, aggfunc=lambda x: x.str.len().sum() / x.size),
                f'tweet_{column}_total': pd.NamedAgg(column=column, aggfunc=lambda x: x.str.len().sum()),
                f'tweet_{column}_top_x': pd.NamedAgg(column=column, aggfunc=lambda x: get_top_ten(x)) # Start here
            })

        # Group and aggregate with our methods
        grpFrame = post_chunks_.groupby("author_id", sort=False)
        countFrame_update = grpFrame.agg(**aggs)

        with update_lock:
            # update the target dataframe in place
            for column in columns:
                target_dataframe.update(countFrame_update)
        
    # Process each column
    chunk_5_(target_dataframe, ['urls', 'hashtags'])

pd.options.mode.chained_assignment = None
user_list_length = user_detail_data.shape[0]
debug_print(f"Total users: {user_detail_data.shape[0]}. Pre-processing.", 3)
ensure_required_columns(user_detail_data)
append_stats_phase_1(user_detail_data)
from concurrent.futures import ThreadPoolExecutor

class ThreadedStatsProcessor:
    def __init__(self, user_detail_data):
        self.user_detail_data = user_detail_data

    def run_all_phases(self):
        methods = [
            append_stats_phase_2e,
            append_stats_phase_3,
            append_stats_phase_4,
            append_stats_phase_5
        ]

        max_concurrent_threads = concurrent_max_workers # Limit the number of threads
        with ThreadPoolExecutor(max_workers=max_concurrent_threads) as executor:
            # Submit tasks to the executor
            futures = {executor.submit(method, self.user_detail_data): method.__name__ for method in methods}

            for future in futures:
                method_name = futures[future]
                try:
                    future.result() 
                    print(f"{method_name} completed successfully.")
                except Exception as e:
                    print(f"Error in {method_name}: {e} {e.__traceback__} {e.__cause__}")
                    raise e
                    
processor = ThreadedStatsProcessor(user_detail_data)
processor.run_all_phases()

def storeData(db):
    dbfile = open('temp', 'ab')
    pickle.dump(db, dbfile)                    
    dbfile.close()
    
print("Saving all aggregated information for reload...")
storeData(processor.user_detail_data)
print("Finished. Reload kernel and cell 1 to clear memory, then proceed to the next setup steps.")

Loading data from users parquet at /dataset/twibot22/generated_data
Loading existing data from /dataset/twibot22/generated_data/users.parquet
Loaded users parquet.
Shape: (1000000, 17)
Types: created_at              datetime64[ns, UTC]
description                          object
location                             object
name                                 object
url                                  object
username                             object
label                                object
followers_count                       int64
following_count                       int64
tweet_count                           int64
listed_count                          int64
url.urls                             object
description.urls                     object
description.mentions                 object
description.hashtags                 object
description.cashtags                 object
posts                                object
dtype: object
Index: UInt64
Total users: 1000000. Pre-proces

[  60559.0, 2732441.0,  149112.0,  365014.0,      <NA>,      <NA>,  277517.0,
 3987671.0,  323205.0,      49.0,
 ...
      47.0,  168394.0,       2.0,  106157.0,       1.0,      14.0,     248.0,
     579.0,   28504.0,       0.0]
Length: 1000000, dtype: Float64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  target_dataframe.update(agg_results)
[ 18512.0,    239.0,     25.0,   1469.0,     <NA>,     <NA>, 302629.0,
    657.0,   2657.0,    179.0,
 ...
     55.0,      1.0,     67.0,   1943.0,      3.0,     54.0,    456.0,
   1585.0,      4.0,      2.0]
Length: 1000000, dtype: Float64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  target_dataframe.update(agg_results)
[263.0,   3.0,   0.0,  13.0,  <NA>,  <NA>, 865.0,   6.0, 106.0,   4.0,
 ...
   0.0,   0.0,   0.0,   2.0,   0.0,   0.0,   1.0,   0.0,   0.0,   0.0]
Length: 1000000, dtype: Float64' has dtype incompatible with float64, please explicitly cast 

append_stats_phase_3 completed successfully.
append_stats_phase_4 completed successfully.
append_stats_phase_5 completed successfully.
Saving all aggregated information for reload...
Finished. Reload kernel and cell 1 to clear memory, then proceed to the next setup steps.


# Step 7: Reload kernel, load pickled populated data, and then save data in it's final parquet format.
At this point, there is a ton of memory wasted in Jupyter. Perhaps you might be able to run this cell in sequence, but you'll get better and faster results from restarting the kernel, loading the constants/commons in cell 1, and then skipping directly to the cell below!

In [4]:
import pandas as pd
import pickle
import fastparquet
def sdata(name, _dir=generated_data_output, df=None, **kwargs):
    if df is None:
            raise ValueError("No dataframe provided to save.")
    file_path = os.path.join(_dir, f"{name}.parquet")
    print(f"Saving data to {file_path}")
    os.makedirs(_dir, exist_ok=True)  # Ensure the directory exists
    fastparquet.write(file_path, data=df)
    return df        
    
def _shuffle(df):
    return df.sample(frac = 1)
    
def loadData():
    dbfile = open('temp', 'rb')    
    return pickle.load(dbfile)
    
l_data = loadData()
print(l_data.head(1))

sdata("assembled_user_details", df=l_data)

                                   created_at  \
id                                              
1217628182611927040 2020-01-16 02:02:55+00:00   

                                                           description  \
id                                                                       
1217628182611927040  Theoretical Computer Scientist. See also https...   

                          location        name                      url  \
id                                                                        
1217628182611927040  Cambridge, MA  Boaz Barak  https://t.co/BoMip9FF17   

                         username  label  followers_count  following_count  \
id                                                                           
1217628182611927040  boazbaraktcs  human             7316              215   

                     tweet_count  ...  tweet_urls_total tweet_hashtags_total  \
id                                ...                                          
12176281

Unnamed: 0_level_0,created_at,description,location,name,url,username,label,followers_count,following_count,tweet_count,...,tweet_urls_total,tweet_hashtags_total,avg_hashtags_in_tweet,avg_urls_in_tweet,tweet_urls_top_x,tweet_hashtags_top_x,tweet_has_hashtag_weekday_entropy,tweet_has_hashtag_hour_entropy,tweet_has_url_weekday_entropy,tweet_has_url_hour_entropy
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1217628182611927040,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,"Cambridge, MA",Boaz Barak,https://t.co/BoMip9FF17,boazbaraktcs,human,7316,215,3098,...,39940.0,27598.0,24.208772,35.035088,"{'twitter.com': 110, 'horoscoponegro.com': 29,...","{'Aries': 323, 'ARIES': 286, 'aries': 3, 'Part...",,,,
2664730894,2014-07-02 17:56:46+00:00,creative _,🎈,olawale 💨,,wale_io,human,123,1090,1823,...,5154.0,4005.0,4.009009,5.159159,"{'twitter.com': 5, 'xkcd.com': 1, 'swag.github...",{},,,,
1266703520205549568,2020-05-30 12:10:45+00:00,👽,,panagiota_.b,,b_panagiota,human,3,62,66,...,252.0,252.0,4.000000,4.000000,{},{},,,,
1089159225148882949,2019-01-26 13:52:49+00:00,mama to maya. ABIM research pathway fellow @UV...,"Charlottesville, VA","Jacqueline Hodges, MD MPH",,jachodges_md,human,350,577,237,...,12324.0,2976.0,11.534884,47.767442,{'twitter.com': 61},"{'professionalizeMICROBIOLOGY': 3, 'COVID19': ...",,,,
36741729,2009-04-30 19:01:42+00:00,Father / SWT Alumnus / Longhorn Fan,United States,Matthew Stubblefield,,Matthew_Brody,bot,240,297,3713,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1151138281,2013-02-05 14:50:17+00:00,イラストACは高品質イラストアート/年賀状等が全無料DL可能♪AIベクター・EPS形式素材全...,↓利用者600万人無料素材サイト↓　商用利用編集ＯＫ表記不要,フリー素材集かわいい無料イラストAC/おしゃれフレーム枠★IllustACイラストレーター,https://t.co/L6PE11Blkl,Illustratorjpn,human,1877,2057,101849,...,10230.0,126.0,3.000000,243.571429,"{'www.ac-illust.com': 40, 'twitter.com': 4, 'm...",{'artfair': 1},,,,
1339035361,2013-04-09 12:09:34+00:00,next➬未定 紫･緑ﾃﾞｨｯｷ 色々な曲聴きます,OKAYAMA CITY,りょうやん,https://t.co/NjDtATyqGc,_y3oa,human,13952,5334,1137495,...,597.0,163.0,3.790698,13.883721,"{'c.cocacola.co.jp': 1, 'www.nmb48.com': 1, 'g...",{},,,,
318636852,2011-06-16 20:09:29+00:00,Heart of a lion with a Mind of a maniac. Louis...,"Lake Charles, LA",Gavin Cecchini,,GavinCecchini2,human,13743,183,964,...,2934.0,1605.0,38.214286,69.857143,"{'twitter.com': 8, 'www.instagram.com': 5, 'ww...",{'最新記事': 1},,,,
43443354,2009-05-30 00:25:19+00:00,"Marketplace Minister, Christ follower, Indepen...",Rockhampton Australia,Martin Allan,https://t.co/r3R5Bkng9m,MartinfromOz,human,2460,2935,35256,...,1660.0,272.0,5.913043,36.086957,"{'twitter.com': 6, 'youtu.be': 2, 'wordle.dani...","{'UPDATE': 1, 'StructureFire': 1}",,,,


In [5]:
user_detail_data = get_data("assembled_user_details")
display(user_detail_data.loc[(user_detail_data['tweet_has_media_ratio'] > 0) & (user_detail_data['tweet_has_media_ratio'] < 1)].head(10))

Loading existing data from /dataset/twibot22/generated_data/assembled_user_details.parquet


Unnamed: 0_level_0,created_at,description,location,name,url,username,label,followers_count,following_count,tweet_count,...,tweet_urls_total,tweet_hashtags_total,avg_hashtags_in_tweet,avg_urls_in_tweet,tweet_urls_top_x,tweet_hashtags_top_x,tweet_has_hashtag_weekday_entropy,tweet_has_hashtag_hour_entropy,tweet_has_url_weekday_entropy,tweet_has_url_hour_entropy
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1217628182611927040,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,"Cambridge, MA",Boaz Barak,https://t.co/BoMip9FF17,boazbaraktcs,human,7316,215,3098,...,39940.0,27598.0,24.208772,35.035088,"{'twitter.com': 110, 'horoscoponegro.com': 29,...","{'Aries': 323, 'ARIES': 286, 'aries': 3, 'Part...",,,,
2664730894,2014-07-02 17:56:46+00:00,creative _,🎈,olawale 💨,,wale_io,human,123,1090,1823,...,5154.0,4005.0,4.009009,5.159159,"{'twitter.com': 5, 'xkcd.com': 1, 'swag.github...",{},,,,
1089159225148882949,2019-01-26 13:52:49+00:00,mama to maya. ABIM research pathway fellow @UV...,"Charlottesville, VA","Jacqueline Hodges, MD MPH",,jachodges_md,human,350,577,237,...,12324.0,2976.0,11.534884,47.767442,{'twitter.com': 61},"{'professionalizeMICROBIOLOGY': 3, 'COVID19': ...",,,,
15211869,2008-06-23 20:59:59+00:00,"Director, Knowledge Ecology International, an ...","ÜT: 38.911326,-77.04508",James Love,https://t.co/mcNZxOR7gv,jamie_love,human,10299,2166,57397,...,130600.0,65339.0,54.223237,108.381743,"{'twitter.com': 396, 'bit.ly': 93, 'ow.ly': 27...","{'cdntech': 88, 'WITCanada2021': 40, 'CTAConne...",,,,
138814032,2010-04-30 17:36:51+00:00,Militante peronista. Vicepresidenta de la Repú...,Argentina,Cristina Kirchner,https://t.co/P8WemOJelF,CFKArgentina,human,5994250,241,15538,...,80816.0,15119.0,14.939723,79.857708,"{'twitter.com': 292, 'bit.ly': 20, 'penntoday....","{'LISprochat': 22, 'ASEEVC': 17, 'BiotechCommo...",,,,
457554412,2012-01-07 15:05:53+00:00,They/Them\nhttps://t.co/UvtxD9uZtX,"London, England",samsmith,https://t.co/UvtxD9uZtX,samsmith,human,7982826,1302,14644,...,221345.0,111886.0,111.997998,221.566567,"{'twitter.com': 648, 'ow.ly': 392, 'bit.ly': 1...","{'CX': 106, 'CustomerExperience': 83, 'CCM': 6...",,,,
2465283662,2014-04-27 00:20:12+00:00,"paper tweets, dms are open",,AK,,ak92501,bot,45541,1206,9194,...,306610.0,45330.0,24.423491,165.199353,"{'arxiv.org': 622, 'twitter.com': 380, 'github...","{'5G': 41, 'MobiledgeX': 34, 'TelcoEdgeCloud':...",,,,
1467973039883182090,2021-12-06 21:44:04+00:00,https://t.co/Hmg5gBvd9A,,صارا,,_3rw_,human,1573,1688,146,...,26861.0,12236.0,26.6,58.393478,"{'twitter.com': 128, 'bit.ly': 4, 'www.ferrari...","{'RoyalRumble': 129, 'royalrumble': 30, 'Sanre...",,,,
234059290,2011-01-04 19:11:39+00:00,Come for the science (genetics & cell biology)...,"Salt Lake City,UT, USA",Professor Booty PhD,https://t.co/pKcvVO96Yk,ProfBootyPhD,human,4694,4739,91381,...,121595.0,53362.0,39.941617,91.014222,"{'twitter.com': 398, 'bit.ly': 31, 'youtu.be':...","{'COVID19': 54, 'AzadiKaAmritMahotsav': 26, 'I...",,,,
1142890104853106688,2019-06-23 20:20:09+00:00,Quite likely the most eminent worm wrangler in...,,wyomingwormboy,,wyomingwormboy,human,2913,587,3343,...,73917.0,20143.0,17.500434,64.219809,"{'twitter.com': 152, '0.zailuo.cn': 29, 'www.n...","{'PEC': 93, 'ETH': 58, 'BTC': 58, 'neuroscienc...",,,,
