# ***GET HAND CRAFTED FEATURES***

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

Mounted at /content/drive


In [3]:
import os
import numpy as np
import pandas as pd
from csv import writer
import xml.etree.ElementTree as ET

In [4]:
from transformers import BertTokenizer, TFBertModel, logging
import numpy as np

class EmbeddingGenerator:
    """Turn tweet text into fixed-size vectors with ``bert-base-cased``.

    The tokenizer and the TensorFlow BERT model are loaded once per instance
    in ``__init__``; every method below reuses them.
    """

    # Class-level placeholders; each instance overwrites them in __init__.
    tokenizer = None
    model = None

    # BERT has a fixed number of position embeddings, so longer inputs must be
    # cut before they reach the model (512 for the bert-base checkpoints).
    MAX_TOKENS = 512

    def __init__(self):
        # Silence the informational messages transformers prints while loading.
        logging.set_verbosity_error()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.model = TFBertModel.from_pretrained('bert-base-cased')

    def get_individual_tweet_embedding(self, user_tweet):
        """Return the embedding of a single piece of text as a NumPy array.

        Args:
            user_tweet: The text to embed.

        Returns:
            The NumPy value of element ``[1][0]`` of the model output, i.e.
            the second output of the model for the first (only) batch item.
            NOTE(review): for Hugging Face BERT models the second output is
            the pooled output -- confirm against the installed version.
        """
        # Without truncation, text longer than MAX_TOKENS tokens (notably the
        # joined text built by get_all_tweet_embeddings_bulk) would overflow
        # the model's position embeddings and fail.
        encoded_text = self.tokenizer(
            user_tweet,
            return_tensors='tf',
            truncation=True,
            max_length=self.MAX_TOKENS,
        )
        output = self.model(encoded_text)[1][0]
        return output.numpy()

    def get_all_tweet_embeddings_individual(self, user_tweets):
        """Embed every tweet separately.

        Args:
            user_tweets: Iterable of tweet strings.

        Returns:
            A NumPy array with one embedding per tweet, in input order. An
            empty input yields an empty array of shape ``(0,)``.
        """
        output_list = [
            self.get_individual_tweet_embedding(user_tweet)
            for user_tweet in user_tweets
        ]
        return np.array(output_list)

    def get_all_tweet_embeddings_bulk(self, user_tweets):
        """Embed all tweets at once by joining them with single spaces.

        Args:
            user_tweets: Iterable of tweet strings.

        Returns:
            One embedding for the concatenated text. Because the input is
            truncated to MAX_TOKENS tokens, tweets beyond that limit do not
            contribute to the result.
        """
        input_text = ' '.join(user_tweets)
        encoding = self.get_individual_tweet_embedding(input_text)

        return np.array(encoding)

In [9]:
!pip install httpx
!pip install emoji
!pip install interruptingcow
!pip install vaderSentiment
import os
import re
import httpx
import emoji
import asyncio
import requests
from interruptingcow import timeout
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [10]:
def get_emoji_count(tweet):
    """Return how many emoji ``emoji.emoji_count`` finds in ``tweet``."""
    total_emoji = emoji.emoji_count(tweet)
    return total_emoji

def get_hashtag_count(tweet):
    """Count the hashtags in a tweet.

    A hashtag is a ``#`` immediately followed by at least one word character
    (letter, digit or underscore); a lone ``#`` is not counted.

    Args:
        tweet: The tweet text.

    Returns:
        The number of hashtags found.
    """
    # Raw string: "\w" inside a normal literal is an invalid escape sequence,
    # which newer Python versions warn about and will eventually reject.
    hashtag_list = re.findall(r"#(\w+)", tweet)
    return len(hashtag_list)

# Shared VADER analyzer, created on first use by _get_sentiment_analyzer().
_SENTIMENT_ANALYZER = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER


def get_sentiment_score(tweet):
    """Return the VADER neutral and compound sentiment scores of a tweet.

    Args:
        tweet: The tweet text.

    Returns:
        A ``(neutral, compound)`` tuple taken from the ``'neu'`` and
        ``'compound'`` entries of ``polarity_scores``.
    """
    # Reuse one analyzer: this function is called once per tweet, and building
    # a new analyzer for every call repeats its setup work for no benefit.
    sentiment_dict = _get_sentiment_analyzer().polarity_scores(tweet)
    return sentiment_dict['neu'], sentiment_dict['compound']

# async def expand_short_url_async(short_url):
#     async with httpx.AsyncClient() as client:
#         try:
#             with timeout(os.environ['URL_EXPANDER_TIMEOUT'], exception=TimeoutError):  # Timeout after 5 seconds
#                 response = await client.head(short_url)
#                 expanded_url = str(response.url)
#                 return expanded_url
#         except TimeoutError as e:
#             print(f"Request for {short_url} took too long and was interrupted.")
#             return None
#         except Exception as e:
#             print(f"Error: {e}")
#             return None


# async def get_number_of_web_links(tweet):
#     url_list = re.findall("(?P<url>https?://[^\s]+)", tweet)
#     url_count = 0

#     tasks = [expand_short_url_async(short_url) for short_url in url_list]
#     expanded_urls = await asyncio.gather(*tasks)

#     # print(type(expanded_urls), type(expanded_urls[0]))

#     for expanded_url in expanded_urls:
#         if expanded_url:
#             if 'twitter' in expanded_url and ('photo' in expanded_url or 'video' in expanded_url):
#                 continue
#             elif 'twitter' in expanded_url and 'status' in expanded_url:
#                 continue
#             else:
#                 url_count+=1
#         else:
#             continue

#     return url_count

def get_number_of_web_links(tweet):
    """Count the http(s) URLs that appear in a tweet.

    A URL is ``http://`` or ``https://`` followed by every character up to the
    next whitespace.

    NOTE: links are counted exactly as written. They are not expanded or
    fetched, so shortened links that point to Twitter photos, videos or
    statuses are counted like any other link.

    Args:
        tweet: The tweet text.

    Returns:
        The number of URLs found.
    """
    # Raw string: "\s" inside a normal literal is an invalid escape sequence.
    url_list = re.findall(r"(?P<url>https?://[^\s]+)", tweet)
    return len(url_list)


def get_cosine_similarity(tweets):
    """Return the mean pairwise TF-IDF cosine similarity between tweets.

    Every ordered pair of *different* tweets contributes once; a tweet's
    similarity with itself is excluded.

    Args:
        tweets: Sequence of tweet strings.

    Returns:
        The average off-diagonal cosine similarity, or ``0.0`` when there are
        fewer than two tweets (no pair exists to compare).
    """
    num_tweets = len(tweets)
    # With zero or one tweet the denominator below would be 0.
    if num_tweets < 2:
        return 0.0

    tf_idf_matrix = TfidfVectorizer().fit_transform(tweets)
    cosine_similarities = cosine_similarity(tf_idf_matrix, tf_idf_matrix)

    # Subtract the real diagonal rather than assuming it sums to num_tweets:
    # a tweet whose TF-IDF vector is all zeros has self-similarity 0, not 1.
    off_diagonal_total = cosine_similarities.sum() - cosine_similarities.diagonal().sum()
    return off_diagonal_total / (num_tweets * (num_tweets - 1))


def get_handcrafted_features(tweets):
    """Compute the hand-crafted features for one user's tweets.

    Returns an 8-tuple: the cosine similarity value for the whole set
    (``0`` when ``tweets`` is empty) followed by seven per-tweet lists, in
    this order: emoji counts, hashtag counts, semicolon counts, word counts,
    compound sentiment scores, neutral sentiment scores and web-link counts.
    """
    similarity = get_cosine_similarity(tweets) if len(tweets) != 0 else 0

    emojis, hashtags, semicolons, lengths = [], [], [], []
    compounds, neutrals, weblinks = [], [], []

    for text in tweets:
        emojis.append(get_emoji_count(text))
        hashtags.append(get_hashtag_count(text))
        semicolons.append(str(text).count(';'))
        # Length is measured in whitespace-separated tokens.
        lengths.append(len(text.split()))
        # get_sentiment_score returns (neutral, compound), in that order.
        neutral, compound = get_sentiment_score(text)
        compounds.append(compound)
        neutrals.append(neutral)
        weblinks.append(get_number_of_web_links(text))

    return (similarity, emojis, hashtags, semicolons,
            lengths, compounds, neutrals, weblinks)

In [13]:
!pip install dotenv
!pip install python-dotenv
from dotenv import load_dotenv

Collecting dotenv
  Using cached dotenv-0.0.5.tar.gz (2.4 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:
def get_user_tweets(user_file_path):
    """Return the text of every ``<document>`` element in a user's XML file.

    Each element's text is passed through ``str``, so an element without text
    contributes the string ``'None'``.
    """
    document_nodes = ET.parse(user_file_path).getroot().iter('document')
    return [str(node.text) for node in document_nodes]

def process_data_to_csv(folder_path, lines, output_csv_name):
    """Append one feature row per user to a CSV file.

    Each entry of ``lines`` has the form ``<user_id>:::<label>``. For every
    user, the tweets are read from ``<folder_path>/<user_id>.xml``, embedded
    with BERT, averaged, and combined with the averaged hand-crafted features.

    Args:
        folder_path: Directory holding the per-user XML files.
        lines: Iterable of ``user_id:::label`` strings (e.g. an open file).
        output_csv_name: Path of the CSV file to append to.

    A user whose processing raises is reported on stdout and skipped, so one
    bad file does not abort the whole run.
    """
    header = ['user_id',
              'avg_tweet_encodings_joined',
              'avg_emoji',
              'avg_hashtags',
              'avg_semicolons',
              'avg_tweet_length',
              'avg_sentiment_compound',
              'avg_sentiment_neutral',
              'avg_num_weblink',
              'cosine_similarity_value',
              'label']

    # The file is opened in append mode, so only write the header when the
    # file is new or empty; otherwise every re-run would add a second header
    # row in the middle of the data.
    needs_header = (not os.path.exists(output_csv_name)
                    or os.path.getsize(output_csv_name) == 0)

    # Created lazily and reused: loading BERT once per user is very slow.
    embedding_generator = None

    # newline='' is what the csv module requires for files it writes to.
    with open(output_csv_name, 'a', newline='') as csv_file:
        writer_object = writer(csv_file)
        if needs_header:
            writer_object.writerow(header)

        for usr_count, line in enumerate(lines, start=1):
            print(usr_count)
            try:
                # strip() removes the trailing newline that would otherwise
                # end up inside the label column.
                details = line.strip().split(':::')
                user_id = details[0]
                label = details[1]
                user_file_path = folder_path + '/' + user_id + '.xml'

                tweets = get_user_tweets(user_file_path)
                if embedding_generator is None:
                    embedding_generator = EmbeddingGenerator()
                individual_tweet_encodings = embedding_generator.get_all_tweet_embeddings_individual(tweets)
                avg_tweet_encodings = np.mean(individual_tweet_encodings, axis=0)
                # Space-separated floats: the format load_from_csv parses.
                avg_tweet_encodings_joined = " ".join(map(str, avg_tweet_encodings))

                (cosine_similarity_value, num_emojis, num_hashtags, num_semicolons,
                 tweet_lengths, sentiment_score_compounds, sentiment_score_neutrals,
                 num_weblinks) = get_handcrafted_features(tweets)

                row = [str(user_id),
                       str(avg_tweet_encodings_joined),
                       str(np.array(num_emojis).mean()),
                       str(np.array(num_hashtags).mean()),
                       str(np.array(num_semicolons).mean()),
                       str(np.array(tweet_lengths).mean()),
                       str(np.array(sentiment_score_compounds).mean()),
                       str(np.array(sentiment_score_neutrals).mean()),
                       str(np.array(num_weblinks).mean()),
                       str(cosine_similarity_value),
                       str(label)]
                writer_object.writerow(row)
            except Exception as e:
                # Best effort: report the failing entry and continue.
                print('Error ocurred in ', line, e)

def load_train_dataset():
    """Build ``train.csv`` on Drive from the ``PAN/Train/en`` folder.

    Reads ``truth-train.txt`` from that folder and hands its lines to
    ``process_data_to_csv``.
    """
    folder_path = "/content/drive/MyDrive/PAN/Train/en"
    file_path = folder_path + '/truth-train.txt'
    # The context manager closes the truth file even if processing fails;
    # a bare open() left the handle open.
    with open(file_path, 'r') as lines:
        process_data_to_csv(folder_path, lines, '/content/drive/MyDrive/train.csv')

def load_test_dataset():
    """Build ``test.csv`` on Drive from the ``PAN/Test/en`` folder.

    Reads ``truth.txt`` from that folder and hands its lines to
    ``process_data_to_csv``.
    """
    folder_path = "/content/drive/MyDrive/PAN/Test/en"
    file_path = folder_path + '/truth.txt'
    # The context manager closes the truth file even if processing fails;
    # a bare open() left the handle open.
    with open(file_path, 'r') as lines:
        process_data_to_csv(folder_path, lines, '/content/drive/MyDrive/test.csv')

def load_pre_test_dataset():
    """Build ``test_pre.csv`` on Drive from the ``PAN/Val/en`` folder.

    Reads ``truth.txt`` from that folder and hands its lines to
    ``process_data_to_csv``.
    """
    folder_path = "/content/drive/MyDrive/PAN/Val/en"
    file_path = folder_path + '/truth.txt'
    # The context manager closes the truth file even if processing fails;
    # a bare open() left the handle open.
    with open(file_path, 'r') as lines:
        process_data_to_csv(folder_path, lines, '/content/drive/MyDrive/test_pre.csv')

def load_from_csv(csv_file_path):
    """Load a feature CSV into a feature matrix and a label vector.

    For every row, the space-separated ``avg_tweet_encodings_joined`` values
    come first, followed by all remaining columns except ``user_id`` and
    ``label``. A label equal to ``'bot'`` (after stripping whitespace) maps
    to 0; anything else maps to 1.

    Returns:
        ``(feature_set, labels)`` as NumPy arrays; their shapes are printed.
    """
    frame = pd.read_csv(csv_file_path, header=0)

    # Columns that are plain per-user scalars (everything but the three below).
    scalar_columns = list(frame.columns)
    for excluded in ('label', 'user_id', 'avg_tweet_encodings_joined'):
        scalar_columns.remove(excluded)

    rows = []
    targets = []
    for _, record in frame.iterrows():
        embedding = np.fromstring(record['avg_tweet_encodings_joined'], dtype=float, sep=' ')
        rows.append([*embedding, *np.array(record[scalar_columns])])
        targets.append(0 if str(record['label']).strip() == 'bot' else 1)

    feature_set = np.array(rows, dtype='float')
    labels = np.array(targets)

    print(feature_set.shape, labels.shape)
    return feature_set, labels

# Script entry point: build the three CSV files one after another, printing a
# progress marker before each dataset is processed.
if __name__ == "__main__":
    print('--loading training data')
    load_train_dataset()
    print('--loading testing data')
    load_test_dataset()
    print('--loading pre-testing data')
    load_pre_test_dataset()


--loading training data
1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


In [None]:
import xml.etree.ElementTree as ET
import numpy as np
from csv import writer
import pandas as pd

def get_user_tweets(user_file_path):
    """Yield the text of each ``<document>`` element in a user's XML file.

    The file is parsed incrementally and every element is cleared once its
    text has been yielded.

    Args:
        user_file_path: Path of the user's XML file.

    Yields:
        ``str(element.text)`` for each ``<document>`` element, in document
        order. An element without text yields the string ``'None'``, matching
        the ``ET.parse``-based definition earlier in this file, so downstream
        string operations never receive ``None``.
    """
    # Binary mode lets the XML parser decode the bytes itself instead of
    # relying on the platform's default text encoding.
    with open(user_file_path, 'rb') as file:
        # Only 'end' events are needed: an element's text is complete then.
        for _event, element in ET.iterparse(file, events=('end',)):
            if element.tag == 'document':
                yield str(element.text)
                element.clear()

def process_data_to_csv(folder_path, lines, output_csv_name):
    """Append one feature row per user to a CSV file.

    Each entry of ``lines`` has the form ``<user_id>:::<label>``. For every
    user, the tweets are read from ``<folder_path>/<user_id>.xml``, embedded
    with BERT, averaged, and combined with the averaged hand-crafted features.

    Args:
        folder_path: Directory holding the per-user XML files.
        lines: Iterable of ``user_id:::label`` strings (e.g. an open file).
        output_csv_name: Path of the CSV file to append to.

    A user whose processing raises is reported on stdout and skipped, so one
    bad file does not abort the whole run.
    """
    headers = ['user_id', 'avg_tweet_encodings_joined', 'avg_emoji', 'avg_hashtags',
               'avg_semicolons', 'avg_tweet_length', 'avg_sentiment_compound',
               'avg_sentiment_neutral', 'avg_num_weblink', 'cosine_similarity_value', 'label']

    # The file is opened in append mode, so only write the header when the
    # file is new or empty; otherwise every re-run would add a second header
    # row in the middle of the data.
    needs_header = (not os.path.exists(output_csv_name)
                    or os.path.getsize(output_csv_name) == 0)

    # Created lazily and reused: loading BERT once per user is very slow.
    embedding_generator = None

    # newline='' is what the csv module requires for files it writes to.
    with open(output_csv_name, 'a', newline='') as csv_file:
        writer_object = writer(csv_file)
        if needs_header:
            writer_object.writerow(headers)

        for usr_count, line in enumerate(lines, start=1):
            print(usr_count)
            try:
                user_id, label = line.strip().split(':::')
                user_file_path = f"{folder_path}/{user_id}.xml"

                tweets = list(get_user_tweets(user_file_path))
                if embedding_generator is None:
                    embedding_generator = EmbeddingGenerator()
                individual_tweet_encodings = embedding_generator.get_all_tweet_embeddings_individual(tweets)
                avg_tweet_encodings = np.mean(individual_tweet_encodings, axis=0)
                # Write the vector as space-separated floats. Passing the array
                # itself would store its bracketed, line-wrapped repr, which
                # load_from_csv (np.fromstring with sep=' ') cannot parse.
                avg_tweet_encodings_joined = " ".join(map(str, avg_tweet_encodings))

                (cosine_similarity_value, num_emojis, num_hashtags, num_semicolons,
                 tweet_lengths, sentiment_score_compounds, sentiment_score_neutrals,
                 num_weblinks) = get_handcrafted_features(tweets)

                row = [user_id, avg_tweet_encodings_joined, np.mean(num_emojis), np.mean(num_hashtags),
                       np.mean(num_semicolons), np.mean(tweet_lengths), np.mean(sentiment_score_compounds),
                       np.mean(sentiment_score_neutrals), np.mean(num_weblinks), cosine_similarity_value, label]

                writer_object.writerow(row)
            except Exception as e:
                # Best effort: report the failing entry and continue.
                print(f'Error occurred in {line}: {e}')

def load_train_dataset():
    """Build ``train.csv`` on Drive from ``truth-train.txt`` in ``PAN/Train/en``."""
    folder_path = "/content/drive/MyDrive/PAN/Train/en"
    output_csv = '/content/drive/MyDrive/train.csv'
    with open(f"{folder_path}/truth-train.txt") as truth_file:
        process_data_to_csv(folder_path, truth_file, output_csv)

def load_test_dataset():
    """Build ``test.csv`` on Drive from ``truth.txt`` in ``PAN/Test/en``."""
    folder_path = "/content/drive/MyDrive/PAN/Test/en"
    output_csv = '/content/drive/MyDrive/test.csv'
    with open(f"{folder_path}/truth.txt") as truth_file:
        process_data_to_csv(folder_path, truth_file, output_csv)

def load_pre_test_dataset():
    """Build ``test_pre.csv`` on Drive from ``truth.txt`` in ``PAN/Val/en``."""
    folder_path = "/content/drive/MyDrive/PAN/Val/en"
    output_csv = '/content/drive/MyDrive/test_pre.csv'
    with open(f"{folder_path}/truth.txt") as truth_file:
        process_data_to_csv(folder_path, truth_file, output_csv)

def load_from_csv(csv_file_path):
    """Load a feature CSV into a feature matrix and a label vector.

    For every row, the space-separated ``avg_tweet_encodings_joined`` values
    come first, followed by all remaining columns except ``user_id`` and
    ``label``. A label equal to ``'bot'`` (after stripping whitespace) maps
    to 0; anything else maps to 1.

    Returns:
        ``(feature_set, labels)`` as NumPy arrays; their shapes are printed.
    """
    frame = pd.read_csv(csv_file_path, header=0)

    # Columns that are plain per-user scalars (everything but the three below).
    scalar_columns = list(frame.columns)
    for excluded in ('label', 'user_id', 'avg_tweet_encodings_joined'):
        scalar_columns.remove(excluded)

    collected_features = []
    collected_labels = []
    for _, record in frame.iterrows():
        embedding = np.fromstring(record['avg_tweet_encodings_joined'], dtype=float, sep=' ')
        scalars = np.array(record[scalar_columns])
        combined = np.concatenate([embedding, scalars])

        label_text = record['label'].strip()
        if label_text == 'bot':
            collected_labels.append(0)
        else:
            collected_labels.append(1)
        collected_features.append(combined)

    feature_set = np.array(collected_features, dtype='float')
    labels = np.array(collected_labels)
    print(feature_set.shape, labels.shape)
    return feature_set, labels

# Script entry point: build the three CSV files one after another, printing a
# progress marker before each dataset is processed.
if __name__ == "__main__":
    print('--loading training data')
    load_train_dataset()
    print('--loading testing data')
    load_test_dataset()
    print('--loading pre-testing data')
    load_pre_test_dataset()