# Twitter Bot Account Classification with CLIP


This notebook demonstrates how to use OpenAI CLIP, for classifying Twitter accounts into binary classes based on multiple features.

We'll install dependencies, define our model configuration, train the model, and visualize performance metrics.

In [1]:
#!nvidia-smi -L

In [2]:
# Disable GPU

#import os

#os.environ["CUDA_VISIBLE_DEVICES"]="GPU-XXXXX" # MIG-GPU-XXXXX

#os.environ["CUDA_VISIBLE_DEVICES"]=""

In [3]:
# Install required packages
!pip install transformers
!pip install torch torchvision
!pip install scikit-learn



In [4]:
# Import libraries
import torch
from transformers import CLIPProcessor, CLIPModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from PIL import Image

In [5]:
# Load the CLIP model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [6]:
# Load the Twitter dataset

dataset_df = pd.read_csv("/kaggle/input/twitter-human-bots-dataset/twitter_human_bots_dataset.csv")


# Append full path to image data

dataset_df['profile_image_path'] = '/kaggle/input/twitter-human-bots-dataset/' + dataset_df['profile_image_path']


# Display the first few rows of the dataset

dataset_df.head()

Unnamed: 0.1,Unnamed: 0,created_at,default_profile,default_profile_image,description,favourites_count,followers_count,friends_count,geo_enabled,id,...,profile_image_url,screen_name,statuses_count,verified,average_tweets_per_day,account_age_days,account_type,profile_image_path,profile_background_image_path,split
0,0,2009-07-04 22:41:51,False,False,,5007,102,168,False,53779179,...,https://pbs.twimg.com/profile_images/128969130...,paty_castroo,3974,False,0.978,4063,bot,,,0
1,1,2010-01-17 22:54:19,False,False,Television producer. Emmy Award winner. Disney...,1038,60,128,False,105916557,...,https://pbs.twimg.com/profile_images/632916759...,CBirckner,259,False,0.067,3866,human,/kaggle/input/twitter-human-bots-dataset/profi...,profile_images/4691ae58668d932bb7e7af22141aecf...,0
2,2,2012-03-01 06:05:32,False,False,,2257,599,449,True,509788597,...,https://pbs.twimg.com/profile_images/128853844...,amf_jay,13627,False,4.406,3093,human,,,0
3,3,2009-09-01 04:52:30,False,False,,6407,116,334,True,70601327,...,https://pbs.twimg.com/profile_images/977012905...,SaraCavolo,4432,False,1.107,4005,human,/kaggle/input/twitter-human-bots-dataset/profi...,profile_images/d1da9220e4aa376dff03b6f12765171...,0
4,4,2010-01-27 17:17:23,False,False,Productor de Televisión - Embajador de @Tienda...,20866,74448,18,True,108999927,...,https://pbs.twimg.com/profile_images/668449819...,DavidHenaoModel,15870,True,4.116,3856,human,/kaggle/input/twitter-human-bots-dataset/profi...,profile_images/c65bf625b066ef5f29e14cfcf1ee91f...,0


In [7]:
from tqdm import tqdm
import numpy as np

# Function to get CLIP embeddings with truncation for text, in batches with a progress bar
def get_clip_embeddings(texts, images, batch_size=32):
    all_embeddings = []
    
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing Embeddings"):
        # Process each batch
        batch_texts = texts[i:i + batch_size]
        batch_images = images[i:i + batch_size]
        
        # Preprocess text and images separately
        text_inputs = processor(text=batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=77).to(device)
        image_inputs = processor(images=batch_images, return_tensors="pt").to(device)

        # Get embeddings from CLIP for the batch
        text_embeddings = model.get_text_features(**text_inputs)
        image_embeddings = model.get_image_features(**image_inputs)
        
        # Combine text and image embeddings (e.g., by summing or concatenating)
        batch_embeddings = text_embeddings + image_embeddings
        all_embeddings.append(batch_embeddings.cpu().detach().numpy())

    # Concatenate all batch embeddings into a single array
    return np.concatenate(all_embeddings, axis=0)

In [8]:
from PIL import Image, UnidentifiedImageError
from tqdm import tqdm

# Load and preprocess images with a progress bar, converting to RGBA if needed, resizing to 224x224
def load_images(image_paths):
    images = []
    for path in tqdm(image_paths, desc="Loading images"):
        try:
            # Check if the path is a valid string
            if isinstance(path, str):
                img = Image.open(path)
                
                # Convert palette images with transparency to RGBA
                if img.mode == "P" or img.mode == "LA":
                    img = img.convert("RGBA")
                
                # Resize to a standard size for CLIP (e.g., 224x224)
                img = img.resize((224, 224))
                images.append(img)
            else:
                # Append a blank image if the path is invalid
                images.append(Image.new("RGB", (224, 224), color="white"))
        
        # Handle errors (e.g., file not found or unreadable image)
        except (FileNotFoundError, UnidentifiedImageError):
            images.append(Image.new("RGB", (224, 224), color="white"))
            
    return images

# Clean and truncate text
dataset_df['description'] = dataset_df['description'].fillna("")
sample_texts = dataset_df['description'].astype(str).tolist()
sample_images = load_images(dataset_df['profile_image_path'])

# Now, generate the embeddings
embeddings = get_clip_embeddings(sample_texts, sample_images)

Loading images: 100%|██████████| 37438/37438 [03:57<00:00, 157.87it/s]
Processing Embeddings: 100%|██████████| 1170/1170 [03:17<00:00,  5.93it/s]


In [9]:
# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, dataset_df['account_type'], test_size=0.2, random_state=42
)

# Train a classifier (e.g., Logistic Regression)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

         bot       0.68      0.62      0.65      2455
       human       0.82      0.86      0.84      5033

    accuracy                           0.78      7488
   macro avg       0.75      0.74      0.74      7488
weighted avg       0.78      0.78      0.78      7488

