In [None]:
from apify_client import ApifyClient
import pandas as pd
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Initialize the ApifyClient with your API token
client = ApifyClient(os.getenv("APIFY_API_KEY"))

# Prepare the Actor input
run_input = {
    "usernames": ["daixfit", "sennawhitemanfit", "haytayfitness", "hanns__fitness", "poppyyconnor"],
    "max_count": 500,
}

# Run the Actor and wait for it to finish
run = client.actor("8dqiL379xy0Ldrhdr").call(run_input=run_input)

# Fetch Actor results and store in a list
data = []
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    data.append(item)

# Pandas dataframe for the global followers tracker
global_followers_df = pd.DataFrame(data)

# Create pandas DataFrame from the collected data
scraped_followers_df = pd.DataFrame(data)

# Drop unnecessary columns
scraped_followers_df = scraped_followers_df.drop(columns=['profile_pic_url', 'latest_story_ts', 'is_verified', 'is_private'], errors='ignore')

[36m[apify.instagram-followers-scraper runId:rZVXTxe2UBxctQzXG][0m -> Status: RUNNING, Message: 
[36m[apify.instagram-followers-scraper runId:rZVXTxe2UBxctQzXG][0m -> 2025-09-20T14:45:55.902Z ACTOR: Pulling Docker image of build PNrmGgW0wcuO7F8t3 from registry.
[36m[apify.instagram-followers-scraper runId:rZVXTxe2UBxctQzXG][0m -> 2025-09-20T14:45:55.904Z ACTOR: Creating Docker container.
[36m[apify.instagram-followers-scraper runId:rZVXTxe2UBxctQzXG][0m -> 2025-09-20T14:45:55.963Z ACTOR: Starting Docker container.
[36m[apify.instagram-followers-scraper runId:rZVXTxe2UBxctQzXG][0m -> 2025-09-20T14:45:57.175Z Actor is running on the Apify platform, `disable_browser_sandbox` was changed to True.
[36m[apify.instagram-followers-scraper runId:rZVXTxe2UBxctQzXG][0m -> 2025-09-20T14:45:57.337Z [90m[apify][0m [32mINFO [0m Initializing Actor...
[36m[apify.instagram-followers-scraper runId:rZVXTxe2UBxctQzXG][0m -> 2025-09-20T14:45:57.340Z [90m[apify][0m [32mINFO [0m System i

In [2]:
display(scraped_followers_df)

Unnamed: 0,full_name,id,username,follower_of
0,,4401400833,mo_lifts_,haytayfitness
1,Kaio Presley Almeida Lima,3295553877,kaio_presley,haytayfitness
2,Fynlay B,28678284278,fynlaybeck,haytayfitness
3,Мария Маринова,60897900916,migal7363,haytayfitness
4,Old Force,58757916721,oldforcethrash,haytayfitness
...,...,...,...,...
2177,George,6098902266,g3org3w,poppyyconnor
2178,Tegan Jade Arnold,220628114,angelzfall,poppyyconnor
2179,,75205198170,samesameswim,poppyyconnor
2180,Jordan Isergin,8029782977,jordan.f30,poppyyconnor


In [3]:
import gender_guesser.detector as gender
import re
from typing import Optional, List, Dict, Any

# Initialize the gender detector
detector = gender.Detector(case_sensitive=False)

def guess_gender_robust(username: str, full_name: Optional[str] = None) -> str:
    """
    Robust gender detection function that tries multiple strategies.
    
    Args:
        username: Instagram username
        full_name: Full name from profile (optional)
    
    Returns:
        'male', 'female', or 'unknown'
    """
    
    def extract_names(text: str) -> List[str]:
        """Extract potential names from text, handling various formats."""
        if not text:
            return []
        
        # Remove common prefixes and suffixes
        cleaned = re.sub(r'(^(mrs?|ms|dr|prof|sir|lady|miss)\.?\s+)|(\d+|_+|\.+)', '', text, flags=re.IGNORECASE)
        
        # Split by common separators and extract alphabetic sequences
        parts = re.split(r'[_\.\-\s\d]+', cleaned)
        names = []
        
        for part in parts:
            # Extract alphabetic sequences of reasonable length (2-20 chars)
            name_matches = re.findall(r'[A-Za-z]{2,20}', part)
            names.extend(name_matches)
        
        # Exclude common non-name words but keep gender-indicating titles
        excluded_words = {
            'the', 'and', 'official', 'real', 'true', 'page', 'account', 'profile',
            'fitness', 'gym', 'workout', 'life', 'love', 'style', 'blog', 'shop'
        }
        
        return [name for name in names if name.lower() not in excluded_words and len(name) >= 2]
    
    def check_gender_keywords(text: str) -> str:
        """Check for gender-indicating keywords in text."""
        if not text:
            return 'unknown'
        
        text_lower = text.lower()
        
        # Male-indicating words
        male_keywords = ['king', 'prince', 'sir', 'mr', 'lord', 'duke']
        # Female-indicating words  
        female_keywords = ['queen', 'princess', 'lady', 'mrs', 'ms', 'miss', 'duchess']
        
        for keyword in male_keywords:
            if keyword in text_lower:
                return 'male'
                
        for keyword in female_keywords:
            if keyword in text_lower:
                return 'female'
                
        return 'unknown'
    
    def classify_gender(gender_result: str) -> str:
        """Classify gender_guesser results into male/female/unknown."""
        if gender_result in ['male', 'mostly_male']:
            return 'male'
        elif gender_result in ['female', 'mostly_female']:
            return 'female'
        else:
            return 'unknown'
    
    # Strategy 1: Check for gender keywords first (in both username and full_name)
    for text in [full_name, username]:
        keyword_result = check_gender_keywords(text)
        if keyword_result != 'unknown':
            return keyword_result
    
    # Strategy 2: Try full_name with name detection
    if full_name:
        names = extract_names(full_name)
        for name in names:
            result = detector.get_gender(name)
            classified = classify_gender(result)
            if classified != 'unknown':
                return classified
    
    # Strategy 3: Try username with name detection
    if username:
        names = extract_names(username)
        for name in names:
            result = detector.get_gender(name)
            classified = classify_gender(result)
            if classified != 'unknown':
                return classified
    
    return 'unknown'

# Apply gender detection to the existing dataframe
print("Applying gender detection to accounts...")
scraped_followers_df['detected_gender'] = scraped_followers_df.apply(
    lambda row: guess_gender_robust(
        row.get('username', ''), 
        row.get('full_name') or row.get('fullname')
    ), 
    axis=1
)

# Display gender distribution before filtering
print("\nGender Distribution (before filtering):")
print(scraped_followers_df['detected_gender'].value_counts())

# Drop profiles with unknown gender
initial_count = len(scraped_followers_df)
scraped_followers_df = scraped_followers_df[scraped_followers_df['detected_gender'] != 'unknown'].copy()
filtered_count = len(scraped_followers_df)

print(f"\nFiltering Results:")
print(f"Initial profiles: {initial_count}")
print(f"Profiles after removing unknown gender: {filtered_count}")
print(f"Removed profiles: {initial_count - filtered_count}")

# Display final gender distribution
print("\nFinal Gender Distribution:")
print(scraped_followers_df['detected_gender'].value_counts())

# Create filtered dataframes for different genders
male_df = scraped_followers_df[scraped_followers_df['detected_gender'] == 'male'].copy()
female_df = scraped_followers_df[scraped_followers_df['detected_gender'] == 'female'].copy()

print(f"\nFiltered Results:")
print(f"Male accounts: {len(male_df)}")
print(f"Female accounts: {len(female_df)}")

# The scraped_followers_df now only contains profiles with detected gender
complete_df = scraped_followers_df.copy()

print(f"\nComplete dataframe shape: {complete_df.shape}")
print(f"Columns: {list(complete_df.columns)}")

# Display sample results
print("\nSample results:")
sample_df = complete_df[['username', 'full_name', 'detected_gender']].head(10)
for _, row in sample_df.iterrows():
    print(f"Username: {row['username']:<20} | Full Name: {str(row['full_name']):<20} | Gender: {row['detected_gender']}")


Applying gender detection to accounts...

Gender Distribution (before filtering):
detected_gender
unknown    971
male       845
female     366
Name: count, dtype: int64

Filtering Results:
Initial profiles: 2182
Profiles after removing unknown gender: 1211
Removed profiles: 971

Final Gender Distribution:
detected_gender
male      845
female    366
Name: count, dtype: int64

Filtered Results:
Male accounts: 845
Female accounts: 366

Complete dataframe shape: (1211, 5)
Columns: ['full_name', 'id', 'username', 'follower_of', 'detected_gender']

Sample results:
Username: kaio_presley         | Full Name: Kaio Presley Almeida Lima | Gender: female
Username: kyle_dodd1998        | Full Name: Kyle Dodd            | Gender: female
Username: luketav              | Full Name: Luke                 | Gender: male
Username: javieregm            | Full Name: Javier Elías Gonzalez Montenegro | Gender: male
Username: kingsleyjon_         | Full Name: Kingsley SJ          | Gender: male
Username: hone

In [4]:
display(scraped_followers_df)

Unnamed: 0,full_name,id,username,follower_of,detected_gender
1,Kaio Presley Almeida Lima,3295553877,kaio_presley,haytayfitness,female
6,Kyle Dodd,1063684108,kyle_dodd1998,haytayfitness,female
9,Luke,192333588,luketav,haytayfitness,male
10,Javier Elías Gonzalez Montenegro,51390449,javieregm,haytayfitness,male
11,Kingsley SJ,52745179747,kingsleyjon_,haytayfitness,male
...,...,...,...,...,...
2174,Lukas Walters,9553513077,yothats_lukas,poppyyconnor,male
2175,Jonis Alexander Al Bearmani,1379998297,ealbearmani,poppyyconnor,male
2177,George,6098902266,g3org3w,poppyyconnor,male
2178,Tegan Jade Arnold,220628114,angelzfall,poppyyconnor,female


In [6]:
# Export the filtered dataframe to CSV
scraped_followers_df.to_csv('scraped_followers_with_gender.csv', index=False)
print(f"Exported {len(scraped_followers_df)} records to 'scraped_followers_with_gender.csv'")

# Also export separate CSV files for male and female followers
male_df.to_csv('male_followers.csv', index=False)
female_df.to_csv('female_followers.csv', index=False)

print(f"Exported {len(male_df)} male followers to 'male_followers.csv'")
print(f"Exported {len(female_df)} female followers to 'female_followers.csv'")

Exported 1211 records to 'scraped_followers_with_gender.csv'
Exported 845 male followers to 'male_followers.csv'
Exported 366 female followers to 'female_followers.csv'
