In [3]:
pip install --upgrade torchvision

[33mDEPRECATION: Loading egg at /opt/anaconda3/lib/python3.11/site-packages/dlib-19.24.99-py3.11-macosx-11.1-arm64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import os

In [5]:
import os
import pandas as pd

def compile_unique_user_csvs(folder_path, output_filename='compiled_output.csv'):
    """
    Combine all CSV files in a folder into one DataFrame, removing duplicate usernames.

    Args:
        folder_path (str): Path to the folder containing CSV files.
        output_filename (str): Name of the output CSV file to save in the same folder.

    Returns:
        pd.DataFrame: Combined and deduplicated DataFrame.
    """
    all_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    all_dfs = []

    for file in sorted(all_files):
        full_path = os.path.join(folder_path, file)
        try:
            df = pd.read_csv(full_path)
            df['source_file'] = file  # Optional: track file origin
            all_dfs.append(df)
        except Exception as e:
            print(f"Failed to read {file}: {e}")

    if not all_dfs:
        print("No CSV files found or all failed to load.")
        return pd.DataFrame()

    combined_df = pd.concat(all_dfs, ignore_index=True)

    if 'username' in combined_df.columns:
        before = len(combined_df)
        combined_df.drop_duplicates(subset='username', keep='first', inplace=True)
        after = len(combined_df)
        print(f"Removed {before - after} duplicate usernames.")
    else:
        print("Warning: 'username' column not found; no deduplication applied.")

    output_path = os.path.join(folder_path, output_filename)
    combined_df.to_csv(output_path, index=False)
    print(f"Combined CSV saved to: {output_path}")

    return combined_df


In [6]:
def filter_unknown_gender_with_avatar(df):
    """
    Filter DataFrame for rows where gender is 'unknown' and avatar is not null.

    Args:
        df (pd.DataFrame): Input DataFrame with 'gender' and 'avatar' columns.

    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    filtered_df = df[(df['gender'] == 'unknown') & (df['avatar'].notnull())]
    print(f"Filtered down to {len(filtered_df)} rows with unknown gender and valid avatar.")
    return filtered_df

In [21]:
import os
import requests
import pandas as pd
from PIL import Image
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# -------- 1. Dataset class --------
class GenderImageDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform
        self.label_encoder = LabelEncoder()
        self.df['gender_encoded'] = self.label_encoder.fit_transform(df['new_gender'])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        url = self.df.loc[idx, 'avatar']
        label = self.df.loc[idx, 'gender_encoded']
        try:
            response = requests.get(url, timeout=10)
            image = Image.open(BytesIO(response.content)).convert('RGB')
            if self.transform:
                image = self.transform(image)
            return image, torch.tensor(label)
        except Exception as e:
            print(f"Error loading image from {url}: {e}")
            return torch.zeros(3, 224, 224), torch.tensor(label)  # return black image if fail

# -------- 2. Training pipeline --------
def train_gender_classifier(df, batch_size=16, epochs=5, lr=1e-4):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])

    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['new_gender'], random_state=42)
    train_dataset = GenderImageDataset(train_df, transform)
    val_dataset = GenderImageDataset(val_df, transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Use pretrained model
    model = models.resnet18(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, 2)  # Binary classification

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for images, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1} loss: {running_loss / len(train_loader):.4f}")

    print("Training complete.")
    return model, train_dataset.label_encoder


In [None]:
folder_path = r'../outputs'
output_filename = r'../outputs/compiled_output.csv'
df = compile_unique_user_csvs(folder_path, output_filename)
filtered_df = filter_unknown_gender_with_avatar(df)
filtered_df.to_csv(r'../outputs/filtered_df.csv')

Failed to read user_attributes_1005.csv: No columns to parse from file
Failed to read user_attributes_1006.csv: No columns to parse from file
Failed to read user_attributes_1007.csv: No columns to parse from file
Failed to read user_attributes_1014.csv: No columns to parse from file
Failed to read user_attributes_1015.csv: No columns to parse from file
Failed to read user_attributes_1016.csv: No columns to parse from file
Failed to read user_attributes_1017.csv: No columns to parse from file
Failed to read user_attributes_1018.csv: No columns to parse from file
Failed to read user_attributes_1019.csv: No columns to parse from file
Failed to read user_attributes_1020.csv: No columns to parse from file
Failed to read user_attributes_1021.csv: No columns to parse from file
Failed to read user_attributes_1022.csv: No columns to parse from file
Failed to read user_attributes_1023.csv: No columns to parse from file
Failed to read user_attributes_1024.csv: No columns to parse from file
Failed

In [None]:
filtered_df_labeled_original = pd.read_csv(r'../outputs/filtered_df_labeled_original.csv')
filtered_df_labeled_original_1 = filtered_df_labeled_original.dropna(subset=['new_gender'])


In [22]:
model, label_encoder = train_gender_classifier(filtered_df_labeled_original_1, batch_size=16, epochs=5, lr=1e-4)    



Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /Users/chestergarettcalingacion/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:14<00:00, 3.24MB/s]
Epoch 1/5: 100%|██████████| 5/5 [03:06<00:00, 37.27s/it]


Epoch 1 loss: 0.8583


Epoch 2/5: 100%|██████████| 5/5 [00:46<00:00,  9.21s/it]


Epoch 2 loss: 0.4172


Epoch 3/5: 100%|██████████| 5/5 [00:45<00:00,  9.03s/it]


Epoch 3 loss: 0.2411


Epoch 4/5: 100%|██████████| 5/5 [00:49<00:00,  9.97s/it]


Epoch 4 loss: 0.1286


Epoch 5/5: 100%|██████████| 5/5 [01:43<00:00, 20.60s/it]

Epoch 5 loss: 0.1103
Training complete.





In [23]:
import joblib
import torch

# Save PyTorch model
torch.save(model.state_dict(), r'../pickles/gender_classifier.pth')

# Save LabelEncoder
joblib.dump(label_encoder, r'../pickles/label_encoder.pkl')


['../pickles/label_encoder.pkl']

In [26]:
import torch
import joblib

# Define the same model architecture used during training
class GenderClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(GenderClassifier, self).__init__()
        self.base = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        in_features = self.base.fc.in_features
        self.base.fc = nn.Linear(in_features, num_classes)

    def forward(self, x):
        return self.base(x)

# Load LabelEncoder
le = joblib.load(r'../pickles/label_encoder.pkl')
# Load ResNet18 model directly
model = models.resnet18()
in_features = model.fc.in_features
model.fc = nn.Linear(in_features, len(le.classes_))  # Adjust output layer
model.load_state_dict(torch.load(r'../pickles/gender_classifier.pth', map_location=torch.device('cpu')))
model.eval()


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [28]:
def predict_gender_from_url(url, model, label_encoder):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()
    model.to(device)

    try:
        response = requests.get(url, timeout=10)
        image = Image.open(BytesIO(response.content)).convert('RGB')
        image_tensor = transform(image).unsqueeze(0).to(device)

        with torch.no_grad():
            output = model(image_tensor)
            probs = torch.nn.functional.softmax(output, dim=1)
            pred_idx = torch.argmax(probs, dim=1).item()
            pred_label = label_encoder.inverse_transform([pred_idx])[0]
            confidence = probs[0][pred_idx].item()

        return {
            'predicted_gender': pred_label,
            'confidence': round(confidence, 4)
        }
    except Exception as e:
        return {
            'error': str(e),
            'predicted_gender': 'unknown',
            'confidence': 0.0
        }

In [38]:
filtered_df_labeled_original_2 = filtered_df_labeled_original

for idx, row in filtered_df_labeled_original_2.iterrows():
    url = row['avatar']  # Replace with the correct column name if different
    result = predict_gender_from_url(url, model, label_encoder)
    filtered_df_labeled_original_2.loc[idx, 'predicted_gender'] = result['predicted_gender']
    filtered_df_labeled_original_2.loc[idx, 'confidence'] = result['confidence']




In [39]:
filtered_df_labeled_original_2.to_csv(r'../outputs/gender_predictions.csv')