Gender Identification

In [27]:
import pandas as pd
import random
import joblib
import os
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import Counter

In [1]:
def gender_features(name):
    name = name.lower()

    features = {}

    # BASIC FEATURES (keep existing logic)    
    features["first_letter"] = name[0]
    features["last_letter"] = name[-1]
    features["name_length"] = len(name)

    features["ends_with_vowel"] = name[-1] in "aeiou"
    features["ends_with_consonant"] = name[-1] not in "aeiou"
    features["starts_with_vowel"] = name[0] in "aeiou"

    features["last_2"] = name[-2:] if len(name) >= 2 else name
    features["last_3"] = name[-3:] if len(name) >= 3 else name
    
    # INDIAN-SPECIFIC SUFFIX FEATURES    
    female_suffixes = (
        "a", "i", "aa", "ya", "ni", "ika", "ini", "thi", "thra",
        "mitha", "shree", "rani", "latha", "vani", "sri"
    )

    male_suffixes = (
        "n", "an", "esh", "raj", "shan", "kar", "deep", "dev",
        "kumar", "th", "ran", "eshan"
    )

    for suf in female_suffixes:
        features[f"ends_with_female_{suf}"] = name.endswith(suf)

    for suf in male_suffixes:
        features[f"ends_with_male_{suf}"] = name.endswith(suf)

   
    # VOWEL STATISTICS    
    features["vowel_count"] = sum(1 for c in name if c in "aeiou")
    features["vowel_ratio"] = features["vowel_count"] / len(name)    
   
    # Using boundary markers improves learning
    padded = f"<{name}>"

    # 2-grams, 3-grams, 4-grams
    for n in (2, 3, 4):
        for i in range(len(padded) - n + 1):
            gram = padded[i:i+n]
            features[f"char_{n}gram_{gram}"] = True

    return features


In [None]:


# Load the CSV file
# Replace 'names.csv' with your actual CSV file path
df = pd.read_csv("sample_indian_names.csv")

# Map numeric gender labels to text labels
# 0 -> male, 1 -> female
df["Gender"] = df["Gender"].map({0: "male", 1: "female"})

# Convert to list of (name, gender) tuples
labeled_names = list(zip(df["Name"], df["Gender"]))

# Shuffle the combined data
random.shuffle(labeled_names)

labeled_names[:10]


[('Udayakumari', 'female'),
 ('Rekashan', 'male'),
 ('Charanraj', 'male'),
 ('Mariappan', 'female'),
 ('Layashya', 'female'),
 ('Kajany', 'female'),
 ('Tanavya', 'female'),
 ('Vidoran', 'male'),
 ('Dhavishi', 'female'),
 ('Tasadharen', 'male')]

In [None]:

# Feature extraction
featuresets = [(gender_features(name), gender) for (name, gender) in labeled_names]

# Shuffle once before splitting
random.shuffle(featuresets)


# Train / Test split (90% train, 10% test)
TOTAL_SIZE = len(featuresets)
TEST_SIZE = int(0.10 * TOTAL_SIZE)

test_set = featuresets[:TEST_SIZE]
train_val_set = featuresets[TEST_SIZE:]

print("Total samples:", TOTAL_SIZE)
print("Training+Validation samples:", len(train_val_set))
print("Test samples:", len(test_set))


# K-Fold Cross Validation on Training data
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

cv_accuracies = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_val_set), 1):

    # Split folds
    train_fold = [train_val_set[i] for i in train_idx]
    val_fold = [train_val_set[i] for i in val_idx]

    X_train = [f for f, y in train_fold]
    y_train = [y for f, y in train_fold]

    X_val = [f for f, y in val_fold]
    y_val = [y for f, y in val_fold]

    # Vectorize features
    vectorizer = DictVectorizer(sparse=True)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)

    # Logistic Regression model
    classifier = LogisticRegression(
        max_iter=1000,
        solver="liblinear",
        class_weight="balanced"
    )

    classifier.fit(X_train_vec, y_train)

    # Validate
    y_pred = classifier.predict(X_val_vec)
    accuracy = accuracy_score(y_val, y_pred)

    cv_accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")

print("\nAverage CV Accuracy:", sum(cv_accuracies) / K)


# Final model training on full training data

X_train_full = [f for f, y in train_val_set]
y_train_full = [y for f, y in train_val_set]

final_vectorizer = DictVectorizer(sparse=True)
X_train_full_vec = final_vectorizer.fit_transform(X_train_full)

final_classifier = LogisticRegression(
    max_iter=1000,
    solver="liblinear",
    class_weight="balanced"
)

final_classifier.fit(X_train_full_vec, y_train_full)


# Final evaluation on Test set
X_test = [f for f, y in test_set]
y_test = [y for f, y in test_set]

X_test_vec = final_vectorizer.transform(X_test)
y_test_pred = final_classifier.predict(X_test_vec)

test_accuracy = accuracy_score(y_test, y_test_pred)
print("Final Test Accuracy:", test_accuracy)


Total samples: 53982
Training+Validation samples: 48584
Test samples: 5398
Fold 1 Accuracy: 0.9305
Fold 2 Accuracy: 0.9290
Fold 3 Accuracy: 0.9223
Fold 4 Accuracy: 0.9357
Fold 5 Accuracy: 0.9307

Average CV Accuracy: 0.9296476429517723
Final Test Accuracy: 0.9284920340866988


In [30]:

# Create directory to store model artifacts
os.makedirs("model", exist_ok=True)

# Save trained model and vectorizer
joblib.dump(final_classifier, "model/gender_model.joblib")
joblib.dump(final_vectorizer, "model/vectorizer.joblib")

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [31]:
# Create test data with original names preserved
TEST_SIZE = len(test_set)

test_rows = labeled_names[:TEST_SIZE]

for i in range(20):
    name, gender = test_rows[i]
    print(f"{i+1}. Name: {name}, Label: {gender}")



1. Name: Udayakumari, Label: female
2. Name: Rekashan, Label: male
3. Name: Charanraj, Label: male
4. Name: Mariappan, Label: female
5. Name: Layashya, Label: female
6. Name: Kajany, Label: female
7. Name: Tanavya, Label: female
8. Name: Vidoran, Label: male
9. Name: Dhavishi, Label: female
10. Name: Tasadharen, Label: male
11. Name: Anjanappa, Label: male
12. Name: Shkear, Label: male
13. Name: Avatar, Label: male
14. Name: Yuvanesh, Label: male
15. Name: Raghunath, Label: male
16. Name: Pregassen, Label: male
17. Name: Kuvarshan, Label: male
18. Name: Rengaraj, Label: male
19. Name: Suryesh, Label: male
20. Name: Kanthen, Label: male


In [32]:
test_data = labeled_names[:TEST_SIZE]

# Convert test data to DataFrame
test_df = pd.DataFrame(test_data, columns=["Name", "Gender"])

# Optional: convert labels back to numeric if needed
# male -> 0, female -> 1
test_df["Gender"] = test_df["Gender"].map({"male": 0, "female": 1})

# Save to CSV
test_df.to_csv("test_dataset.csv", index=False)

print("Test dataset saved as test_dataset.csv")
print("Total test samples:", len(test_df))


Test dataset saved as test_dataset.csv
Total test samples: 5398


In [33]:
# Predict on test data
y_true = []
y_pred = []

X_test = []
y_test = []

for features, actual_gender in test_set:
    X_test.append(features)
    y_test.append(actual_gender)

# Vectorize test features
X_test_vec = final_vectorizer.transform(X_test)

# Predict
y_pred = final_classifier.predict(X_test_vec)
y_true = y_test


# Test accuracy
test_accuracy = accuracy_score(y_true, y_pred)
print("Test Accuracy:", test_accuracy)


# Confusion matrix (manual)
confusion = Counter()

for actual, predicted in zip(y_true, y_pred):
    confusion[(actual, predicted)] += 1

print("\nConfusion Matrix Counts:")
print("Actual Male -> Predicted Male:", confusion[("male", "male")])
print("Actual Male -> Predicted Female:", confusion[("male", "female")])
print("Actual Female -> Predicted Male:", confusion[("female", "male")])
print("Actual Female -> Predicted Female:", confusion[("female", "female")])


# Show few sample predictions
print("\nSample Predictions (Actual vs Predicted):")
for i in range(10):
    name_features, actual = test_set[i]
    pred = final_classifier.predict(
        final_vectorizer.transform([name_features])
    )[0]

    print(f"{i+1}. Actual: {actual}, Predicted: {pred}")


Test Accuracy: 0.9284920340866988

Confusion Matrix Counts:
Actual Male -> Predicted Male: 2665
Actual Male -> Predicted Female: 237
Actual Female -> Predicted Male: 149
Actual Female -> Predicted Female: 2347

Sample Predictions (Actual vs Predicted):
1. Actual: male, Predicted: male
2. Actual: female, Predicted: female
3. Actual: male, Predicted: female
4. Actual: male, Predicted: male
5. Actual: female, Predicted: female
6. Actual: female, Predicted: female
7. Actual: male, Predicted: male
8. Actual: male, Predicted: male
9. Actual: male, Predicted: male
10. Actual: male, Predicted: male


In [34]:
print("\nSample Predictions (Name | Actual | Predicted):")

for i in range(10):
    name, actual = test_data[i]          # name preserved
    features = gender_features(name)     # dict features

    # Vectorize (VERY IMPORTANT)
    features_vec = final_vectorizer.transform([features])

    # Predict
    predicted = final_classifier.predict(features_vec)[0]

    print(f"{i+1}. Name: {name}, Actual: {actual}, Predicted: {predicted}")



Sample Predictions (Name | Actual | Predicted):
1. Name: Udayakumari, Actual: female, Predicted: female
2. Name: Rekashan, Actual: male, Predicted: male
3. Name: Charanraj, Actual: male, Predicted: male
4. Name: Mariappan, Actual: female, Predicted: male
5. Name: Layashya, Actual: female, Predicted: female
6. Name: Kajany, Actual: female, Predicted: female
7. Name: Tanavya, Actual: female, Predicted: female
8. Name: Vidoran, Actual: male, Predicted: male
9. Name: Dhavishi, Actual: female, Predicted: female
10. Name: Tasadharen, Actual: male, Predicted: male


In [35]:
y_true = []
y_pred = []

for name, actual in test_data:
    features = gender_features(name)
    features_vec = final_vectorizer.transform([features])
    predicted = final_classifier.predict(features_vec)[0]

    y_true.append(actual)
    y_pred.append(predicted)

test_accuracy = accuracy_score(y_true, y_pred)
print("Test Accuracy:", test_accuracy)


Test Accuracy: 0.9562801037421267


In [36]:
def predict_gender_with_unisex(name, threshold=0.85, margin=0.65):
    features = gender_features(name)
    features_vec = final_vectorizer.transform([features])

    probs = final_classifier.predict_proba(features_vec)[0]
    classes = [str(c) for c in final_classifier.classes_]

    prob_dict = {cls: float(prob) for cls, prob in zip(classes, probs)}

    sorted_probs = sorted(prob_dict.items(), key=lambda x: x[1], reverse=True)
    top_label, top_prob = sorted_probs[0]
    second_label, second_prob = sorted_probs[1]

    if top_prob < threshold or (top_prob - second_prob) < margin:
        return {
            "name": name,
            "prediction": "Common",
            "confidence": top_prob,
            "probabilities": prob_dict
        }

    return {
        "name": name,
        "prediction": top_label,
        "confidence": top_prob,
        "probabilities": prob_dict
    }


In [37]:
BASE_DIR = os.getcwd()   # Notebook-safe base directory
print("BASE_DIR set to:", BASE_DIR)


BASE_DIR set to: d:\customapps\ECAT\Gender-Classification


In [38]:
def store_feedback(name, model_prediction, human_verdict, confidence):
    """
    Stores human feedback in feedback.csv
    """

    file_path = os.path.join(BASE_DIR, "feedback-identification.csv")

    row = pd.DataFrame(
        [[name, model_prediction, human_verdict, confidence]],
        columns=[
            "Name",
            "Model_Prediction",
            "Human_Verdict",
            "Confidence"
        ]
    )

    if os.path.exists(file_path):
        row.to_csv(file_path, mode="a", header=False, index=False)
    else:
        row.to_csv(file_path, index=False)

    print(f"Feedback saved at: {file_path}")


In [46]:
# Name input (single source of truth)
name_input = "juspreet"

# Model prediction
result = predict_gender_with_unisex(name_input)

# Human label
human_verdict = "female"

# Store feedback using the same name
store_feedback(
    name=name_input,
    model_prediction=result["prediction"],
    human_verdict=human_verdict,
    confidence=result["confidence"]
)


Feedback saved at: d:\customapps\ECAT\Gender-Classification\feedback-identification.csv


In [47]:
print(result)

{'name': 'juspreet', 'prediction': 'Common', 'confidence': 0.6869017220662444, 'probabilities': {'female': 0.31309827793375555, 'male': 0.6869017220662444}}
