Gender Identification

In [None]:
def gender_features(name):
    name = name.lower()

    features = {}

    # BASIC FEATURES (keep existing logic)    
    features["first_letter"] = name[0]
    features["last_letter"] = name[-1]
    features["name_length"] = len(name)

    features["ends_with_vowel"] = name[-1] in "aeiou"
    features["ends_with_consonant"] = name[-1] not in "aeiou"
    features["starts_with_vowel"] = name[0] in "aeiou"

    features["last_2"] = name[-2:] if len(name) >= 2 else name
    features["last_3"] = name[-3:] if len(name) >= 3 else name
    
    # INDIAN-SPECIFIC SUFFIX FEATURES    
    female_suffixes = (
        "a", "i", "aa", "ya", "ni", "ika", "ini", "thi", "thra",
        "mitha", "shree", "rani", "latha", "vani", "sri"
    )

    male_suffixes = (
        "n", "an", "esh", "raj", "shan", "kar", "deep", "dev",
        "kumar", "th", "ran", "eshan"
    )

    for suf in female_suffixes:
        features[f"ends_with_female_{suf}"] = name.endswith(suf)

    for suf in male_suffixes:
        features[f"ends_with_male_{suf}"] = name.endswith(suf)

   
    # VOWEL STATISTICS    
    features["vowel_count"] = sum(1 for c in name if c in "aeiou")
    features["vowel_ratio"] = features["vowel_count"] / len(name)    
   
    # Using boundary markers improves learning
    padded = f"<{name}>"

    # 2-grams, 3-grams, 4-grams
    for n in (2, 3, 4):
        for i in range(len(padded) - n + 1):
            gram = padded[i:i+n]
            features[f"char_{n}gram_{gram}"] = True

    return features


The returned dictionary, known as a **feature set,** maps from feature names to their values. Feature names are case-sensitive strings that typically provide a short human-readable description of the feature, as in the example `'last_letter'`. Feature values are values with simple types, such as booleans, numbers, and strings.

Now that we've defined a feature extractor, we need to prepare a list of examples and corresponding class labels.

In [25]:
import pandas as pd
import random

# Load the CSV file
# Replace 'names.csv' with your actual CSV file path
df = pd.read_csv("sample_indian_names.csv")

# Map numeric gender labels to text labels
# 0 -> male, 1 -> female
df["Gender"] = df["Gender"].map({0: "male", 1: "female"})

# Convert to list of (name, gender) tuples
labeled_names = list(zip(df["Name"], df["Gender"]))

# Shuffle the combined data
random.shuffle(labeled_names)

labeled_names[:10]


[('Chiranjivi', 'male'),
 ('Avanita', 'female'),
 ('Sivanushan', 'male'),
 ('Abeeshan', 'male'),
 ('Siddhanta', 'male'),
 ('Durwank', 'male'),
 ('Marutatmaj', 'male'),
 ('Amolika', 'female'),
 ('Perarasi', 'female'),
 ('Kartheeswari', 'female')]

Next, we use the feature extractor to process the names data, and divide the resulting list of feature sets into a **training set** and a **test set** The training set is used to train a Naive Bayes classifier.

In [None]:
import random
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# STEP 1: Feature extraction
featuresets = [(gender_features(name), gender) for (name, gender) in labeled_names]

# Shuffle once before splitting
random.shuffle(featuresets)


# Train / Test split (90% train, 10% test)
TOTAL_SIZE = len(featuresets)
TEST_SIZE = int(0.10 * TOTAL_SIZE)

test_set = featuresets[:TEST_SIZE]
train_val_set = featuresets[TEST_SIZE:]

print("Total samples:", TOTAL_SIZE)
print("Training+Validation samples:", len(train_val_set))
print("Test samples:", len(test_set))


# K-Fold Cross Validation on Training data
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

cv_accuracies = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_val_set), 1):

    # Split folds
    train_fold = [train_val_set[i] for i in train_idx]
    val_fold = [train_val_set[i] for i in val_idx]

    X_train = [f for f, y in train_fold]
    y_train = [y for f, y in train_fold]

    X_val = [f for f, y in val_fold]
    y_val = [y for f, y in val_fold]

    # Vectorize features
    vectorizer = DictVectorizer(sparse=True)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)

    # Logistic Regression model
    classifier = LogisticRegression(
        max_iter=1000,
        solver="liblinear",
        class_weight="balanced"
    )

    classifier.fit(X_train_vec, y_train)

    # Validate
    y_pred = classifier.predict(X_val_vec)
    accuracy = accuracy_score(y_val, y_pred)

    cv_accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")

print("\nAverage CV Accuracy:", sum(cv_accuracies) / K)


# Final model training on full training data

X_train_full = [f for f, y in train_val_set]
y_train_full = [y for f, y in train_val_set]

final_vectorizer = DictVectorizer(sparse=True)
X_train_full_vec = final_vectorizer.fit_transform(X_train_full)

final_classifier = LogisticRegression(
    max_iter=1000,
    solver="liblinear",
    class_weight="balanced"
)

final_classifier.fit(X_train_full_vec, y_train_full)


# Final evaluation on Test set
X_test = [f for f, y in test_set]
y_test = [y for f, y in test_set]

X_test_vec = final_vectorizer.transform(X_test)
y_test_pred = final_classifier.predict(X_test_vec)

test_accuracy = accuracy_score(y_test, y_test_pred)
print("Final Test Accuracy:", test_accuracy)


Total samples: 53982
Training+Validation samples: 48584
Test samples: 5398
Fold 1 Accuracy: 0.9286
Fold 2 Accuracy: 0.9312
Fold 3 Accuracy: 0.9294
Fold 4 Accuracy: 0.9271
Fold 5 Accuracy: 0.9291

Average CV Accuracy: 0.9290712994966274
Final Test Accuracy: 0.9310855872545387


In [39]:
# Create test data with original names preserved
TEST_SIZE = len(test_set)

test_rows = labeled_names[:TEST_SIZE]

for i in range(20):
    name, gender = test_rows[i]
    print(f"{i+1}. Name: {name}, Label: {gender}")



1. Name: Chiranjivi, Label: male
2. Name: Avanita, Label: female
3. Name: Sivanushan, Label: male
4. Name: Abeeshan, Label: male
5. Name: Siddhanta, Label: male
6. Name: Durwank, Label: male
7. Name: Marutatmaj, Label: male
8. Name: Amolika, Label: female
9. Name: Perarasi, Label: female
10. Name: Kartheeswari, Label: female
11. Name: Shivan, Label: male
12. Name: Nushaanan, Label: male
13. Name: Merujayan, Label: male
14. Name: Vaikunthanatha, Label: male
15. Name: Lakshmika, Label: female
16. Name: Senkuduvan, Label: male
17. Name: Saikash, Label: male
18. Name: Ajaana, Label: female
19. Name: Anubhab, Label: male
20. Name: Kritheikaa, Label: female


In [41]:
import pandas as pd
test_data = labeled_names[:TEST_SIZE]

# Convert test data to DataFrame
test_df = pd.DataFrame(test_data, columns=["Name", "Gender"])

# Optional: convert labels back to numeric if needed
# male -> 0, female -> 1
test_df["Gender"] = test_df["Gender"].map({"male": 0, "female": 1})

# Save to CSV
test_df.to_csv("test_dataset.csv", index=False)

print("Test dataset saved as test_dataset.csv")
print("Total test samples:", len(test_df))


Test dataset saved as test_dataset.csv
Total test samples: 5398


Let's just test it out on some names that did not appear in its training data:

In [None]:
from sklearn.metrics import accuracy_score
from collections import Counter


# Predict on test data
y_true = []
y_pred = []

X_test = []
y_test = []

for features, actual_gender in test_set:
    X_test.append(features)
    y_test.append(actual_gender)

# Vectorize test features
X_test_vec = final_vectorizer.transform(X_test)

# Predict
y_pred = final_classifier.predict(X_test_vec)
y_true = y_test


# Test accuracy
test_accuracy = accuracy_score(y_true, y_pred)
print("Test Accuracy:", test_accuracy)


# Confusion matrix (manual)
confusion = Counter()

for actual, predicted in zip(y_true, y_pred):
    confusion[(actual, predicted)] += 1

print("\nConfusion Matrix Counts:")
print("Actual Male -> Predicted Male:", confusion[(0, 0)])
print("Actual Male -> Predicted Female:", confusion[(0, 1)])
print("Actual Female -> Predicted Male:", confusion[(1, 0)])
print("Actual Female -> Predicted Female:", confusion[(1, 1)])


# Show few sample predictions
print("\nSample Predictions (Actual vs Predicted):")
for i in range(10):
    name_features, actual = test_set[i]
    pred = final_classifier.predict(
        final_vectorizer.transform([name_features])
    )[0]

    print(f"{i+1}. Actual: {actual}, Predicted: {pred}")


Test Accuracy: 0.9310855872545387

Confusion Matrix Counts:
Actual Male -> Predicted Male: 0
Actual Male -> Predicted Female: 0
Actual Female -> Predicted Male: 0
Actual Female -> Predicted Female: 0

Sample Predictions (Actual vs Predicted):
1. Actual: male, Predicted: male
2. Actual: female, Predicted: female
3. Actual: female, Predicted: male
4. Actual: female, Predicted: female
5. Actual: male, Predicted: male
6. Actual: male, Predicted: female
7. Actual: female, Predicted: female
8. Actual: female, Predicted: female
9. Actual: male, Predicted: male
10. Actual: male, Predicted: female


In [45]:
print("\nSample Predictions (Name | Actual | Predicted):")

for i in range(10):
    name, actual = test_data[i]          # name preserved
    features = gender_features(name)     # dict features

    # Vectorize (VERY IMPORTANT)
    features_vec = final_vectorizer.transform([features])

    # Predict
    predicted = final_classifier.predict(features_vec)[0]

    print(f"{i+1}. Name: {name}, Actual: {actual}, Predicted: {predicted}")



Sample Predictions (Name | Actual | Predicted):
1. Name: Chiranjivi, Actual: male, Predicted: male
2. Name: Avanita, Actual: female, Predicted: female
3. Name: Sivanushan, Actual: male, Predicted: male
4. Name: Abeeshan, Actual: male, Predicted: male
5. Name: Siddhanta, Actual: male, Predicted: male
6. Name: Durwank, Actual: male, Predicted: male
7. Name: Marutatmaj, Actual: male, Predicted: male
8. Name: Amolika, Actual: female, Predicted: female
9. Name: Perarasi, Actual: female, Predicted: female
10. Name: Kartheeswari, Actual: female, Predicted: female


In [47]:
from sklearn.metrics import accuracy_score

y_true = []
y_pred = []

for name, actual in test_data:
    features = gender_features(name)
    features_vec = final_vectorizer.transform([features])
    predicted = final_classifier.predict(features_vec)[0]

    y_true.append(actual)
    y_pred.append(predicted)

test_accuracy = accuracy_score(y_true, y_pred)
print("Test Accuracy:", test_accuracy)


Test Accuracy: 0.9612819562801037


Finally, we can examine the classifier to determine which features it found most effective for distinguishing the names' genders:

In [48]:
# Predict gender for a single name
def predict_gender(name):
    features = gender_features(name)
    features_vec = final_vectorizer.transform([features])
    return final_classifier.predict(features_vec)[0]


In [51]:
predict_gender("swapnil")


np.str_('male')