Gender Identification

In [1]:
def gender_features(name):
    name = name.lower()

    features = {}

    # Basic character features
    features["first_letter"] = name[0]
    features["last_letter"] = name[-1]
    features["name_length"] = len(name)

    # Vowel/consonant ending
    features["ends_with_vowel"] = name[-1] in "aeiou"
    features["ends_with_consonant"] = name[-1] not in "aeiou"

    # Last 2 and 3 character suffixes (VERY IMPORTANT)
    features["last_2"] = name[-2:] if len(name) >= 2 else name
    features["last_3"] = name[-3:] if len(name) >= 3 else name

    # Common Indian female suffixes
    female_suffixes = (
        "a", "i", "aa", "ya", "ni", "ika", "ini", "thi", "thra",
        "mitha", "shree", "rani", "latha", "vani", "sri"
    )

    for suf in female_suffixes:
        features[f"ends_with_{suf}"] = name.endswith(suf)

    # Common Indian male suffixes
    male_suffixes = (
        "n", "an", "esh", "raj", "shan", "kar", "deep", "dev",
        "kumar", "th", "ran", "eshan"
    )

    for suf in male_suffixes:
        features[f"ends_with_{suf}"] = name.endswith(suf)

    # Vowel count (Indian female names often higher)
    features["vowel_count"] = sum(1 for c in name if c in "aeiou")

    # Starts with vowel
    features["starts_with_vowel"] = name[0] in "aeiou"

    return features


The returned dictionary, known as a **feature set,** maps from feature names to their values. Feature names are case-sensitive strings that typically provide a short human-readable description of the feature, as in the example `'last_letter'`. Feature values are values with simple types, such as booleans, numbers, and strings.

Now that we've defined a feature extractor, we need to prepare a list of examples and corresponding class labels.

In [2]:
import pandas as pd
import random

# Load the CSV file
# Replace 'names.csv' with your actual CSV file path
df = pd.read_csv("sample_indian_names.csv")

# Map numeric gender labels to text labels
# 0 -> male, 1 -> female
df["Gender"] = df["Gender"].map({0: "male", 1: "female"})

# Convert to list of (name, gender) tuples
labeled_names = list(zip(df["Name"], df["Gender"]))

# Shuffle the combined data
random.shuffle(labeled_names)

labeled_names[:10]


[('Gabinesh', 'male'),
 ('Hestia', 'female'),
 ('Naavinya', 'male'),
 ('Enthushan', 'male'),
 ('Stalin', 'male'),
 ('Mishry', 'male'),
 ('Almika', 'female'),
 ('Yaadhav', 'male'),
 ('Nimalavenen', 'male'),
 ('Vehant', 'male')]

Next, we use the feature extractor to process the names data, and divide the resulting list of feature sets into a **training set** and a **test set** The training set is used to train a Naive Bayes classifier.

In [None]:
import random
from nltk import NaiveBayesClassifier, classify
from sklearn.model_selection import KFold

# Feature extraction
featuresets = [(gender_features(name), gender) for (name, gender) in labeled_names]

# Shuffle once before splitting
random.shuffle(featuresets)

# Train / Test split (90% train, 10% test)
TOTAL_SIZE = len(featuresets)          
TEST_SIZE = int(0.10 * TOTAL_SIZE)    

test_set = featuresets[:TEST_SIZE]
train_val_set = featuresets[TEST_SIZE:]

print("Total samples:", TOTAL_SIZE)
print("Training+Validation samples:", len(train_val_set))
print("Test samples:", len(test_set))

# K-Fold Cross Validation on Training data
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

cv_accuracies = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_val_set), 1):
    train_fold = [train_val_set[i] for i in train_idx]
    val_fold = [train_val_set[i] for i in val_idx]

    classifier = NaiveBayesClassifier.train(train_fold)
    accuracy = classify.accuracy(classifier, val_fold)

    cv_accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")

print("\nAverage CV Accuracy:", sum(cv_accuracies) / K)

# Final model training on full training data
final_classifier = NaiveBayesClassifier.train(train_val_set)

# Final evaluation on Test set
test_accuracy = classify.accuracy(final_classifier, test_set)
print("Final Test Accuracy:", test_accuracy)


Total samples: 53982
Training+Validation samples: 48584
Test samples: 5398
Fold 1 Accuracy: 0.8911
Fold 2 Accuracy: 0.8942
Fold 3 Accuracy: 0.8888
Fold 4 Accuracy: 0.8970
Fold 5 Accuracy: 0.8956

Average CV Accuracy: 0.8933394182579854
Final Test Accuracy: 0.8966283808818081


In [4]:
# Create test data with original names preserved
TEST_SIZE = len(test_set)

test_rows = labeled_names[:TEST_SIZE]

for i in range(20):
    name, gender = test_rows[i]
    print(f"{i+1}. Name: {name}, Label: {gender}")



1. Name: Gabinesh, Label: male
2. Name: Hestia, Label: female
3. Name: Naavinya, Label: male
4. Name: Enthushan, Label: male
5. Name: Stalin, Label: male
6. Name: Mishry, Label: male
7. Name: Almika, Label: female
8. Name: Yaadhav, Label: male
9. Name: Nimalavenen, Label: male
10. Name: Vehant, Label: male
11. Name: Smitavaktra, Label: male
12. Name: Kesanth, Label: male
13. Name: Anban, Label: male
14. Name: Urmi, Label: female
15. Name: Nubisha, Label: female
16. Name: Robena, Label: female
17. Name: Raghbir, Label: male
18. Name: Kowmari, Label: female
19. Name: Vinodhine, Label: female
20. Name: Varatam, Label: male


In [None]:
import pandas as pd
test_data = labeled_names[:TEST_SIZE]

# Convert test data to DataFrame
test_df = pd.DataFrame(test_data, columns=["Name", "Gender"])

# Optional: convert labels back to numeric if needed
# male -> 0, female -> 1
test_df["Gender"] = test_df["Gender"].map({"male": 0, "female": 1})

# Save to CSV
test_df.to_csv("test_dataset.csv", index=False)

print("Test dataset saved as test_dataset.csv")
print("Total test samples:", len(test_df))


Test dataset saved as test_dataset.csv
Total test samples: 5398


Let's just test it out on some names that did not appear in its training data:

In [None]:
from nltk import classify
from collections import Counter


# Predict on test data
y_true = []
y_pred = []

for features, actual_gender in test_set:
    predicted_gender = final_classifier.classify(features)
    y_true.append(actual_gender)
    y_pred.append(predicted_gender)


# test accuracy
test_accuracy = classify.accuracy(final_classifier, test_set)
print("Test Accuracy:", test_accuracy)


# Confusion matrix (manual)
confusion = Counter()

for actual, predicted in zip(y_true, y_pred):
    confusion[(actual, predicted)] += 1

print("\nConfusion Matrix Counts:")
print("Actual Male -> Predicted Male:", confusion[('male', 'male')])
print("Actual Male -> Predicted Female:", confusion[('male', 'female')])
print("Actual Female -> Predicted Male:", confusion[('female', 'male')])
print("Actual Female -> Predicted Female:", confusion[('female', 'female')])


# show few sample predictions (verification)
print("\nSample Predictions (Actual vs Predicted):")
for i in range(10):
    features, actual = test_set[i]
    predicted = final_classifier.classify(features)
    print(f"{i+1}. Actual: {actual}, Predicted: {predicted}")


Test Accuracy: 0.8966283808818081

Confusion Matrix Counts:
Actual Male -> Predicted Male: 2560
Actual Male -> Predicted Female: 356
Actual Female -> Predicted Male: 202
Actual Female -> Predicted Female: 2280

Sample Predictions (Actual vs Predicted):
1. Actual: male, Predicted: male
2. Actual: female, Predicted: female
3. Actual: female, Predicted: female
4. Actual: male, Predicted: male
5. Actual: female, Predicted: female
6. Actual: female, Predicted: female
7. Actual: male, Predicted: male
8. Actual: female, Predicted: female
9. Actual: male, Predicted: male
10. Actual: male, Predicted: male


In [7]:
print("\nSample Predictions (Name | Actual | Predicted):")

for i in range(10):
    name, actual = test_data[i]              # name preserved here
    features = gender_features(name)         # recreate features
    predicted = final_classifier.classify(features)

    print(f"{i+1}. Name: {name}, Actual: {actual}, Predicted: {predicted}")



Sample Predictions (Name | Actual | Predicted):
1. Name: Gabinesh, Actual: male, Predicted: male
2. Name: Hestia, Actual: female, Predicted: female
3. Name: Naavinya, Actual: male, Predicted: female
4. Name: Enthushan, Actual: male, Predicted: male
5. Name: Stalin, Actual: male, Predicted: male
6. Name: Mishry, Actual: male, Predicted: male
7. Name: Almika, Actual: female, Predicted: female
8. Name: Yaadhav, Actual: male, Predicted: male
9. Name: Nimalavenen, Actual: male, Predicted: male
10. Name: Vehant, Actual: male, Predicted: male


In [23]:
from nltk.classify import accuracy
print(accuracy(final_classifier, test_set))

0.8966283808818081


Finally, we can examine the classifier to determine which features it found most effective for distinguishing the names' genders:

In [9]:
final_classifier.show_most_informative_features(10)

Most Informative Features
          ends_with_shan = True             male : female =    236.0 : 1.0
                  last_3 = 'van'            male : female =    232.8 : 1.0
                  last_3 = 'han'            male : female =    215.6 : 1.0
                  last_3 = 'yan'            male : female =    154.3 : 1.0
                  last_3 = 'kan'            male : female =    149.6 : 1.0
                  last_3 = 'san'            male : female =    146.9 : 1.0
           ends_with_ika = True           female : male   =    125.8 : 1.0
                  last_3 = 'ika'          female : male   =    125.1 : 1.0
                  last_3 = 'gan'            male : female =    124.3 : 1.0
                  last_3 = 'hen'            male : female =    114.7 : 1.0


In [10]:
# --------------------------------------------------
# Predict gender using the trained model
# --------------------------------------------------
def predict_gender(name):
    features = gender_features(name)
    return final_classifier.classify(features)


In [None]:

# Predict gender for a single name
name = "yashraj"  

# Predict using your trained classifier
features = gender_features(name)
predicted_gender = final_classifier.classify(features)

print(f"Name: {name}, Predicted Gender: {predicted_gender}")


Name: yashraj, Predicted Gender: male
