Gender Identification

In [75]:
import pandas as pd
import random
import joblib
import os
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import Counter

In [76]:
def gender_features(name):
    name = name.lower()

    features = {}

    # BASIC FEATURES (keep existing logic)    
    features["first_letter"] = name[0]
    features["last_letter"] = name[-1]
    features["name_length"] = len(name)

    features["ends_with_vowel"] = name[-1] in "aeiou"
    features["ends_with_consonant"] = name[-1] not in "aeiou"
    features["starts_with_vowel"] = name[0] in "aeiou"

    features["last_2"] = name[-2:] if len(name) >= 2 else name
    features["last_3"] = name[-3:] if len(name) >= 3 else name
    
    # INDIAN-SPECIFIC SUFFIX FEATURES    
    female_suffixes = (
        "a", "i", "aa", "ya", "ni", "ika", "ini", "thi", "thra",
        "mitha", "shree", "rani", "latha", "vani", "sri"
    )

    male_suffixes = (
        "n", "an", "esh", "raj", "shan", "kar", "deep", "dev",
        "kumar", "th", "ran", "eshan"
    )

    for suf in female_suffixes:
        features[f"ends_with_female_{suf}"] = name.endswith(suf)

    for suf in male_suffixes:
        features[f"ends_with_male_{suf}"] = name.endswith(suf)

   
    # VOWEL STATISTICS    
    features["vowel_count"] = sum(1 for c in name if c in "aeiou")
    features["vowel_ratio"] = features["vowel_count"] / len(name)    
   
    # Using boundary markers improves learning
    padded = f"<{name}>"

    # 2-grams, 3-grams, 4-grams
    for n in (2, 3, 4):
        for i in range(len(padded) - n + 1):
            gram = padded[i:i+n]
            features[f"char_{n}gram_{gram}"] = True

    return features


In [77]:


# Load the CSV file
# Replace 'names.csv' with your actual CSV file path
df = pd.read_csv("sample_indian_names.csv")

# Map numeric gender labels to text labels
# 0 -> male, 1 -> female
df["Gender"] = df["Gender"].map({0: "male", 1: "female"})

# Convert to list of (name, gender) tuples
labeled_names = list(zip(df["Name"], df["Gender"]))

# Shuffle the combined data
random.shuffle(labeled_names)

labeled_names[:10]


[('Vibhish', 'male'),
 ('Anmay', 'male'),
 ('Kraanti', 'male'),
 ('Yuveena', 'female'),
 ('Sivahulan', 'male'),
 ('Kogulakannan', 'male'),
 ('Harika', 'female'),
 ('Mishtu', 'female'),
 ('Maharudra', 'male'),
 ('Vani', 'female')]

In [78]:

# Feature extraction
featuresets = [(gender_features(name), gender) for (name, gender) in labeled_names]

# Shuffle once before splitting
random.shuffle(featuresets)


# Train / Test split (90% train, 10% test)
TOTAL_SIZE = len(featuresets)
TEST_SIZE = int(0.10 * TOTAL_SIZE)

test_set = featuresets[:TEST_SIZE]
train_val_set = featuresets[TEST_SIZE:]

print("Total samples:", TOTAL_SIZE)
print("Training+Validation samples:", len(train_val_set))
print("Test samples:", len(test_set))


# K-Fold Cross Validation on Training data
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

cv_accuracies = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_val_set), 1):

    # Split folds
    train_fold = [train_val_set[i] for i in train_idx]
    val_fold = [train_val_set[i] for i in val_idx]

    X_train = [f for f, y in train_fold]
    y_train = [y for f, y in train_fold]

    X_val = [f for f, y in val_fold]
    y_val = [y for f, y in val_fold]

    # Vectorize features
    vectorizer = DictVectorizer(sparse=True)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)

    # Logistic Regression model
    classifier = LogisticRegression(
        max_iter=1000,
        solver="liblinear",
        class_weight="balanced"
    )

    classifier.fit(X_train_vec, y_train)

    # Validate
    y_pred = classifier.predict(X_val_vec)
    accuracy = accuracy_score(y_val, y_pred)

    cv_accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")

print("\nAverage CV Accuracy:", sum(cv_accuracies) / K)


# Final model training on full training data

X_train_full = [f for f, y in train_val_set]
y_train_full = [y for f, y in train_val_set]

final_vectorizer = DictVectorizer(sparse=True)
X_train_full_vec = final_vectorizer.fit_transform(X_train_full)

final_classifier = LogisticRegression(
    max_iter=1000,
    solver="liblinear",
    class_weight="balanced"
)

final_classifier.fit(X_train_full_vec, y_train_full)


# Final evaluation on Test set
X_test = [f for f, y in test_set]
y_test = [y for f, y in test_set]

X_test_vec = final_vectorizer.transform(X_test)
y_test_pred = final_classifier.predict(X_test_vec)

test_accuracy = accuracy_score(y_test, y_test_pred)
print("Final Test Accuracy:", test_accuracy)


Total samples: 53982
Training+Validation samples: 48584
Test samples: 5398
Fold 1 Accuracy: 0.9266
Fold 2 Accuracy: 0.9297
Fold 3 Accuracy: 0.9296
Fold 4 Accuracy: 0.9324
Fold 5 Accuracy: 0.9298

Average CV Accuracy: 0.9296270414017647
Final Test Accuracy: 0.9260837347165617


In [79]:

# Create directory to store model artifacts
os.makedirs("model", exist_ok=True)

# Save trained model and vectorizer
joblib.dump(final_classifier, "model/gender_model.joblib")
joblib.dump(final_vectorizer, "model/vectorizer.joblib")

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [80]:
# Create test data with original names preserved
TEST_SIZE = len(test_set)

test_rows = labeled_names[:TEST_SIZE]

for i in range(20):
    name, gender = test_rows[i]
    print(f"{i+1}. Name: {name}, Label: {gender}")



1. Name: Vibhish, Label: male
2. Name: Anmay, Label: male
3. Name: Kraanti, Label: male
4. Name: Yuveena, Label: female
5. Name: Sivahulan, Label: male
6. Name: Kogulakannan, Label: male
7. Name: Harika, Label: female
8. Name: Mishtu, Label: female
9. Name: Maharudra, Label: male
10. Name: Vani, Label: female
11. Name: Saptanshu, Label: male
12. Name: Kesavalu, Label: male
13. Name: Yuganthini, Label: female
14. Name: Jeyagowry, Label: female
15. Name: Darmendran, Label: male
16. Name: Jathush, Label: male
17. Name: Jananthya, Label: female
18. Name: Sivamurugan, Label: male
19. Name: Letchumanan, Label: male
20. Name: Sasivarman, Label: male


In [81]:
test_data = labeled_names[:TEST_SIZE]

# Convert test data to DataFrame
test_df = pd.DataFrame(test_data, columns=["Name", "Gender"])

# Optional: convert labels back to numeric if needed
# male -> 0, female -> 1
test_df["Gender"] = test_df["Gender"].map({"male": 0, "female": 1})

# Save to CSV
test_df.to_csv("test_dataset.csv", index=False)

print("Test dataset saved as test_dataset.csv")
print("Total test samples:", len(test_df))


Test dataset saved as test_dataset.csv
Total test samples: 5398


In [82]:
# Predict on test data
y_true = []
y_pred = []

X_test = []
y_test = []

for features, actual_gender in test_set:
    X_test.append(features)
    y_test.append(actual_gender)

# Vectorize test features
X_test_vec = final_vectorizer.transform(X_test)

# Predict
y_pred = final_classifier.predict(X_test_vec)
y_true = y_test


# Test accuracy
test_accuracy = accuracy_score(y_true, y_pred)
print("Test Accuracy:", test_accuracy)


# Confusion matrix (manual)
confusion = Counter()

for actual, predicted in zip(y_true, y_pred):
    confusion[(actual, predicted)] += 1

print("\nConfusion Matrix Counts:")
print("Actual Male -> Predicted Male:", confusion[("male", "male")])
print("Actual Male -> Predicted Female:", confusion[("male", "female")])
print("Actual Female -> Predicted Male:", confusion[("female", "male")])
print("Actual Female -> Predicted Female:", confusion[("female", "female")])


# Show few sample predictions
print("\nSample Predictions (Actual vs Predicted):")
for i in range(10):
    name_features, actual = test_set[i]
    pred = final_classifier.predict(
        final_vectorizer.transform([name_features])
    )[0]

    print(f"{i+1}. Actual: {actual}, Predicted: {pred}")


Test Accuracy: 0.9260837347165617

Confusion Matrix Counts:
Actual Male -> Predicted Male: 2618
Actual Male -> Predicted Female: 253
Actual Female -> Predicted Male: 146
Actual Female -> Predicted Female: 2381

Sample Predictions (Actual vs Predicted):
1. Actual: female, Predicted: female
2. Actual: female, Predicted: female
3. Actual: male, Predicted: male
4. Actual: male, Predicted: male
5. Actual: female, Predicted: female
6. Actual: female, Predicted: female
7. Actual: female, Predicted: female
8. Actual: female, Predicted: female
9. Actual: female, Predicted: female
10. Actual: female, Predicted: female


In [83]:
print("\nSample Predictions (Name | Actual | Predicted):")

for i in range(10):
    name, actual = test_data[i]          # name preserved
    features = gender_features(name)     # dict features

    # Vectorize (VERY IMPORTANT)
    features_vec = final_vectorizer.transform([features])

    # Predict
    predicted = final_classifier.predict(features_vec)[0]

    print(f"{i+1}. Name: {name}, Actual: {actual}, Predicted: {predicted}")



Sample Predictions (Name | Actual | Predicted):
1. Name: Vibhish, Actual: male, Predicted: male
2. Name: Anmay, Actual: male, Predicted: male
3. Name: Kraanti, Actual: male, Predicted: male
4. Name: Yuveena, Actual: female, Predicted: female
5. Name: Sivahulan, Actual: male, Predicted: male
6. Name: Kogulakannan, Actual: male, Predicted: male
7. Name: Harika, Actual: female, Predicted: female
8. Name: Mishtu, Actual: female, Predicted: female
9. Name: Maharudra, Actual: male, Predicted: male
10. Name: Vani, Actual: female, Predicted: female


In [84]:
def predict_gender_with_unisex(name, threshold=0.80, margin=0.65):
    features = gender_features(name)
    features_vec = final_vectorizer.transform([features])

    probs = final_classifier.predict_proba(features_vec)[0]
    classes = [str(c) for c in final_classifier.classes_]

    prob_dict = {cls: float(prob) for cls, prob in zip(classes, probs)}

    sorted_probs = sorted(prob_dict.items(), key=lambda x: x[1], reverse=True)
    top_label, top_prob = sorted_probs[0]
    second_label, second_prob = sorted_probs[1]

    if top_prob < threshold or (top_prob - second_prob) < margin:
        return {
            "name": name,
            "prediction": "Common",
            "confidence": top_prob,
            "probabilities": prob_dict
        }

    return {
        "name": name,
        "prediction": top_label,
        "confidence": top_prob,
        "probabilities": prob_dict
    }


In [85]:
BASE_DIR = os.getcwd()   # Notebook-safe base directory
print("BASE_DIR set to:", BASE_DIR)


BASE_DIR set to: d:\customapps\ECAT\Gender-Classification


In [86]:
def store_feedback(name, model_prediction, human_verdict, confidence):
    """
    Stores human feedback in feedback.csv
    """

    file_path = os.path.join(BASE_DIR, "feedback-identification.csv")

    row = pd.DataFrame(
        [[name, model_prediction, human_verdict, confidence]],
        columns=[
            "Name",
            "Model_Prediction",
            "Human_Verdict",
            "Confidence"
        ]
    )

    if os.path.exists(file_path):
        row.to_csv(file_path, mode="a", header=False, index=False)
    else:
        row.to_csv(file_path, index=False)

    print(f"Feedback saved at: {file_path}")


In [87]:
# Name input (single source of truth)
name_input = "juspreet"

# Model prediction
result = predict_gender_with_unisex(name_input)

# Human label
human_verdict = "female"

# Store feedback using the same name
store_feedback(
    name=name_input,
    model_prediction=result["prediction"],
    human_verdict=human_verdict,
    confidence=result["confidence"]
)


Feedback saved at: d:\customapps\ECAT\Gender-Classification\feedback-identification.csv


In [88]:
print(result)

{'name': 'juspreet', 'prediction': 'Common', 'confidence': 0.5826166369063538, 'probabilities': {'female': 0.4173833630936462, 'male': 0.5826166369063538}}


For testing the Retained model

In [89]:
# Retrain Gender Model using Human Feedback
# (v1 → v2 with warm start & versioning)

import os
import pandas as pd
import joblib
import numpy as np
from collections import Counter
from sklearn.linear_model import LogisticRegression


# Base directory
BASE_DIR = os.getcwd()
MODEL_DIR = os.path.join(BASE_DIR, "model")
os.makedirs(MODEL_DIR, exist_ok=True)


# Versioned paths
MODEL_V1_PATH = os.path.join(MODEL_DIR, "gender_model.joblib")
MODEL_V2_PATH = os.path.join(MODEL_DIR, "gender_model_v2.joblib")
VECTORIZER_PATH = os.path.join(MODEL_DIR, "vectorizer.joblib")
FEEDBACK_PATH = os.path.join(BASE_DIR, "feedback-identification.csv")

# Load v1 artifacts
final_classifier = joblib.load(MODEL_V1_PATH)
final_vectorizer = joblib.load(VECTORIZER_PATH)
print("Loaded v1 model and vectorizer")

# Feature extractor (same as training)
def gender_features(name):
    name = name.lower()
    return {
        "last_letter": name[-1],
        "first_letter": name[0],
        "name_length": len(name),
        "suffix_2": name[-2:],
        "suffix_3": name[-3:],
        "prefix_2": name[:2],
        "prefix_3": name[:3]
    }


# Load feedback data
feedback_df = pd.read_csv(FEEDBACK_PATH)

print("\nFeedback samples loaded:", len(feedback_df))
print(feedback_df.head())

# Use HUMAN verdict as source of truth
feedback_df["Final_Label"] = feedback_df["Human_Verdict"]

# Build training data from feedback
# (Human feedback is more important → oversample)
feedback_features = []
feedback_labels = []

OVERSAMPLE_FACTOR = 14   # human verdict priority

for _, row in feedback_df.iterrows():
    for _ in range(OVERSAMPLE_FACTOR):
        feedback_features.append(gender_features(row["Name"]))
        feedback_labels.append(row["Final_Label"])

print("\nLabel distribution after oversampling:")
print(Counter(feedback_labels))

# Vectorize
X_feedback_vec = final_vectorizer.transform(feedback_features)
y_feedback = feedback_labels


# CREATE v2 MODEL (COPY v1 WEIGHTS → WARM START)
retrained_classifier = LogisticRegression(
    max_iter=1000,
    solver="liblinear",
    class_weight="balanced",
    warm_start=True
)

# Copy learned parameters from v1
retrained_classifier.classes_ = final_classifier.classes_
retrained_classifier.coef_ = np.copy(final_classifier.coef_)
retrained_classifier.intercept_ = np.copy(final_classifier.intercept_)
retrained_classifier.n_features_in_ = final_classifier.n_features_in_
print("\nWeights copied from v1 → initializing v2")

# Retrain model (continues learning)
retrained_classifier.fit(X_feedback_vec, y_feedback)
print("\nv2 model retrained using human feedback")

# Save v2 model (DO NOT overwrite v1)
joblib.dump(retrained_classifier, MODEL_V2_PATH)

print("\n v2 model saved successfully")
print("v1 model path:", MODEL_V1_PATH)
print("v2 model path:", MODEL_V2_PATH)

Loaded v1 model and vectorizer

Feedback samples loaded: 103
      Name Model_Prediction Human_Verdict  Confidence
0  swapnil             male        common    0.999485
1     Ravi           Common          male    0.807898
2  Nandini           female        female    0.921275
3    Payal           female        female    0.926518
4    Sonal             male        female    0.995755

Label distribution after oversampling:
Counter({'male': 854, 'female': 532, 'common': 56})

Weights copied from v1 → initializing v2

v2 model retrained using human feedback

 v2 model saved successfully
v1 model path: d:\customapps\ECAT\Gender-Classification\model\gender_model.joblib
v2 model path: d:\customapps\ECAT\Gender-Classification\model\gender_model_v2.joblib




In [90]:
# Accuracy Comparison: v1 vs v2

import os
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score

# ----------------
# Paths
# ----------------
BASE_DIR = os.getcwd()
MODEL_DIR = os.path.join(BASE_DIR, "model")

MODEL_V1_PATH = os.path.join(MODEL_DIR, "gender_model.joblib")
MODEL_V2_PATH = os.path.join(MODEL_DIR, "gender_model_v2.joblib")
VECTORIZER_PATH = os.path.join(MODEL_DIR, "vectorizer.joblib")
FEEDBACK_PATH = os.path.join(BASE_DIR, "feedback-identification.csv")

# Load models & vectorizer
model_v1 = joblib.load(MODEL_V1_PATH)
model_v2 = joblib.load(MODEL_V2_PATH)
vectorizer = joblib.load(VECTORIZER_PATH)
print("Loaded v1 and v2 models")

# Feature extractor (same as training)
def gender_features(name):
    name = name.lower()
    return {
        "last_letter": name[-1],
        "first_letter": name[0],
        "name_length": len(name),
        "suffix_2": name[-2:],
        "suffix_3": name[-3:],
        "prefix_2": name[:2],
        "prefix_3": name[:3]
    }

# 1️⃣ Accuracy on ORIGINAL TEST SET (Before Feedback)
X_test = [features for features, _ in test_set]
y_test = [label for _, label in test_set]

X_test_vec = vectorizer.transform(X_test)

y_test_pred_v1 = model_v1.predict(X_test_vec)
y_test_pred_v2 = model_v2.predict(X_test_vec)

acc_test_v1 = accuracy_score(y_test, y_test_pred_v1)
acc_test_v2 = accuracy_score(y_test, y_test_pred_v2)

print("\nTest Set Accuracy:")
print("v1 (Before Feedback):", round(acc_test_v1, 4))
print("v2 (After Feedback): ", round(acc_test_v2, 4))


# Accuracy on FEEDBACK DATA (Human Truth)
feedback_df = pd.read_csv(FEEDBACK_PATH)

X_feedback = [gender_features(name) for name in feedback_df["Name"]]
y_feedback = feedback_df["Human_Verdict"].tolist()

X_feedback_vec = vectorizer.transform(X_feedback)

y_feedback_pred_v1 = model_v1.predict(X_feedback_vec)
y_feedback_pred_v2 = model_v2.predict(X_feedback_vec)

acc_feedback_v1 = accuracy_score(y_feedback, y_feedback_pred_v1)
acc_feedback_v2 = accuracy_score(y_feedback, y_feedback_pred_v2)

print("\nFeedback Data Accuracy (Human Verdict):")
print("v1 (Before Feedback):", round(acc_feedback_v1, 4))
print("v2 (After Feedback): ", round(acc_feedback_v2, 4))

# Improvement Summary

print("\nAccuracy Improvement Summary:")
print("Test Set Change     :", round(acc_test_v2 - acc_test_v1, 4))
print("Feedback Set Change :", round(acc_feedback_v2 - acc_feedback_v1, 4))


Loaded v1 and v2 models

Test Set Accuracy:
v1 (Before Feedback): 0.9261
v2 (After Feedback):  0.704

Feedback Data Accuracy (Human Verdict):
v1 (Before Feedback): 0.5631
v2 (After Feedback):  0.8155

Accuracy Improvement Summary:
Test Set Change     : -0.2221
Feedback Set Change : 0.2524
