Gender Identification

In [125]:
def gender_features(name):
    name = name.lower()

    features = {}

    # BASIC FEATURES (keep existing logic)    
    features["first_letter"] = name[0]
    features["last_letter"] = name[-1]
    features["name_length"] = len(name)

    features["ends_with_vowel"] = name[-1] in "aeiou"
    features["ends_with_consonant"] = name[-1] not in "aeiou"
    features["starts_with_vowel"] = name[0] in "aeiou"

    features["last_2"] = name[-2:] if len(name) >= 2 else name
    features["last_3"] = name[-3:] if len(name) >= 3 else name
    
    # INDIAN-SPECIFIC SUFFIX FEATURES    
    female_suffixes = (
        "a", "i", "aa", "ya", "ni", "ika", "ini", "thi", "thra",
        "mitha", "shree", "rani", "latha", "vani", "sri"
    )

    male_suffixes = (
        "n", "an", "esh", "raj", "shan", "kar", "deep", "dev",
        "kumar", "th", "ran", "eshan"
    )

    for suf in female_suffixes:
        features[f"ends_with_female_{suf}"] = name.endswith(suf)

    for suf in male_suffixes:
        features[f"ends_with_male_{suf}"] = name.endswith(suf)

   
    # VOWEL STATISTICS    
    features["vowel_count"] = sum(1 for c in name if c in "aeiou")
    features["vowel_ratio"] = features["vowel_count"] / len(name)    
   
    # Using boundary markers improves learning
    padded = f"<{name}>"

    # 2-grams, 3-grams, 4-grams
    for n in (2, 3, 4):
        for i in range(len(padded) - n + 1):
            gram = padded[i:i+n]
            features[f"char_{n}gram_{gram}"] = True

    return features


The returned dictionary, known as a **feature set,** maps from feature names to their values. Feature names are case-sensitive strings that typically provide a short human-readable description of the feature, as in the example `'last_letter'`. Feature values are values with simple types, such as booleans, numbers, and strings.

Now that we've defined a feature extractor, we need to prepare a list of examples and corresponding class labels.

In [126]:
import pandas as pd
import random

# Load the CSV file
# Replace 'names.csv' with your actual CSV file path
df = pd.read_csv("sample_indian_names.csv")

# Map numeric gender labels to text labels
# 0 -> male, 1 -> female
df["Gender"] = df["Gender"].map({0: "male", 1: "female"})

# Convert to list of (name, gender) tuples
labeled_names = list(zip(df["Name"], df["Gender"]))

# Shuffle the combined data
random.shuffle(labeled_names)

labeled_names[:10]


[('Thamu', 'male'),
 ('Shreemaan', 'male'),
 ('Manimehala', 'female'),
 ('Karyshna', 'female'),
 ('Ubesh', 'male'),
 ('Vedanti', 'female'),
 ('Nabhayan', 'male'),
 ('Miruthanya', 'female'),
 ('Thashwina', 'female'),
 ('Atharsan', 'male')]

Next, we use the feature extractor to process the names data, and divide the resulting list of feature sets into a **training set** and a **test set** The training set is used to train a Naive Bayes classifier.

In [127]:
import random
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# STEP 1: Feature extraction (name preserved)
featuresets = [(name, gender_features(name), gender) for (name, gender) in labeled_names]

random.shuffle(featuresets)

# Train / Test split
TOTAL_SIZE = len(featuresets)
TEST_SIZE = int(0.10 * TOTAL_SIZE)

test_set = featuresets[:TEST_SIZE]
train_val_set = featuresets[TEST_SIZE:]

print("Total samples:", TOTAL_SIZE)
print("Training+Validation samples:", len(train_val_set))
print("Test samples:", len(test_set))


# K-Fold Cross Validation
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

cv_accuracies = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_val_set), 1):

    train_fold = [train_val_set[i] for i in train_idx]
    val_fold = [train_val_set[i] for i in val_idx]

    X_train = [features for _, features, _ in train_fold]
    y_train = [label for _, _, label in train_fold]

    X_val = [features for _, features, _ in val_fold]
    y_val = [label for _, _, label in val_fold]

    vectorizer = DictVectorizer(sparse=True)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)

    classifier = LogisticRegression(
        max_iter=1000,
        solver="liblinear",
        class_weight="balanced"
    )

    classifier.fit(X_train_vec, y_train)

    y_pred = classifier.predict(X_val_vec)
    accuracy = accuracy_score(y_val, y_pred)

    cv_accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy:.4f}")

print("\nAverage CV Accuracy:", sum(cv_accuracies) / K)


Total samples: 53982
Training+Validation samples: 48584
Test samples: 5398
Fold 1 Accuracy: 0.9287
Fold 2 Accuracy: 0.9293
Fold 3 Accuracy: 0.9305
Fold 4 Accuracy: 0.9272
Fold 5 Accuracy: 0.9297

Average CV Accuracy: 0.9290918946914012


In [128]:
# Final training on full train+val set
X_train_full = [features for _, features, _ in train_val_set]
y_train_full = [label for _, _, label in train_val_set]

final_vectorizer = DictVectorizer(sparse=True)
X_train_full_vec = final_vectorizer.fit_transform(X_train_full)

final_classifier = LogisticRegression(
    max_iter=1000,
    solver="liblinear",
    class_weight="balanced"
)

final_classifier.fit(X_train_full_vec, y_train_full)


# Test evaluation
X_test = [features for _, features, _ in test_set]
y_test = [label for _, _, label in test_set]

X_test_vec = final_vectorizer.transform(X_test)
y_test_pred = final_classifier.predict(X_test_vec)

test_accuracy = accuracy_score(y_test, y_test_pred)
print("Final Test Accuracy:", test_accuracy)


Final Test Accuracy: 0.9347906632085958


In [129]:
# Create test data with original names preserved
TEST_SIZE = len(test_set)

test_rows = labeled_names[:TEST_SIZE]

for i in range(20):
    name, gender = test_rows[i]
    print(f"{i+1}. Name: {name}, Label: {gender}")



1. Name: Thamu, Label: male
2. Name: Shreemaan, Label: male
3. Name: Manimehala, Label: female
4. Name: Karyshna, Label: female
5. Name: Ubesh, Label: male
6. Name: Vedanti, Label: female
7. Name: Nabhayan, Label: male
8. Name: Miruthanya, Label: female
9. Name: Thashwina, Label: female
10. Name: Atharsan, Label: male
11. Name: Kaarthigayini, Label: female
12. Name: Thivashinie, Label: female
13. Name: Rukeeshan, Label: male
14. Name: Amokitha, Label: female
15. Name: Devadyumna, Label: male
16. Name: Dhinakaran, Label: male
17. Name: Vadamalai, Label: male
18. Name: Dia, Label: female
19. Name: Ucchal, Label: female
20. Name: Kapilash, Label: male


In [130]:
import pandas as pd

# test_set contains (name, features, gender)
test_data = [(name, gender) for (name, _, gender) in test_set]

test_df = pd.DataFrame(test_data, columns=["Name", "Gender"])

# Optional: convert to numeric labels
test_df["Gender"] = test_df["Gender"].map({"male": 0, "female": 1})

test_df.to_csv("test_dataset.csv", index=False)

print("Test dataset saved as test_dataset.csv")
print("Total test samples:", len(test_df))


Test dataset saved as test_dataset.csv
Total test samples: 5398


Let's just test it out on some names that did not appear in its training data:

In [131]:
from sklearn.metrics import accuracy_score
from collections import Counter

X_test = []
y_test = []

# Correct unpacking (3 values)
for name, features, actual_gender in test_set:
    X_test.append(features)
    y_test.append(actual_gender)

# Vectorize test features
X_test_vec = final_vectorizer.transform(X_test)

# Predict
y_pred = final_classifier.predict(X_test_vec)
y_true = y_test

# Test accuracy
test_accuracy = accuracy_score(y_true, y_pred)
print("Test Accuracy:", test_accuracy)

# Confusion matrix
confusion = Counter()
for actual, predicted in zip(y_true, y_pred):
    confusion[(actual, predicted)] += 1

print("Actual Male -> Predicted Male:", confusion[("male", "male")])
print("Actual Male -> Predicted Female:", confusion[("male", "female")])
print("Actual Female -> Predicted Male:", confusion[("female", "male")])
print("Actual Female -> Predicted Female:", confusion[("female", "female")])

# Show sample predictions
print("\nSample Predictions (Name | Actual | Predicted):")
for i in range(10):
    name, features, actual = test_set[i]
    pred = final_classifier.predict(
        final_vectorizer.transform([features])
    )[0]

    print(f"{i+1}. Name: {name}, Actual: {actual}, Predicted: {pred}")


Test Accuracy: 0.9347906632085958
Actual Male -> Predicted Male: 2644
Actual Male -> Predicted Female: 209
Actual Female -> Predicted Male: 143
Actual Female -> Predicted Female: 2402

Sample Predictions (Name | Actual | Predicted):
1. Name: Praseeth, Actual: male, Predicted: male
2. Name: Swagat, Actual: male, Predicted: male
3. Name: Vallaki, Actual: male, Predicted: female
4. Name: Hemavarana, Actual: female, Predicted: female
5. Name: Saachi, Actual: female, Predicted: female
6. Name: Jeyachandraramanthanan, Actual: male, Predicted: male
7. Name: Kaarmuhil, Actual: male, Predicted: female
8. Name: Ayoti, Actual: female, Predicted: female
9. Name: Ibha, Actual: female, Predicted: female
10. Name: Shreyars, Actual: male, Predicted: male


In [132]:
print("\nSample Predictions (Name | Actual | Predicted):")

for i in range(10):
    name, actual = test_data[i]          # name preserved
    features = gender_features(name)     # dict features

    # Vectorize (VERY IMPORTANT)
    features_vec = final_vectorizer.transform([features])

    # Predict
    predicted = final_classifier.predict(features_vec)[0]

    print(f"{i+1}. Name: {name}, Actual: {actual}, Predicted: {predicted}")



Sample Predictions (Name | Actual | Predicted):
1. Name: Praseeth, Actual: male, Predicted: male
2. Name: Swagat, Actual: male, Predicted: male
3. Name: Vallaki, Actual: male, Predicted: female
4. Name: Hemavarana, Actual: female, Predicted: female
5. Name: Saachi, Actual: female, Predicted: female
6. Name: Jeyachandraramanthanan, Actual: male, Predicted: male
7. Name: Kaarmuhil, Actual: male, Predicted: female
8. Name: Ayoti, Actual: female, Predicted: female
9. Name: Ibha, Actual: female, Predicted: female
10. Name: Shreyars, Actual: male, Predicted: male


In [133]:
from sklearn.metrics import accuracy_score

y_true = []
y_pred = []

for name, actual in test_data:
    features = gender_features(name)
    features_vec = final_vectorizer.transform([features])
    predicted = final_classifier.predict(features_vec)[0]

    y_true.append(actual)
    y_pred.append(predicted)

test_accuracy = accuracy_score(y_true, y_pred)
print("Test Accuracy:", test_accuracy)


Test Accuracy: 0.9347906632085958


Finally, we can examine the classifier to determine which features it found most effective for distinguishing the names' genders:

In [134]:
# Predict gender for a single name
def predict_gender(name):
    features = gender_features(name)
    features_vec = final_vectorizer.transform([features])
    return final_classifier.predict(features_vec)[0]


In [191]:
def predict_gender_with_confidence1(name):
    features = gender_features(name)
    features_vec = final_vectorizer.transform([features])

    probs = final_classifier.predict_proba(features_vec)[0]
    pred_index = probs.argmax()

    return {
        "name": name,
        "gender": final_classifier.classes_[pred_index],
        "confidence": float(probs[pred_index])
    }


In [192]:
predict_gender_with_confidence1("Ravi")

{'name': 'Ravi', 'gender': np.str_('female'), 'confidence': 0.9326654669756811}

In [136]:
from sklearn.metrics import accuracy_score
import pandas as pd

X_test = []
y_test = []
names = []

for name, features, actual_gender in test_set:
    names.append(name)
    X_test.append(features)
    y_test.append(actual_gender)

# Vectorize
X_test_vec = final_vectorizer.transform(X_test)

# Predictions
y_pred = final_classifier.predict(X_test_vec)
y_proba = final_classifier.predict_proba(X_test_vec)

# Max probability = confidence
confidences = y_proba.max(axis=1)

print("Test Accuracy:", accuracy_score(y_test, y_pred))


Test Accuracy: 0.9347906632085958


In [137]:
analysis_df = pd.DataFrame({
    "Name": names,
    "Actual": y_test,
    "Predicted": y_pred,
    "Confidence": confidences
})

analysis_df["Correct"] = analysis_df["Actual"] == analysis_df["Predicted"]

analysis_df.head()


Unnamed: 0,Name,Actual,Predicted,Confidence,Correct
0,Praseeth,male,male,0.994241,True
1,Swagat,male,male,0.999709,True
2,Vallaki,male,female,0.712298,False
3,Hemavarana,female,female,0.956156,True
4,Saachi,female,female,0.865725,True


In [138]:
high_conf_errors = analysis_df[
    (analysis_df["Correct"] == False) &
    (analysis_df["Confidence"] >= 0.80)
].sort_values(by="Confidence", ascending=False)

print("High-confidence errors:", len(high_conf_errors))
high_conf_errors.head(10)


High-confidence errors: 187


Unnamed: 0,Name,Actual,Predicted,Confidence,Correct
4802,Muthumurugan,female,male,0.999588,False
1256,Kamalesh,female,male,0.997919,False
4448,Aradhan,female,male,0.997879,False
85,Rajatheran,female,male,0.996531,False
2211,Sabrang,female,male,0.99639,False
2111,Charchika,male,female,0.996199,False
1320,Ishwin,female,male,0.994798,False
2536,Yohshini,male,female,0.993689,False
3436,Saon,female,male,0.993444,False
4489,Lavi,male,female,0.993044,False


In [139]:
low_conf_samples = analysis_df[
    analysis_df["Confidence"] <= 0.60
].sort_values(by="Confidence")

print("Low-confidence samples:", len(low_conf_samples))
low_conf_samples.head(10)


Low-confidence samples: 128


Unnamed: 0,Name,Actual,Predicted,Confidence,Correct
2188,Sangeet,female,male,0.500028,False
2400,Mehar,female,female,0.50039,True
3210,Latakara,female,female,0.500718,True
4076,Chirayu,male,male,0.502193,True
3424,Pushkal,male,male,0.503496,True
1603,Ijya,female,male,0.503792,False
5149,Nanthanu,male,female,0.504126,False
4945,Ribhya,female,female,0.505575,True
4615,Chankya,male,female,0.506103,False
450,Tavleen,female,female,0.508201,True


In [140]:
import pandas as pd

for enc in ["utf-8", "latin1", "cp1252"]:
    try:
        df = pd.read_csv("sample_indian_names2.csv", encoding=enc)
        print(f"Loaded successfully with encoding: {enc}")
        break
    except UnicodeDecodeError:
        continue


Loaded successfully with encoding: utf-8


In [141]:
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

split_ratio = 0.5
split_index = int(len(df) * split_ratio)

df_part1 = df.iloc[:split_index]
df_part2 = df.iloc[split_index:]

df_part1.to_csv("feedback_dataset1.csv", index=False)
df_part2.to_csv("feedback_dataset2.csv", index=False)


In [164]:
import pandas as pd

feedback_df = pd.read_csv("feedback_dataset1.csv")

# Convert numeric labels to string labels
feedback_df["Gender"] = feedback_df["Gender"].map({
    0: "female",
    1: "male"
})

# Safety check
print(feedback_df["Gender"].value_counts())

feedback_data = list(zip(feedback_df["Name"], feedback_df["Gender"]))


Gender
female    342
male      308
Name: count, dtype: int64


In [165]:
names = []
actuals = []
preds = []
confs = []

for name, actual in feedback_data:
    features = gender_features(name)
    vec = final_vectorizer.transform([features])

    proba = final_classifier.predict_proba(vec)[0]
    pred = final_classifier.classes_[proba.argmax()]
    conf = proba.max()

    names.append(name)
    actuals.append(actual)
    preds.append(pred)
    confs.append(conf)


In [166]:
analysis_df = pd.DataFrame({
    "Name": names,
    "Actual": actuals,
    "Predicted": preds,
    "Confidence": confs
})

analysis_df["Correct"] = analysis_df["Actual"] == analysis_df["Predicted"]

analysis_df.head()


Unnamed: 0,Name,Actual,Predicted,Confidence,Correct
0,Ravi,male,female,0.932665,False
1,Mrunal,female,female,0.757686,True
2,Muskan,female,male,0.868234,False
3,Bobby,male,female,0.864592,False
4,Monali,female,female,0.91797,True


In [167]:
CONF_THRESHOLD = 0.60

selected_feedback = analysis_df[
    (analysis_df["Correct"] == False) |
    (analysis_df["Confidence"] < CONF_THRESHOLD)
].copy()

print("Total feedback candidates:", len(selected_feedback))


Total feedback candidates: 115


In [193]:
def assign_weight(row):
    if not row["Correct"]:
        return 4.0          # incorrect → strongest signal
    else:
        return 1.5          # low-confidence but correct

selected_feedback["Weight"] = selected_feedback.apply(assign_weight, axis=1)

selected_feedback[["Name", "Actual", "Confidence", "Correct", "Weight"]].head()


Unnamed: 0,Name,Actual,Confidence,Correct,Weight
0,Ravi,male,0.932665,False,4.0
2,Muskan,female,0.868234,False,4.0
3,Bobby,male,0.864592,False,4.0
7,Chanchal,female,0.786615,False,4.0
14,Shanta,female,0.996213,False,4.0


In [194]:
feedback_training_data = [
    (row["Name"], row["Actual"], row["Weight"])
    for _, row in selected_feedback.iterrows()
]


In [195]:
base_training_data = [
    (name, gender, 1.0)
    for (name, gender) in labeled_names
]


In [196]:
combined_training_data = base_training_data + feedback_training_data

print("Base training samples:", len(base_training_data))
print("Feedback samples added:", len(feedback_training_data))
print("Total training samples:", len(combined_training_data))


Base training samples: 53982
Feedback samples added: 115
Total training samples: 54097


In [197]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

X_train = []
y_train = []
weights = []

for name, label, weight in combined_training_data:
    X_train.append(gender_features(name))
    y_train.append(label)
    weights.append(weight)

vectorizer_retrain = DictVectorizer(sparse=True)
X_train_vec = vectorizer_retrain.fit_transform(X_train)

retrained_model = LogisticRegression(
    max_iter=1000,
    solver="liblinear",
    class_weight="balanced"
)

retrained_model.fit(
    X_train_vec,
    y_train,
    sample_weight=weights
)


0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'liblinear'


In [198]:
from sklearn.metrics import accuracy_score

X_test = [features for _, features, _ in test_set]
y_test = [label for _, _, label in test_set]

X_test_vec = vectorizer_retrain.transform(X_test)
y_test_pred = retrained_model.predict(X_test_vec)

print(
    "Retrained model test accuracy:",
    accuracy_score(y_test, y_test_pred)
)


Retrained model test accuracy: 0.9644312708410523


In [199]:
def predict_gender1(name):
    features = gender_features(name)
    features_vec = vectorizer_retrain.transform([features])
    prediction = retrained_model.predict(features_vec)[0]
    return prediction


In [200]:
predict_gender1("Ravi")

np.str_('female')

In [201]:
def predict_gender_with_confidence(name):
    features = gender_features(name)
    features_vec = vectorizer_retrain.transform([features])

    probs = retrained_model.predict_proba(features_vec)[0]
    pred_index = probs.argmax()

    return {
        "name": name,
        "gender": retrained_model.classes_[pred_index],
        "confidence": float(probs[pred_index])
    }


In [202]:
predict_gender_with_confidence("Ravi")

{'name': 'Ravi', 'gender': np.str_('female'), 'confidence': 0.7352059404165119}