In [25]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [26]:
def load_data(encoding='utf-8', path=r"D:\Downloads\Gender_Detection (1).csv"):
    """Load names and gender labels from a CSV file, retrying fallback encodings.

    The file is first read with ``encoding``. If that raises
    ``UnicodeDecodeError``, each fallback encoding that has not already been
    tried is attempted in order until one succeeds.

    Parameters
    ----------
    encoding : str, default 'utf-8'
        Encoding to try first.
    path : str or path-like
        CSV file to read; defaults to the location this notebook originally
        hard-coded. The file must contain ``Name`` and ``Gender`` columns.

    Returns
    -------
    tuple of (list, list)
        ``(X, y)`` where ``X`` is the ``Name`` column and ``y`` is the
        ``Gender`` column, both converted to plain lists.

    Raises
    ------
    ValueError
        If none of the candidate encodings can decode the file.
    KeyError
        If the ``Name`` or ``Gender`` column is missing.
    """
    import codecs

    def _canonical(name):
        # Normalise aliases ('latin-1' and 'ISO-8859-1' are the same codec) so
        # an encoding is never attempted twice. A falsy value is treated as
        # UTF-8, which is the documented pandas default for ``encoding=None``.
        return codecs.lookup(name or 'utf-8').name

    def _read(enc):
        # Single place that reads the file and extracts the two columns.
        data = pd.read_csv(path, encoding=enc)
        return data['Name'].tolist(), data['Gender'].tolist()

    try:
        return _read(encoding)
    except UnicodeDecodeError:
        print(f"UnicodeDecodeError: Failed to read the file with {encoding} encoding.")
        print("Trying another encoding...")

    tried = {_canonical(encoding)}
    for enc in ('utf-8', 'latin-1', 'ISO-8859-1'):
        canonical = _canonical(enc)
        if canonical in tried:
            # Already failed with this codec (possibly under another alias).
            continue
        tried.add(canonical)
        try:
            X, y = _read(enc)
        except UnicodeDecodeError:
            print(f"Failed to read the file with {enc} encoding.")
            continue
        print(f"Successfully read the file with {enc} encoding.")
        return X, y

    raise ValueError("Could not read the file with any encoding. Check the file's encoding format.")


In [27]:
def train_model(X, y):
    """Fit a character n-gram Naive Bayes classifier on the given samples.

    Parameters
    ----------
    X : iterable of str
        Raw text samples to learn from.
    y : iterable
        Target label for each sample in ``X``.

    Returns
    -------
    tuple
        ``(model, vectorizer)``: the fitted ``MultinomialNB`` and the fitted
        ``CountVectorizer`` that must be reused to transform new samples.
    """
    # Count character bigrams and trigrams in each sample.
    char_vectorizer = CountVectorizer(ngram_range=(2, 3), analyzer='char')
    ngram_counts = char_vectorizer.fit_transform(X)

    classifier = MultinomialNB()
    classifier.fit(ngram_counts, y)
    return classifier, char_vectorizer

In [28]:
def predict_name_gender(name, model, vectorizer):
    """Return the label ``model`` predicts for a single ``name``.

    Parameters
    ----------
    name : str
        The name to classify.
    model : object
        Fitted estimator exposing ``predict``.
    vectorizer : object
        Fitted vectorizer exposing ``transform``.

    Returns
    -------
    The first (and only) element of the model's prediction for ``name``.
    """
    # transform() is given a one-element batch, so wrap the name in a list.
    features = vectorizer.transform([name])
    return model.predict(features)[0]

In [29]:
if __name__ == "__main__":
    # Read the names and their labels from the CSV file.
    X, y = load_data()

    # Hold out 20% of the samples for evaluation; the fixed random_state keeps
    # the split identical from run to run.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        random_state=42,
        test_size=0.2,
    )

    # Fit on the training portion only. The later cells reuse `model`,
    # `vectorizer`, `X_test` and `y_test` defined here.
    model, vectorizer = train_model(X_train, y_train)

UnicodeDecodeError: Failed to read the file with utf-8 encoding.
Trying another encoding...
Failed to read the file with utf-8 encoding.
Successfully read the file with latin-1 encoding.


In [30]:
# Evaluate on the held-out split. The vectorizer is only applied here
# (transform, not fit_transform), so the test samples are encoded with the
# vocabulary learned during training.
X_test_vectorized = vectorizer.transform(X_test)
y_pred = model.predict(X_test_vectorized)
# Fraction of held-out samples whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.71


In [31]:
# Spot-check the trained classifier on a few hand-picked names. The last entry
# is lower-case, unlike the others.
names_to_test = ["John", "Emily", "Michael", "Sophia", "Alex", "james"]
for name in names_to_test:
    # Each name is classified individually with the fitted model/vectorizer pair.
    predicted_gender = predict_name_gender(name, model, vectorizer)
    print(f"The name {name} is predicted to be {predicted_gender}")


The name John is predicted to be M
The name Emily is predicted to be F
The name Michael is predicted to be M
The name Sophia is predicted to be F
The name Alex is predicted to be F
The name james is predicted to be M
