In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer        # TfidVectorizer
from sklearn.feature_extraction import DictVectorizer

##### Dataset downloaded from Kaggle : US Baby names

In [2]:
df = pd.read_csv("NationalNames.csv", usecols=['Id', 'Name', 'Gender'])
df.rename(columns={'Gender':'sex', 'Name':'name', 'Id':'index'}, inplace=True)
df.head()

Unnamed: 0,index,name,sex
0,1,Mary,F
1,2,Anna,F
2,3,Emma,F
3,4,Elizabeth,F
4,5,Minnie,F


In [3]:
# df.size
df.columns

Index(['index', 'name', 'sex'], dtype='object')

In [4]:
df.isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [5]:
# data.dtypes

print("Male : ",   df[df.sex == 'F'].size )
print("Female : ", df[df.sex == 'M'].size )

print("Total : ", df.size)

Male :  3245049
Female :  2231250
Total :  5476299


In [6]:
# creating new dataframe
df_names = df

# df_names.head()

In [7]:
df_names.sex.replace({'F':0, 'M':1}, inplace=True)     # replacing F:0 and M:1

In [8]:
df_names.sex.unique()

array([0, 1], dtype=int64)

In [9]:
df_names.dtypes

index     int64
name     object
sex       int64
dtype: object

#### Feature Extraction

In [10]:
Xfeatures = df_names['name']      # features

In [11]:
cv = CountVectorizer()          # counvectorizer
X = cv.fit_transform(Xfeatures)        # X - features

# cv.get_feature_names()

In [12]:
from sklearn.model_selection import train_test_split

X    # features
y = df_names.sex   # labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Naive Bayes Model

In [13]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
print(f"Accuracy is : {clf.score(X_test, y_test) : 0.2%}")

Accuracy is :  89.55%


#### Sample Prediction

In [15]:
sample_name = ['Minnie']                           # Emma , Anna
vect = cv.transform(sample_name).toarray()

In [16]:
# vect
clf.predict(vect)               # F:0 , M:1

array([0], dtype=int64)

In [17]:
# cv.inverse_transform([0])

In [18]:
sample_name2 = ['David']                  
vect2 = cv.transform(sample_name2).toarray()

clf.predict(vect2)               # F:0 , M:1

array([1], dtype=int64)

### Function for Prediction 

In [25]:
def genderpredictor(x):
    test_name = [x]
    vect = cv.transform(test_name).toarray()
    result = clf.predict(vect)
    if result == 0:
        print("Female")
    else:
        print("Male")

In [34]:
genderpredictor("Maria")

Female


## -----------------------------------

### Save the model

In [21]:
from sklearn.externals import joblib

In [22]:
nbmodel = open('naivebayesmodel.pkl', 'wb')          # naive bayes model

joblib.dump(clf, nbmodel)

nbmodel.close()