In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df1 = pd.read_csv('base_generos.csv')

In [3]:
df1.columns = ['first_name', 'name', 'sex', 'frequency_female',
       'frequency_male', 'frequency_total', 'frequency_group', 'ratio',
       'alternative_names']

In [4]:
df1.sex.unique()

array(['M', 'F'], dtype=object)

In [5]:
np.shape(df1)

(100787, 9)

In [6]:
# Load our data
df2 = pd.read_csv('names_dataset.csv')

In [7]:
df2.sex.unique()

array(['F', 'M'], dtype=object)

In [8]:
df = pd.concat([df1[['name','sex']], df2], ignore_index=True)

In [9]:
df

Unnamed: 0,name,sex,index
0,ABRAAO,M,
1,ADRIANA,F,
2,ADRIANO,M,
3,AILTON,M,
4,ALAN,M,
...,...,...,...
195807,Zecharya,M,1858664.0
195808,Ziheng,M,1858676.0
195809,Ziyu,M,1858679.0
195810,Zykir,M,1858686.0


In [10]:
# Data Cleaning
# Checking for column name consistency
df.columns

Index(['name', 'sex', 'index'], dtype='object')

In [11]:
# Data Types
df.dtypes

name      object
sex       object
index    float64
dtype: object

In [12]:
# Checking for Missing Values
df.isnull().isnull().sum()

name     0
sex      0
index    0
dtype: int64

In [13]:
# Number of Female Names
df[df.sex == 'F'].size

347550

In [14]:
# Number of Male Names
df[df.sex == 'M'].size

239886

In [15]:
df_names = df

In [16]:
# Replacing All F and M with 0 and 1 respectively
df_names.sex.replace({'F':0,'M':1},inplace=True)

In [17]:
df_names.sex.unique()

array([1, 0], dtype=int64)

In [18]:
df_names.dtypes

name      object
sex        int64
index    float64
dtype: object

In [19]:
Xfeatures =df_names['name']

In [41]:
# Feature Extraction 
cv = TfidfVectorizer()
X = cv.fit_transform(Xfeatures)

In [21]:
cv.get_feature_names()

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aada',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafia',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahir',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
y = df_names.sex

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
clf = DecisionTreeClassifier().fit(X_train, y_train)

In [28]:
clf.score(X_test, y_test)

0.7245349592992665

In [50]:
vect = cv.transform(['Vinicius'])

In [51]:
vect

<1x137913 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [52]:
if clf.predict(vect)[0] == 1:
    print('M')
if clf.predict(vect)[0] == 0:
    print('F')

M


In [57]:
from sklearn.externals import joblib

In [58]:
decisiontreModel = open("decisiontreemodel.pkl","wb")

In [59]:
joblib.dump(dclf,decisiontreModel)

In [60]:
decisiontreModel.close

<function BufferedWriter.close>

In [76]:
#Alternative to Model Saving
import pickle
dctreeModel = open('gitlab/modelo',"wb")

In [78]:
dctreeModel.close()

In [54]:
import pandas as pd

In [91]:
df = pd.read_csv('basegeneroteste.csv')

In [90]:
df.reset_index()

Unnamed: 0,index,plan_uuid,first_name,created_dt,sex
0,0,000368d5-8622-4a30-8497-7c7b81d0297c,JOSEILDO,2020-08-14 16:11:12,M
1,1,0004f088-6f70-4a19-b618-5df88c7fd65e,EMMANUEL,2019-12-11 02:50:16,M
2,2,0005001f-c8fc-4f02-a3b6-b65397d4376a,FRANCIANE,2020-02-04 21:20:58,F
3,3,000b2cd1-29e5-4d1e-a2fc-1499131d0d38,KATIA,2021-04-05 13:33:57,F
4,4,0017a4b1-ae59-4af5-bb76-cb019264aaf4,MATHEUS,2021-04-28 21:46:33,M
...,...,...,...,...,...
17626,17626,ffe4ff40-98cf-4ec7-9902-9534cea138ab,FABIO,2020-04-28 10:33:11,M
17627,17627,ffe558f0-b71b-4f2e-99eb-fb2835cd8809,MARIANE,2019-12-22 12:23:04,F
17628,17628,ffe7783a-7bf6-457d-9fc9-7a4d1698a45b,WILSON,2019-12-01 11:22:33,M
17629,17629,ffeb9662-ca24-4a36-98a7-1b29c8db4bc0,FLAVIA,2021-03-24 15:06:01,F


In [86]:
df.to_csv('basegeneroteste.csv',index=False)