In [357]:
import pandas as pd
from pandas import Series, DataFrame

### Supervised Classification

#### Gender Identification

In [358]:
from nltk.corpus import names

In [359]:
male_names = names.words('male.txt')
male_names[:5]

[u'Aamir', u'Aaron', u'Abbey', u'Abbie', u'Abbot']

In [360]:
female_names = names.words('female.txt')
female_names[:5]

[u'Abagael', u'Abagail', u'Abbe', u'Abbey', u'Abbi']

In [361]:
df_male = pd.DataFrame({'name':male_names})
df_male.head()

Unnamed: 0,name
0,Aamir
1,Aaron
2,Abbey
3,Abbie
4,Abbot


In [362]:
df_female = pd.DataFrame({'name':female_names})
df_female.head()

Unnamed: 0,name
0,Abagael
1,Abagail
2,Abbe
3,Abbey
4,Abbi


In [363]:
df_male['last_letter'] = df_male['name'].map(lambda x:x[-1])
df_female['last_letter'] = df_female['name'].map(lambda x:x[-1])

In [364]:
# get the last letter of each name

df_male.head()

Unnamed: 0,name,last_letter
0,Aamir,r
1,Aaron,n
2,Abbey,y
3,Abbie,e
4,Abbot,t


In [365]:
df_female.head()

Unnamed: 0,name,last_letter
0,Abagael,l
1,Abagail,l
2,Abbe,e
3,Abbey,y
4,Abbi,i


In [366]:
# add label 'male' to all the names

df_male['gender'] = 'male'
df_male.head()

Unnamed: 0,name,last_letter,gender
0,Aamir,r,male
1,Aaron,n,male
2,Abbey,y,male
3,Abbie,e,male
4,Abbot,t,male


In [367]:
# add label 'female' to all the names

df_female['gender'] = 'female'
df_female.head()

Unnamed: 0,name,last_letter,gender
0,Abagael,l,female
1,Abagail,l,female
2,Abbe,e,female
3,Abbey,y,female
4,Abbi,i,female


#### Concat two dataframes

In [368]:
df = df_male.append(df_female)

##### shuffle the names

In [369]:
# https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows

In [370]:
# frac=1 means return all rows in random order
# .reset_index(drop=True) prevents creating a new column that contains the old df's index

df = df.sample(frac=1).reset_index(drop=True)
df['gender'] = df['gender']# .map({'male':1}).astype(float)
df = df.drop('name', axis=1)



In [371]:
df.head()

Unnamed: 0,last_letter,gender
0,a,female
1,n,female
2,l,male
3,l,female
4,a,female


#### Pay attention here: you need to convert string columns to numeric, otherwise you will get error "could not convert string to float" when fitting models

In [372]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for col in df.columns:
    df[col] = label_encoder.fit_transform(df[col])
df.head()

Unnamed: 0,last_letter,gender
0,1,0
1,14,0
2,12,1
3,12,0
4,1,0


In [373]:
# number of rows

df.shape[0]

7944

In [374]:
train_size = int(df.shape[0] * 0.8)
df_train = df[:train_size]
df_test = df[train_size:]
df_train.shape[0], df_test.shape[0]

(6355, 1589)

#### Decision Tree

In [375]:
from sklearn.tree import DecisionTreeClassifier

In [376]:
X_train = df_train.drop('gender', axis=1)
Y_train = df_train['gender']
X_test = df_test.drop('gender', axis=1)
Y_test = df_test['gender']
X_train.shape, Y_train.shape, X_test.shape

((6355, 1), (6355,), (1589, 1))

In [377]:
decision_tree_model = GaussianNB()
decision_tree_model.fit(X_train, Y_train)

GaussianNB(priors=None)

In [378]:
Y_predict = decision_tree_model.predict(X_test)

#### Accuracy score

In [379]:
from sklearn.metrics import accuracy_score

In [380]:
accuracy_score(Y_test, Y_predict)

0.68785399622404031