In [1344]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [1345]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

### Supervised Classification

#### Gender Identification

In [1346]:
from nltk.corpus import names

In [1347]:
male_names = names.words('male.txt')
male_names[:5]

[u'Aamir', u'Aaron', u'Abbey', u'Abbie', u'Abbot']

In [1348]:
female_names = names.words('female.txt')
female_names[:5]

[u'Abagael', u'Abagail', u'Abbe', u'Abbey', u'Abbi']

In [1349]:
df_male = pd.DataFrame({'name':male_names})
df_male.head()

Unnamed: 0,name
0,Aamir
1,Aaron
2,Abbey
3,Abbie
4,Abbot


In [1350]:
df_female = pd.DataFrame({'name':female_names})
df_female.head()

Unnamed: 0,name
0,Abagael
1,Abagail
2,Abbe
3,Abbey
4,Abbi


In [1351]:
df_male['last_letter'] = df_male['name'].map(lambda x:x[-1])
df_female['last_letter'] = df_female['name'].map(lambda x:x[-1])

In [1352]:
# get the last letter of each name

df_male.head()

Unnamed: 0,name,last_letter
0,Aamir,r
1,Aaron,n
2,Abbey,y
3,Abbie,e
4,Abbot,t


In [1353]:
df_female.head()

Unnamed: 0,name,last_letter
0,Abagael,l
1,Abagail,l
2,Abbe,e
3,Abbey,y
4,Abbi,i


In [1354]:
# add label 'male' to all the names

df_male['gender'] = 'male'
df_male.head()

Unnamed: 0,name,last_letter,gender
0,Aamir,r,male
1,Aaron,n,male
2,Abbey,y,male
3,Abbie,e,male
4,Abbot,t,male


In [1355]:
# add label 'female' to all the names

df_female['gender'] = 'female'
df_female.head()

Unnamed: 0,name,last_letter,gender
0,Abagael,l,female
1,Abagail,l,female
2,Abbe,e,female
3,Abbey,y,female
4,Abbi,i,female


#### Concat two dataframes

In [1356]:
df = df_male.append(df_female)

##### shuffle the names

In [1357]:
# https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows

In [1358]:
# frac=1 means return all rows in random order
# .reset_index(drop=True) prevents creating a new column that contains the old df's index

df = df.sample(frac=1).reset_index(drop=True)
df['gender'] = df['gender']# .map({'male':1}).astype(float)
df = df.drop('name', axis=1)



In [1359]:
df.head()

Unnamed: 0,last_letter,gender
0,t,male
1,e,male
2,e,male
3,y,male
4,n,male


#### Pay attention here: you need to convert string columns to numeric, otherwise you will get error "could not convert string to float" when fitting models

In [1360]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

df['last_letter'] = label_encoder.fit_transform(df['last_letter'])

# for col in df.columns:
#     df[col]= label_encoder.fit_transform(df[col])
df.head()

Unnamed: 0,last_letter,gender
0,19,male
1,5,male
2,5,male
3,24,male
4,14,male


#### One Hot Encoder to convert categorical feature to numeric

In [1361]:
# from sklearn.preprocessing import OneHotEncoder
# one_hot_encoder = OneHotEncoder()

# for col in df.columns:
#     df[col]= one_hot_encoder.fit(df[col])
# df.head()

In [1362]:
# number of rows

df.shape[0]

7944

#### Train Test Split

#### !!! you need to tranform X_train and X_test to dataframe, otherwise you will see error 

"Found arrays with inconsistent numbers of samples when calling " when fitting models

In [1363]:
from sklearn.model_selection import train_test_split

df_data = df['last_letter'].to_frame()
df_target = df['gender']

X_train, X_test, Y_train, Y_test = train_test_split(df_data, df_target, train_size = 0.8)

#### Decision Tree

In [1364]:
from sklearn.tree import DecisionTreeClassifier

In [1365]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((6355, 1), (6355,), (1589, 1), (1589,))

In [1366]:
decision_tree_model = GaussianNB()
decision_tree_model.fit(X_train, Y_train)

GaussianNB(priors=None)

In [1367]:
Y_predict = decision_tree_model.predict(X_test)

#### Accuracy score

In [1368]:
from sklearn.metrics import accuracy_score

In [1369]:
accuracy_score(Y_test, Y_predict)

0.6859660163624921

#### Make a function that use a prediction model and produce accuracy score

In [1370]:
def predict_and_measure(X_train, Y_train, X_test, Y_test, model):
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)
    return accuracy_score(Y_test, Y_predict)

#### Logistic Regression

In [1371]:
lr_model = LogisticRegression()

predict_and_measure(X_train, Y_train, X_test, Y_test, lr_model)

0.66834487098804285

#### Support Vector Machine

In [1372]:
svm_model = SVC()

predict_and_measure(X_train, Y_train, X_test, Y_test, svm_model)

0.74260541220893639

#### Stochastic Gradient Descent

In [1373]:
sgd_model = SGDClassifier()

predict_and_measure(X_train, Y_train, X_test, Y_test, sgd_model)

0.62492133417243545

#### Random Forest

In [1374]:
rf_model = RandomForestClassifier(n_estimators=10)

predict_and_measure(X_train, Y_train, X_test, Y_test, rf_model)

0.73694147262429199

In [1375]:
rf_model.predict(7)

array(['male'], dtype=object)

#### To use string as input to predict gender, you need to use LabelEncoder()'s tranform() method to convert it back:

https://stackoverflow.com/questions/44748410/labelencoder-reverse-and-use-categorical-data-on-model/44750604#44750604

In [1376]:
x = ['c']
x_encoded = label_encoder.transform(x)
x_encoded


array([3])

In [1377]:
rf_model.predict(x_encoded)



array(['male'], dtype=object)

#### K Fold Cross Validations

In [1378]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [1383]:
rf_model_cv = RandomForestClassifier(n_estimators=20)
k_fold = KFold(10)

kfold_score = cross_val_score(rf_model_cv, df_data, df_target, cv=k_fold, n_jobs=1)
kfold_score

array([ 0.75220126,  0.77106918,  0.75220126,  0.73081761,  0.75314861,
        0.78967254,  0.74559194,  0.76070529,  0.76322418,  0.77581864])

In [1384]:
kfold_score.mean()

0.75944505172441112