In [136]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [137]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

### Supervised Classification

#### Gender Identification

In [138]:
from nltk.corpus import names

In [139]:
male_names = names.words('male.txt')
male_names[:5]

[u'Aamir', u'Aaron', u'Abbey', u'Abbie', u'Abbot']

In [140]:
female_names = names.words('female.txt')
female_names[:5]

[u'Abagael', u'Abagail', u'Abbe', u'Abbey', u'Abbi']

In [141]:
df_male = pd.DataFrame({'name':male_names})
df_male.head()

Unnamed: 0,name
0,Aamir
1,Aaron
2,Abbey
3,Abbie
4,Abbot


In [142]:
df_female = pd.DataFrame({'name':female_names})
df_female.head()

Unnamed: 0,name
0,Abagael
1,Abagail
2,Abbe
3,Abbey
4,Abbi


In [143]:
df_male['last_letter'] = df_male['name'].map(lambda x:x[-1])
df_female['last_letter'] = df_female['name'].map(lambda x:x[-1])

In [144]:
# get the last letter of each name

df_male.head()

Unnamed: 0,name,last_letter
0,Aamir,r
1,Aaron,n
2,Abbey,y
3,Abbie,e
4,Abbot,t


In [145]:
df_female.head()

Unnamed: 0,name,last_letter
0,Abagael,l
1,Abagail,l
2,Abbe,e
3,Abbey,y
4,Abbi,i


In [146]:
# add label 'male' to all the names

df_male['gender'] = 'male'
df_male.head()

Unnamed: 0,name,last_letter,gender
0,Aamir,r,male
1,Aaron,n,male
2,Abbey,y,male
3,Abbie,e,male
4,Abbot,t,male


In [147]:
# add label 'female' to all the names

df_female['gender'] = 'female'
df_female.head()

Unnamed: 0,name,last_letter,gender
0,Abagael,l,female
1,Abagail,l,female
2,Abbe,e,female
3,Abbey,y,female
4,Abbi,i,female


#### Concat two dataframes

In [148]:
df = df_male.append(df_female)

##### shuffle the names

In [149]:
# https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows

In [150]:
# frac=1 means return all rows in random order
# .reset_index(drop=True) prevents creating a new column that contains the old df's index

df = df.sample(frac=1).reset_index(drop=True)
df['gender'] = df['gender']# .map({'male':1}).astype(float)
df = df.drop('name', axis=1)



In [151]:
df.head()

Unnamed: 0,last_letter,gender
0,a,female
1,o,female
2,e,male
3,l,female
4,d,male


#### Pay attention here: you need to convert string columns to numeric, otherwise you will get error "could not convert string to float" when fitting models

In [152]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

df['last_letter'] = label_encoder.fit_transform(df['last_letter'])

# for col in df.columns:
#     df[col]= label_encoder.fit_transform(df[col])
df.head()

Unnamed: 0,last_letter,gender
0,1,female
1,15,female
2,5,male
3,12,female
4,4,male


#### One Hot Encoder to convert categorical feature to numeric

In [153]:
# from sklearn.preprocessing import OneHotEncoder
# one_hot_encoder = OneHotEncoder()

# for col in df.columns:
#     df[col]= one_hot_encoder.fit(df[col])
# df.head()

In [154]:
# number of rows

df.shape[0]

7944

#### Train Test Split

#### !!! you need to tranform X_train and X_test to dataframe, otherwise you will see error 

"Found arrays with inconsistent numbers of samples when calling " when fitting models

In [155]:
from sklearn.model_selection import train_test_split

df_data = df['last_letter'].to_frame()
df_target = df['gender']

X_train, X_test, Y_train, Y_test = train_test_split(df_data, df_target, train_size = 0.8)

#### Decision Tree

In [156]:
from sklearn.tree import DecisionTreeClassifier

In [157]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((6355, 1), (6355,), (1589, 1), (1589,))

In [158]:
decision_tree_model = GaussianNB()
decision_tree_model.fit(X_train, Y_train)

GaussianNB(priors=None)

In [159]:
Y_predict = decision_tree_model.predict(X_test)

#### Accuracy score

In [160]:
from sklearn.metrics import accuracy_score

In [161]:
accuracy_score(Y_test, Y_predict)

0.67023285084959094

#### Make a function that use a prediction model and produce accuracy score

In [162]:
def predict_and_measure(X_train, Y_train, X_test, Y_test, model):
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)
    return accuracy_score(Y_test, Y_predict)

#### Logistic Regression

In [163]:
lr_model = LogisticRegression()

predict_and_measure(X_train, Y_train, X_test, Y_test, lr_model)

0.65198237885462551

#### Support Vector Machine

In [164]:
svm_model = SVC()

predict_and_measure(X_train, Y_train, X_test, Y_test, svm_model)

0.76463184392699812

#### Stochastic Gradient Descent

In [165]:
sgd_model = SGDClassifier()

predict_and_measure(X_train, Y_train, X_test, Y_test, sgd_model)

0.62555066079295152

#### Random Forest

In [166]:
rf_model = RandomForestClassifier(n_estimators=10)

predict_and_measure(X_train, Y_train, X_test, Y_test, rf_model)

0.76463184392699812

In [167]:
rf_model.predict(7)

array(['male'], dtype=object)

#### To use string as input to predict gender, you need to use LabelEncoder()'s tranform() method to convert it back:

https://stackoverflow.com/questions/44748410/labelencoder-reverse-and-use-categorical-data-on-model/44750604#44750604

In [168]:
x = ['c']
x_encoded = label_encoder.transform(x)
x_encoded


array([3])

In [169]:
rf_model.predict(x_encoded)



array(['male'], dtype=object)

#### K Fold Cross Validations

In [170]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [171]:
rf_model_cv = RandomForestClassifier(n_estimators=20)
k_fold = KFold(10)

kfold_score = cross_val_score(rf_model_cv, df_data, df_target, cv=k_fold, n_jobs=5)
kfold_score

array([ 0.73836478,  0.76603774,  0.78113208,  0.75597484,  0.7581864 ,
        0.76070529,  0.76700252,  0.76826196,  0.76070529,  0.76196474])

In [172]:
kfold_score.mean()

0.76183356304358152

## Exercises

### 

In [173]:
from nltk.corpus import names

In [174]:
name_gender = ([(name, 'male') for name in names.words('male.txt')]
              +[(name, 'female') for name in names.words('female.txt')])

#### Use shuffle in random module to randomize the tuple list

In [175]:
import random
random.shuffle(name_gender)
name_gender[:5]

[(u'Elvis', 'male'),
 (u'Tanner', 'male'),
 (u'Maryellen', 'female'),
 (u'Nariko', 'female'),
 (u'Virgil', 'male')]

In [176]:
def last_letter(name):
    return name[-1:]

last_letter('alex')

'x'

In [177]:
feature_set = [(last_letter(name), gender) for (name, gender) in name_gender]
feature_set[:5]

[(u's', 'male'),
 (u'r', 'male'),
 (u'n', 'female'),
 (u'o', 'female'),
 (u'l', 'male')]

In [178]:
len(feature_set)

7944

#### Split the training and testing datasets

In [179]:
train_set = feature_set[:int(len(feature_set)*0.8)]
test_set = feature_set[int(len(feature_set)*0.8):]

In [180]:
train_set[:5]

[(u's', 'male'),
 (u'r', 'male'),
 (u'n', 'female'),
 (u'o', 'female'),
 (u'l', 'male')]

#### Use the nltk default classifier

In [183]:
import nltk
from nltk import *

clf = nltk.NaiveBayesClassifier.train(train_set)