In [79]:
#Importing modules and dataset
import pandas as pd
import numpy as np
import random
from nltk import NaiveBayesClassifier,classify
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
# Read the male and female name lists (in that order); both files carry a header row.
dfM, dfF = (pd.read_csv(path, header=0) for path in ('male', 'female'))

In [80]:
#Completely removing samples that do not provide a first name
# (rows are identified by their index label in each raw frame)
maleRowsWithoutFirstName = [2016,3623,12070,6385,7797,13494,10909,14196]
femaleRowsWithoutFirstName = [4419,1406,3570,10347,2861,14336]
dfM.drop(index=maleRowsWithoutFirstName, inplace=True)
dfF.drop(index=femaleRowsWithoutFirstName, inplace=True)

#Combining male and female dataset and shuffling the rows
# NOTE(review): no seed is set in this cell, so the row order presumably
# differs between runs unless numpy is seeded elsewhere.
stacked = pd.concat([dfM, dfF], axis=0)
combined = stacked.iloc[np.random.permutation(len(stacked))].reset_index(drop=True)

In [81]:
#Wrangling with the dataset

##Dropping the unnecessary extra column, then rows with missing values,
##then duplicate (name, gender) rows
##Dataset almost reduces to half when duplicate rows are removed
combined = (
    combined
    .drop(columns='race')
    .dropna(axis=0)
    .drop_duplicates(subset=['name', 'gender'])
    .reset_index(drop=True)
)

## Removing @ and keeping first part
## (when there is no '@' the whole name is the "first part")
combined.name = combined.name.map(lambda name: name.partition('@')[0].strip())


##Removing titles ( they appear before dot)
def _dropDottedTitle(name):
    """Return the text between the first and second dot when a short title precedes the first dot.

    A name is treated as title-prefixed when it contains a dot and the text
    before the first dot is at most 5 characters long (after stripping).
    Otherwise the stripped name is returned unchanged.
    """
    pieces = name.split('.')
    if len(pieces) > 1 and len(pieces[0].strip()) <= 5:
        return pieces[1].strip()
    return name.strip()

combined.name = combined.name.map(_dropDottedTitle)

##Removing titles (if they don't have dot)
def checkTitle1(name):
    """Tell whether ``name`` begins with one of the known title prefixes.

    Most prefixes end in a single space; 'smt-' and 'smt,' do not, so they
    also match names where the title is glued to the following word.

    Returns:
        bool: True if ``name`` starts with any listed prefix, else False.
    """
    prefixes = ('j ','dr ','ku ','ku- ','k ','km ','km- ','kum ','km0 ','sant ','st ','mo ','gen ','smt ','ms ','mis ','shri ','sri ','sh ','shi ','p ','md ','gd ','m ','sk ','so ','mohd ','mho ','dd ','ed ','ct ','na ', 'miss ', 'lc ', 'smt- ', 'smts ','smt-', 'smt,','1-smt ','mo- ','gs-1957975 ','mrs ','shrimati ','a ','b ')
    return name.startswith(prefixes)
def checkTitle2(name):
    """Tell whether ``name`` begins with a title followed by two spaces.

    Returns:
        bool: True if ``name`` starts with any listed double-space prefix.
    """
    prefixes = ('ku  ','kum  ','shri  ','md  ','mohd  ','smt  ','km  ')
    return name.startswith(prefixes)
def checkTitle3(name):
    """Tell whether ``name`` begins with one of the two-token title prefixes.

    Returns:
        bool: True if ``name`` starts with 's p ', 'kum a ' or 'ct b '.
    """
    prefixes = ('s p ','kum a ','ct b ')
    return name.startswith(prefixes)
def _stripLeadingTitle(name):
    """Remove a leading dot-less title from ``name``.

    The checks run in the same priority order as before:
    double-space titles (checkTitle2) -> keep the piece after the first '  ';
    two-token titles (checkTitle3)    -> keep the third space-separated token;
    single titles (checkTitle1)       -> keep the second space-separated token.

    checkTitle1 also accepts prefixes without a trailing space ('smt-', 'smt,'),
    so a single-token name such as 'smt-rekha' has no second token. In that case
    the stripped name is returned as-is instead of raising IndexError.
    """
    if checkTitle2(name):
        pieces, wanted = name.split('  '), 1
    elif checkTitle3(name):
        pieces, wanted = name.split(' '), 2
    elif checkTitle1(name):
        pieces, wanted = name.split(' '), 1
    else:
        return name.strip()
    if len(pieces) <= wanted:
        return name.strip()
    return pieces[wanted].strip()


def _isAscii(name):
    """Return True when every character of ``name`` can be encoded as ASCII."""
    try:
        name.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True


combined.name = combined.name.map(_stripLeadingTitle)


##Extracting first name
combined.name = combined.name.map(lambda name: name.split()[0] if ' ' in name else name)


##Removing any special character associated with the first name
combined.name = combined.name.map(lambda name: ''.join(filter(str.isalpha, name)))

## Again Removing NAN rows, plus names that became empty during cleaning
## (dropna alone keeps '' and an empty name cannot be featurised later)
combined = combined.dropna(axis=0)
combined = combined[combined.name != '']

##Removing non english rows
## (single boolean mask instead of dropping row by row while iterating;
##  astype(bool) keeps the mask boolean even when the frame is empty)
combined = combined[combined.name.map(_isAscii).astype(bool)]

##Again removing duplicate rows
combined = combined.drop_duplicates(subset = ['name','gender']).reset_index(drop = True)

## copying data to csv file
combined.to_csv('CleanedData.csv', index=False)

In [82]:
#Feature Extraction
#Last Character
# combined['LastChar'] = combined.name.map(lambda name: name[-1])

# #Is last character A or E or I or Y
# combined['IsLastCharAEIY'] = combined.LastChar.map(lambda name: 1 if (name=='e' or name=='i' or name=='y' or name =='a') else 0)

# #Second Last Character
# combined['SecLastChar'] = combined.name.map(lambda name: name[-2])

# #Is second Last character A or E or I
# combined['IsSecLastCharAEIY'] = combined.SecLastChar.map(lambda name: 1 if (name=='e' or name=='i' or name =='a') else 0)

#No. of vowels
def countVowels(string):
    """Count the lowercase vowels ('a', 'e', 'i', 'o', 'u') in ``string``.

    Uppercase vowels are not counted.

    Returns:
        int: number of matching characters (0 for an empty string).
    """
    return sum(1 for letter in string if letter in "aeiou")
# Per-name vowel count stored as a new column
combined['NoOfVowels'] = combined.name.map(countVowels)

#Total Number of 'e', 'i' in the name
def countEI(string):
    """Count how many characters of ``string`` are a lowercase 'e' or 'i'.

    Returns:
        int: number of matching characters (0 for an empty string).
    """
    return sum(1 for letter in string if letter in "ei")
# Per-name count of 'e'/'i' stored as a new column
combined['NumEI'] = combined.name.map(countEI)

# Length of the name:
# combined['Length'] = combined.name.map(lambda name: len(name))

In [58]:
#Feature Visualization
#total no. of female names = 3297
#total no. of male names = 4024

##Last character counts
# print(combined[combined['gender']=='f'].LastChar.value_counts())
# print(combined[combined['gender']=='m'].LastChar.value_counts())

##Second Last Character Counts
# print(combined[combined['gender']=='f'].SecLastChar.value_counts())
# print(combined[combined['gender']=='m'].SecLastChar.value_counts())
# combined.info()

In [85]:
def _nameFeatures(name):
    """Build the feature dictionary for a single first name.

    The name is lower-cased first. Names shorter than two characters are
    handled without raising: missing characters yield '' for the character
    features and False for the membership flags.

    Args:
        name (str): the first name to featurise.

    Returns:
        dict: keys 'lastChar', 'lastTwoChar', 'isLastAEIY', 'isSecLastAEI',
        'NoOfVowels', 'NumEI', 'length' and 'firstChar'.
    """
    name = name.lower()
    # Slices never raise, unlike name[-1] / name[-2] / name[0] on short names.
    lastChar = name[-1:]
    secondLastChar = name[-2:-1]
    return {
        'lastChar': lastChar,
        'lastTwoChar': name[-2:],
        # '' in 'aeiy' is True in Python, so emptiness must be ruled out first.
        'isLastAEIY': bool(lastChar) and lastChar in 'aeiy',
        'isSecLastAEI': bool(secondLastChar) and secondLastChar in 'aei',
        'NoOfVowels': countVowels(name),
        'NumEI': countEI(name),
        'length': len(name),
        'firstChar': name[:1],
    }
def extractFeatures(dataframe):
    """Turn every row of ``dataframe`` into a ``(features, label)`` pair.

    Args:
        dataframe: frame with a 'name' and a 'gender' column.

    Returns:
        list: one ``(_nameFeatures(name), gender)`` tuple per row, in row order.
    """
    return [
        (_nameFeatures(record['name']), record['gender'])
        for _, record in dataframe.iterrows()
    ]

def TrainAndTestNB(dataframe):
    """Train an NLTK Naive Bayes classifier on name features and report its accuracy.

    The feature set is shuffled with the global ``random`` generator, the
    first 80% is used for training and the remaining 20% for testing.
    Prints the test accuracy followed by the 50 most informative features.

    Args:
        dataframe: frame with a 'name' and a 'gender' column.
    """
    featureSet = extractFeatures(dataframe)
    random.shuffle(featureSet)
    cut = int(len(featureSet) * 0.80)
    trainSet, testSet = featureSet[:cut], featureSet[cut:]
    classifier = NaiveBayesClassifier.train(trainSet)
    print('Testing Accuracy: {} '.format(classify.accuracy(classifier, testSet)))
    # show_most_informative_features() prints its own "Most Informative
    # Features" header and table and returns None, so it must not be wrapped
    # in print() (that emitted a stray "None") nor preceded by a second header.
    classifier.show_most_informative_features(50)

def TrainAndTestSVM(dataframe):
    """Train a linear SVM (via NLTK's scikit-learn wrapper) on name features.

    The feature set is shuffled with the global ``random`` generator, the
    first 80% is used for training and the remaining 20% for testing.
    Prints the test accuracy.

    Args:
        dataframe: frame with a 'name' and a 'gender' column.
    """
    samples = extractFeatures(dataframe)
    random.shuffle(samples)
    splitAt = int(len(samples) * 0.80)
    svm = SklearnClassifier(LinearSVC()).train(samples[:splitAt])
    accuracy = classify.accuracy(svm, samples[splitAt:])
    print('Testing Accuracy: {} '.format(accuracy))


In [88]:
# Evaluate both classifiers on the cleaned names, Naive Bayes first;
# each call shuffles and splits the data on its own.
for trainAndTest in (TrainAndTestNB, TrainAndTestSVM):
    trainAndTest(combined)

Testing Accuracy: 0.7663934426229508 
Most Informative Features
Most Informative Features
             lastTwoChar = 'th'                m : f      =     18.1 : 1.0
             lastTwoChar = 'nt'                m : f      =     18.1 : 1.0
             lastTwoChar = 'ad'                m : f      =     11.1 : 1.0
                lastChar = 'd'                 m : f      =     11.0 : 1.0
             lastTwoChar = 'if'                m : f      =     10.3 : 1.0
             lastTwoChar = 'ka'                f : m      =      8.8 : 1.0
             lastTwoChar = 'ta'                f : m      =      8.7 : 1.0
             lastTwoChar = 'ib'                m : f      =      8.7 : 1.0
             lastTwoChar = 'id'                m : f      =      8.5 : 1.0
             lastTwoChar = 'nd'                m : f      =      8.0 : 1.0
                lastChar = 'f'                 m : f      =      8.0 : 1.0
             lastTwoChar = 'ba'                f : m      =      7.9 : 1.0
          