In [1]:
import pandas as pd
import nltk
import scipy.sparse as sp
from sklearn.pipeline import FeatureUnion
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import f1_score, precision_recall_fscore_support as score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
allNamesDF=pd.read_csv('prediction.csv')

In [3]:
def is_ascii(s):
    return all(ord(c) < 128 for c in s)

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
       # if re.match('[a-zA-Z]', token):
        #if('-' not in token and is_ascii(token)):
        filtered_tokens.append(token)
#         else:
#             print(token)
    return filtered_tokens

In [4]:
victorizerNames=[]
for name in allNamesDF.name:
    myName=""
    for char in name:
        myName+=char+' '
    victorizerNames.append(myName[:-1])
    #victorizerNames.append(name)

In [5]:
L = [e.split() for e in victorizerNames]

In [6]:
def genderFeatures(name,gender):
    features = {}
    #print(name,gender)
    features['name']=name.lower().title()+'.'
    
    if(gender=='male'):
        features['gender']=0
    else:
        features['gender']=1
    
    if(features['name'].startswith('Abo')==True):
        features["aboPrefix"] = 1
    else:
        features["aboPrefix"] = 0
        
    if(features['name'].startswith('Abd')==True):
        features["abdPrefix"] = 1
    else:
        features["abdPrefix"] = 0
    
    if(len(features['name'])<7):
        features["isShort"] = 1
    else:
        features["isShort"] = 0
    
    if(features['name'].endswith('a.')==True): 
        features["aSuffix"] = 1
    else:
        features["aSuffix"] = 0
    
    if(features['name'].endswith('ah.')==True): 
        features["ahSuffix"] = 1
    else:
        features["ahSuffix"] = 0
        
    if(features['name'].endswith('allah.')==True): 
        features["allahSuffix"] = 1
    else:
        features["allahSuffix"] = 0
    
    if(features['name'].endswith('al.')==True): 
        features["alSuffix"] = 1
    else:
        features["alSuffix"] = 0
    
    
    tmp=features['name'].replace('a','%').replace('e','%').replace('i','%').replace('o','%').replace('u','%')
    vowels=tmp.count('%')
    features['countVowels'] = int(vowels)
#     if(vowels>2):
#         features["moreVowels"] = 1
#     else:
#         features["moreVowels"] = 0
    return features

In [7]:
genderFeaturesDF=pd.DataFrame(columns=['name','gender','aboPrefix','abdPrefix','isShort', 'aSuffix', 'ahSuffix','allahSuffix','alSuffix'])
for row in allNamesDF.itertuples():
    features=genderFeatures(getattr(row, "name"), getattr(row, "gender"))
    genderFeaturesDF=genderFeaturesDF.append(features,ignore_index=True)

In [8]:
featuresList = genderFeaturesDF[['aboPrefix','abdPrefix','isShort', 'countVowels', 'aSuffix', 'ahSuffix','allahSuffix','alSuffix']].values.tolist()

In [9]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(use_idf=False,lowercase=False,ngram_range=(1,6),analyzer='char',stop_words=None)

tfidf_matrix = tfidf_vectorizer.fit_transform(allNamesDF.name) #fit the vectorizer to synopses

print(tfidf_matrix.shape)
print(len(tfidf_vectorizer.vocabulary_))

(1222, 7946)
7946


In [10]:
combined = sp.hstack([tfidf_matrix, featuresList], format='csr')

In [11]:
#split data to training & testing
X_train, X_test, y_train, y_test = train_test_split(combined,allNamesDF.gender, test_size=0.2, random_state=1)
crossV = ShuffleSplit(n_splits=5, test_size=0.2, random_state=1)
print('Training : ',X_train.shape, y_train.shape)
print('Testing : ',X_test.shape, y_test.shape)

Training :  (977, 7954) (977,)
Testing :  (245, 7954) (245,)


In [12]:
allResults=pd.DataFrame(columns=["Algorithm","Fscore","Precision","Recall"])

In [13]:
#SVM (Linear)
clf = svm.SVC(kernel='linear', C=1, probability=True).fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=crossV,scoring='f1_micro')
predicted=clf.predict(X_test)
precision, recall, fscore, support = score(y_test, predicted, average='micro')
allResults=allResults.append({"Algorithm":'SVM Linear',"Fscore":fscore.mean(),"Precision":precision.mean(),"Recall":recall.mean()}, ignore_index=True)

#KNN
clf = KNeighborsClassifier(n_neighbors = 7)
clf.fit(X_train,y_train)
scores = cross_val_score(clf, X_train, y_train, cv=crossV,scoring='f1_micro')
predicted=clf.predict(X_test)
precision, recall, fscore, support = score(y_test, predicted, average='micro')
allResults=allResults.append({"Algorithm":'KNN k=7',"Fscore":fscore.mean(),"Precision":precision.mean(),"Recall":recall.mean()}, ignore_index=True)

#Logistic Regression
clf = LogisticRegression()
clf.fit(X_train,y_train)
scores = cross_val_score(clf, X_train, y_train, cv=crossV,scoring='f1_micro')
predicted=clf.predict(X_test)
precision, recall, fscore, support = score(y_test, predicted, average='micro')
allResults=allResults.append({"Algorithm":'Logistic Regression',"Fscore":fscore.mean(),"Precision":precision.mean(),"Recall":recall.mean()}, ignore_index=True)

#Random Forest (1000tree)
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X_train,y_train)
scores = cross_val_score(clf, X_train, y_train, cv=crossV,scoring='f1_micro')
predicted=clf.predict(X_test)
precision, recall, fscore, support = score(y_test, predicted, average='micro')
allResults=allResults.append({"Algorithm":'Random Forest (trees=1000)',"Fscore":fscore.mean(),"Precision":precision.mean(),"Recall":recall.mean()}, ignore_index=True)

#DecisionTree 
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train,y_train)
scores = cross_val_score(clf, X_train, y_train, cv=crossV,scoring='f1_micro')
predicted=clf.predict(X_test)
precision, recall, fscore, support = score(y_test, predicted, average='micro')
allResults=allResults.append({"Algorithm":'DecisionTree',"Fscore":fscore.mean(),"Precision":precision.mean(),"Recall":recall.mean()}, ignore_index=True)

In [14]:
allResults

Unnamed: 0,Algorithm,Fscore,Precision,Recall
0,SVM Linear,0.844898,0.844898,0.844898


In [15]:
y_testList=y_test.tolist()
c=0
for i in range (len(y_test)):
    if(y_testList[i]!=predicted[i]):
        c=c+1
        print(y_test.index[i],genderFeaturesDF.name.ix[y_test.index[i]],['male', 'female'][genderFeaturesDF.gender.ix[y_test.index[i]]],'=>',predicted[i])
print(c)

912 Marihan. female => male
1159 Maram. female => male
906 Nihal. female => male
512 Mousa. male => female
248 Nagat. female => male
538 Ekram. female => male
311 Nardin. female => male
259 Taha. male => female
1042 Rabab. female => male
705 Charl. female => male
1041 Suzan. female => male
341 Kariman. female => male
855 Yasmine. female => male
670 Waganat. female => male
1193 Suzy. female => male
108 Perihan. female => male
799 Nawal. female => male
404 Set. female => male
27 Qessmat. female => male
90 Sohier. female => male
314 Elham. female => male
663 Diaa. male => female
729 Enmar. female => male
1021 Sherry. female => male
1116 Tasnim. female => male
888 Amin. male => female
350 Haidy. female => male
439 Nairouz. female => male
766 Nouran. female => male
568 Engy. female => male
428 Zohour. female => male
644 Taysir. female => male
163 Mimy. female => male
1190 Rosalin. female => male
78 Hedayet. female => male
831 Amany. female => male
200 Salah. male => female
951 Martin. male 

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [16]:
genderFeaturesDF[genderFeaturesDF['alSuffix']== True]

Unnamed: 0,name,gender,aboPrefix,abdPrefix,isShort,aSuffix,ahSuffix,allahSuffix,alSuffix,countVowels
45,Nehal.,1,0,0,1,0,0,0,1,2.0
64,Talal.,0,0,0,1,0,0,0,1,2.0
71,Heikal.,0,0,0,0,0,0,0,1,3.0
84,Ebtehal.,1,0,0,0,0,0,0,1,2.0
195,Helal.,0,0,0,1,0,0,0,1,2.0
206,Dalal.,1,0,0,1,0,0,0,1,2.0
380,Faisal.,0,0,0,0,0,0,0,1,3.0
490,Amthal.,1,0,0,0,0,0,0,1,1.0
498,Mazmal.,0,0,0,0,0,0,0,1,2.0
596,Manal.,1,0,0,1,0,0,0,1,2.0
