In [None]:
import pandas as pd 
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import warnings
# The 'always' action turns off the default "show once per location" behaviour,
# so every warning raised by later cells is printed each time it occurs.
warnings.filterwarnings('always')

In [None]:
# Read the original dataset from the working directory and show it as the
# cell's output.
original_csv = 'gender-prediction.csv'
df = pd.read_csv(original_csv)
df

Unnamed: 0,height,weight,beard,hair_length,shoe_size,scarf,eye_color,gender
0,71,176,yes,short,44,no,black,male
1,68,165,no,bald,41,no,black,male
2,62,132,no,medium,37,yes,blue,female
3,65,138,no,long,38,no,gray,female
4,70,197,yes,medium,43,no,gray,male
...,...,...,...,...,...,...,...,...
75,65,99,no,short,39,yes,green,female
76,61,98,no,short,37,no,brown,female
77,67,119,yes,short,40,no,black,male
78,70,190,yes,medium,43,no,gray,male


# Measuring Accuracies (using 67-33 train-test split)

In [None]:
# Replace each text-valued column with integer codes. One encoder object is
# refitted for every column, so after this cell `le` only remembers the classes
# of the last column processed.
le = preprocessing.LabelEncoder()

for column in ('beard', 'hair_length', 'scarf', 'eye_color'):
    df[column] = le.fit_transform(df[column])

# Build one tuple of feature values per row, in a fixed column order.
# NOTE(review): `input` shadows the Python builtin of the same name; it is kept
# because later cells read this variable.
feature_columns = ['height', 'weight', 'beard', 'hair_length', 'shoe_size', 'scarf', 'eye_color']
input = list(zip(*(df[name] for name in feature_columns)))

In [None]:
# Encode the gender labels as integers (this refits `le` on the label column).
tg = df['gender']
target = le.fit_transform(tg)

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
(input_train, input_test,
 target_train, target_test) = train_test_split(
    input, target, test_size=0.33, random_state=2
)

In [None]:
# Fresh, untrained classifiers for the 67-33 experiment.
# Random forest and the multilayer perceptron draw random numbers while
# training; seeding them (with the same seed used for the train/test split)
# makes the reported accuracies reproducible from one run to the next.
rf = RandomForestClassifier(random_state=2)
svc = SVC()
mlp = MLPClassifier(random_state=2)

In [None]:
# Train all three models on the same training split, in the order rf, svc, mlp.
for model in (rf, svc):
    model.fit(input_train, target_train)
# Kept as the cell's last expression so the fitted MLP's repr is displayed.
mlp.fit(input_train, target_train)

MLPClassifier()

In [None]:
# Predict labels for the held-out rows with each trained model (rf, svc, mlp order).
rf_prediction, svc_prediction, mlp_prediction = (
    model.predict(input_test) for model in (rf, svc, mlp)
)

In [None]:
# Percentage of held-out rows each model labelled correctly.
rf_accuracy, svc_accuracy, mlp_accuracy = (
    accuracy_score(target_test, predicted) * 100
    for predicted in (rf_prediction, svc_prediction, mlp_prediction)
)

for label, value in (
    ("Random forest accuracy: ", rf_accuracy),
    ("Support Vector Machine accuracy: ", svc_accuracy),
    ("Multilayer Perceptron accuracy: ", mlp_accuracy),
):
    print(label, value, "%")

Random forest accuracy:  100.0 %
Support Vector Machine accuracy:  77.77777777777779 %
Multilayer Perceptron accuracy:  62.96296296296296 %


# Measuring Accuracies (using 80-20 train-test split)

In [None]:
# Re-split the same features/labels, this time holding out 20% for testing.
(input_train, input_test,
 target_train, target_test) = train_test_split(
    input, target, test_size=0.2, random_state=2
)

In [None]:
# Fresh, untrained classifiers for the 80-20 experiment.
# Random forest and the multilayer perceptron are randomised during training;
# seeding them (with the same seed used for the train/test split) makes the
# reported accuracies reproducible from one run to the next.
rf = RandomForestClassifier(random_state=2)
svc = SVC()
mlp = MLPClassifier(random_state=2)

In [None]:
# Train all three models on the 80% training split, in the order rf, svc, mlp.
for model in (rf, svc):
    model.fit(input_train, target_train)
# Kept as the cell's last expression so the fitted MLP's repr is displayed.
mlp.fit(input_train, target_train)

MLPClassifier()

In [None]:
# Predict labels for the 20% held-out rows with each trained model.
rf_prediction, svc_prediction, mlp_prediction = (
    model.predict(input_test) for model in (rf, svc, mlp)
)

In [None]:
# Percentage of held-out rows each model labelled correctly (80-20 split).
rf_accuracy, svc_accuracy, mlp_accuracy = (
    accuracy_score(target_test, predicted) * 100
    for predicted in (rf_prediction, svc_prediction, mlp_prediction)
)

for label, value in (
    ("Random forest accuracy: ", rf_accuracy),
    ("Support Vector Machine accuracy: ", svc_accuracy),
    ("Multilayer Perceptron accuracy: ", mlp_accuracy),
):
    print(label, value, "%")

Random forest accuracy:  100.0 %
Support Vector Machine accuracy:  81.25 %
Multilayer Perceptron accuracy:  62.5 %


# Excluding most powerful attributes

Beard and scarf are the two most important attributes.
After dropping these two attributes:

In [None]:
# Rebuild the feature tuples without the 'beard' and 'scarf' columns.
reduced_columns = ['height', 'weight', 'hair_length', 'shoe_size', 'eye_color']
input = list(zip(*(df[name] for name in reduced_columns)))

# Labels are unchanged; re-encode them so `target` matches the new `input`.
tg = df['gender']
target = le.fit_transform(tg)

In [None]:
# 67-33 split of the reduced feature set, with the same seed as the earlier splits.
(input_train, input_test,
 target_train, target_test) = train_test_split(
    input, target, test_size=0.33, random_state=2
)

In [None]:
# Fresh, untrained classifiers for the reduced-feature experiment.
# Random forest and the multilayer perceptron are randomised during training;
# seeding them (with the same seed used for the train/test split) makes the
# reported accuracies reproducible from one run to the next.
rf = RandomForestClassifier(random_state=2)
svc = SVC()
mlp = MLPClassifier(random_state=2)

In [None]:
# Train every model first, then predict the held-out rows with each of them.
for model in (rf, svc, mlp):
    model.fit(input_train, target_train)

rf_prediction, svc_prediction, mlp_prediction = (
    fitted.predict(input_test) for fitted in (rf, svc, mlp)
)



In [None]:
# Percentage of held-out rows each model labelled correctly (reduced features).
rf_accuracy, svc_accuracy, mlp_accuracy = (
    accuracy_score(target_test, predicted) * 100
    for predicted in (rf_prediction, svc_prediction, mlp_prediction)
)

for label, value in (
    ("Random forest accuracy: ", rf_accuracy),
    ("Support Vector Machine accuracy: ", svc_accuracy),
    ("Multilayer Perceptron accuracy: ", mlp_accuracy),
):
    print(label, value, "%")

Random forest accuracy:  100.0 %
Support Vector Machine accuracy:  77.77777777777779 %
Multilayer Perceptron accuracy:  85.18518518518519 %


# Monte Carlo cross-validation and Leave P-Out cross-validation

In [None]:
# Restore the full seven-column feature set for the cross-validation experiments.
all_columns = ['height', 'weight', 'beard', 'hair_length', 'shoe_size', 'scarf', 'eye_color']
input = list(zip(*(df[name] for name in all_columns)))

# Integer-encoded gender labels, aligned row-for-row with `input`.
tg = df['gender']
target = le.fit_transform(tg)

In [None]:
# Monte Carlo cross-validation: 5 independent random 67/33 splits, seeded so the
# same splits are produced on every run.
monte_carlo = ShuffleSplit(n_splits=5, test_size=0.33, random_state=7)

# The tree is seeded as well. An unseeded tree can break ties between equally
# good splits differently on each fit, so repeated cross-validation passes
# (e.g. one for accuracy, one for F1) would not be scoring identical models.
decision_tree = DecisionTreeClassifier(random_state=7)

In [None]:
# Score the decision tree on every Monte Carlo split: first with the
# estimator's default scorer, then with the "f1" scorer.
mc_accuracy_per_split = cross_val_score(decision_tree, input, target, cv=monte_carlo)
mc_f1_per_split = cross_val_score(decision_tree, input, target, scoring="f1", cv=monte_carlo)

# Average over the splits and express as percentages.
mc_accuracy = mc_accuracy_per_split.mean() * 100
mc_f1 = mc_f1_per_split.mean() * 100

for label, value in (
    ("Monte Carlo cross-validation accuracy: ", mc_accuracy),
    ("Monte Carlo cross-validation F1 score: ", mc_f1),
):
    print(label, value, "%")

Monte Carlo cross-validation accuracy:  92.59259259259258 %
Monte Carlo cross-validation F1 score:  95.07975243147658 %


In [None]:
# Leave P-Out cross-validation with p=2: each split holds out 2 samples for testing.
lpo = LeavePOut(p=2)
# Last expression of the cell: displays how many splits this produces for `input`.
lpo.get_n_splits(input)

NameError: ignored

In [None]:
# Score the decision tree on every Leave-P-Out split: first with the
# estimator's default scorer, then with the "f1_weighted" scorer.
lpo_accuracy_per_split = cross_val_score(decision_tree, input, target, cv=lpo)
lpo_f1_per_split = cross_val_score(decision_tree, input, target, cv=lpo, scoring="f1_weighted")

# Average over the splits and express as percentages.
lpo_accuracy = lpo_accuracy_per_split.mean() * 100
lpo_f1 = lpo_f1_per_split.mean() * 100

for label, value in (
    ("Leave P-Out cross-validation accuracy: ", lpo_accuracy),
    ("Leave P-Out cross-validation F1 score: ", lpo_f1),
):
    print(label, value, "%")

Leave P-Out cross-validation accuracy:  94.17721518987342 %
Leave P-Out cross-validation F1 score:  94.12447257383965 %


# Gaussian Naive Bayes (on updated dataset)

In [None]:
# Read the updated dataset from the working directory and show it as the
# cell's output.
updated_csv = 'gender-prediction_updated.csv'
df2 = pd.read_csv(updated_csv)
df2

Unnamed: 0,height,weight,beard,hair_length,shoe_size,scarf,eye_color,gender
0,71,176,yes,short,44,no,black,male
1,68,165,no,bald,41,no,black,male
2,62,132,no,medium,37,yes,blue,female
3,65,138,no,long,38,no,gray,female
4,70,197,yes,medium,43,no,gray,male
...,...,...,...,...,...,...,...,...
80,64,113,no,medium,38,no,brown,female
81,67,123,no,long,36,no,blue,female
82,73,147,yes,medium,43,no,black,male
83,71,152,no,short,41,no,black,male


In [None]:
# Positional split of the updated dataset: the first 80 rows are used for
# training, everything after them is set aside as test rows.
split_at = 80
train_data, test_data = df2.iloc[:split_at], df2.iloc[split_at:]

# Only the last expression of a cell is displayed, so show the test rows.
test_data

Unnamed: 0,height,weight,beard,hair_length,shoe_size,scarf,eye_color,gender
80,64,113,no,medium,38,no,brown,female
81,67,123,no,long,36,no,blue,female
82,73,147,yes,medium,43,no,black,male
83,71,152,no,short,41,no,black,male
84,63,114,no,long,38,yes,brown,female


In [None]:
# Build the Naive Bayes training features from the UPDATED dataset.
# The features must come from `train_data` (the first rows of `df2`), the same
# frame that supplies `target` below; building them from the older `df` would
# pair features and labels taken from two different datasets.
le = preprocessing.LabelEncoder()

# Encode a copy so the slice of `df2` is left untouched (this also avoids
# pandas' SettingWithCopyWarning and keeps the cell safe to re-run).
train_encoded = train_data.copy()
for column in ('beard', 'hair_length', 'scarf', 'eye_color'):
    train_encoded[column] = le.fit_transform(train_encoded[column])

# One tuple of feature values per training row, in a fixed column order.
feature_columns = ['height', 'weight', 'beard', 'hair_length', 'shoe_size', 'scarf', 'eye_color']
input = list(zip(*(train_encoded[name] for name in feature_columns)))
target = train_data['gender']

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the feature tuples and
# gender labels prepared in the previous cell.
gnb = GaussianNB()

# Last expression of the cell, so the fitted estimator's repr is displayed.
gnb.fit(X=input, y=target)

GaussianNB()

In [None]:
# Predict labels for the rows in `input` and score them against `target`.
# NOTE(review): these look like the same `input`/`target` the model was fitted
# on, which would make these training-set scores; `test_data` is not used here.
gnb_prediction = gnb.predict(input)

gnb_accuracy = accuracy_score(target, gnb_prediction) * 100

# average=None yields one score per class; the unweighted mean of those
# per-class scores is what gets reported.
precision_per_class = precision_score(target, gnb_prediction, average=None)
recall_per_class = recall_score(target, gnb_prediction, average=None)
gnb_precision = precision_per_class.mean() * 100
gnb_recall = recall_per_class.mean() * 100

for label, value in (
    ("Gaussian Naive Bayes accuracy: ", gnb_accuracy),
    ("Gaussian Naive Bayes precision: ", gnb_precision),
    ("Gaussian Naive Bayes recall: ", gnb_recall),
):
    print(label, value, "%")

Gaussian Naive Bayes accuracy:  95.0 %
Gaussian Naive Bayes precision:  96.0 %
Gaussian Naive Bayes recall:  94.11764705882352 %
