In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, ShuffleSplit, LeavePOut, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


# Loading data into a pandas DataFrame

In [3]:
# Read the dataset from the notebook's working directory (relative path).
# The bare `df` on the last line renders the frame as the cell output.
df=pd.read_csv('gender-prediction.csv')
df

Unnamed: 0,height,weight,beard,hair_length,shoe_size,scarf,eye_color,gender
0,71,176,yes,short,44,no,black,male
1,68,165,no,bald,41,no,black,male
2,62,132,no,medium,37,yes,blue,female
3,65,138,no,long,38,no,gray,female
4,70,197,yes,medium,43,no,gray,male
...,...,...,...,...,...,...,...,...
75,65,99,no,short,39,yes,green,female
76,61,98,no,short,37,no,brown,female
77,67,119,yes,short,40,no,black,male
78,70,190,yes,medium,43,no,gray,male


Finding the percentage of male and female samples

In [None]:
# Count the rows of each class with a boolean mask, then express each count
# as a share of all rows. Percentages are printed male first, then female.
male = int((df['gender'] == 'male').sum())
female = int((df['gender'] == 'female').sum())

maleRatio = male / len(df)
femaleRatio = female / len(df)

print(maleRatio * 100)
print(femaleRatio * 100)

# Applying Encoding

In [None]:
# Replace every categorical column of df with integer codes, in place.
# The same encoder object is refit for each column, so once the loop ends it
# only holds the classes of the last column ('gender').
label_encoder = LabelEncoder()
for column in ('beard', 'hair_length', 'scarf', 'eye_color', 'gender'):
    df[column] = label_encoder.fit_transform(df[column])

## Separating input and target columns

In [None]:
# Target is the 'gender' column; every other column is used as a feature.
y = df['gender']
X = df.drop('gender', axis=1)

## Applying Train/Test Split

In [None]:
# Hold out 33% of the rows for testing. Stratifying on the target (y is
# df['gender']) and fixing the seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=8,
)

# 1. Random Forest Algorithm

In [None]:
# Fit a random forest on the training split and predict the held-out rows.
# random_state is fixed (same seed as the train/test split) so the forest,
# and therefore the reported scores, are reproducible across runs.
randomforest = RandomForestClassifier(random_state=8)
randomforest.fit(X_train, y_train)
prediction = randomforest.predict(X_test)

In [None]:
# Accuracy of the current `prediction` against the held-out labels.
metrics.accuracy_score(y_test, prediction)

In [None]:
# Confusion matrix of the held-out labels (y_test) versus the current `prediction`.
metrics.confusion_matrix(y_test, prediction)

# 2. SVM Algorithm

In [None]:
# Support vector classifier with default settings: fit on the training split
# (fit returns the estimator itself), then predict the held-out rows.
svm = SVC().fit(X_train, y_train)
prediction = svm.predict(X_test)

In [None]:
# Accuracy of the current `prediction` against the held-out labels.
metrics.accuracy_score(y_test, prediction)

In [None]:
# Confusion matrix of the held-out labels (y_test) versus the current `prediction`.
metrics.confusion_matrix(y_test, prediction)

# 3. Multilayer Perceptron


In [None]:
# Multilayer perceptron with default architecture. random_state is fixed
# (same seed as the train/test split) so weight initialisation, and therefore
# the reported scores, are reproducible across runs.
mlp = MLPClassifier(random_state=8)

In [None]:
# Train the perceptron on the current training split.
mlp.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows; overwrites the previous `prediction`.
prediction = mlp.predict(X_test)

In [None]:
# Accuracy of the current `prediction` against the held-out labels.
metrics.accuracy_score(y_test, prediction)

In [None]:
# Confusion matrix of the held-out labels (y_test) versus the current `prediction`.
metrics.confusion_matrix(y_test, prediction)

# Using an 80/20 train/test split

In [None]:
# Redo the split with 20% of the rows held out. Stratifying on the target
# (y is df['gender']) and fixing the seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=8,
)

In [None]:
# Random forest on the 80/20 split: fit, predict, print the accuracy, and show
# the confusion matrix as the cell output.
# random_state is fixed (same seed as the split) so the scores are reproducible.
randomforest = RandomForestClassifier(random_state=8)
randomforest.fit(X_train, y_train)
prediction = randomforest.predict(X_test)
print(metrics.accuracy_score(y_test, prediction))
metrics.confusion_matrix(y_test, prediction)

In [None]:
# Default SVC on the current split: fit (fit returns the estimator itself),
# predict, print the accuracy, and show the confusion matrix as the cell output.
svm = SVC().fit(X_train, y_train)
prediction = svm.predict(X_test)
print(metrics.accuracy_score(y_test, prediction))
metrics.confusion_matrix(y_test, prediction)

In [None]:
# Multilayer perceptron on the 80/20 split: fit, predict, print the accuracy,
# and show the confusion matrix as the cell output.
# random_state is fixed (same seed as the split) so weight initialisation, and
# therefore the scores, are reproducible.
mlp = MLPClassifier(random_state=8)
mlp.fit(X_train, y_train)
prediction = mlp.predict(X_test)
print(metrics.accuracy_score(y_test, prediction))
metrics.confusion_matrix(y_test, prediction)

## Removing powerful attributes and training models again.

In [None]:
# Show the frame again; the categorical columns were replaced with integer
# codes by the encoding cell earlier in the notebook.
df

In [None]:


# Same target as before, but 'height' and 'shoe_size' are dropped from the
# features along with the target column itself.
y = df['gender']
X = df.drop(['gender','height','shoe_size'], axis=1)
     

In [None]:
# 67/33 split of the reduced feature set. Stratifying on the target
# (y is df['gender']) and fixing the seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=8,
)

In [None]:
# Random forest on the current split: fit, predict, print the accuracy, and
# show the confusion matrix as the cell output.
# random_state is fixed (same seed as the split) so the scores are reproducible.
randomforest = RandomForestClassifier(random_state=8)
randomforest.fit(X_train, y_train)
prediction = randomforest.predict(X_test)
print(metrics.accuracy_score(y_test, prediction))
metrics.confusion_matrix(y_test, prediction)

In [None]:
# Default SVC on the current split: fit (fit returns the estimator itself),
# predict, print the accuracy, and show the confusion matrix as the cell output.
svm = SVC().fit(X_train, y_train)
prediction = svm.predict(X_test)
print(metrics.accuracy_score(y_test, prediction))
metrics.confusion_matrix(y_test, prediction)

In [None]:
# Multilayer perceptron on the current split: fit, predict, print the accuracy,
# and show the confusion matrix as the cell output.
# random_state is fixed (same seed as the split) so weight initialisation, and
# therefore the scores, are reproducible.
mlp = MLPClassifier(random_state=8)
mlp.fit(X_train, y_train)
prediction = mlp.predict(X_test)
print(metrics.accuracy_score(y_test, prediction))
metrics.confusion_matrix(y_test, prediction)

# Decision Tree

In [None]:
# Decision tree with default hyper-parameters. random_state is fixed (same
# seed as the train/test splits) so tie-breaking between equally good splits
# does not change the cross-validation scores from run to run.
dt = DecisionTreeClassifier(random_state=8)

In [None]:
# Back to the full feature set: everything except the target column.
y = df['gender']
X = df.drop('gender', axis=1)

## Monte Carlo Cross Validation


In [None]:
# Monte Carlo cross-validation: 10 independent random train/test partitions,
# scored with F1. random_state seeds the shuffling so the partitions are the
# same on every run (the estimator's own randomness is configured where it is
# created).
# NOTE(review): train_size=0.2 with test_size=0.3 leaves half of the rows
# unused in every split — confirm this is intended.
# NOTE(review): this evaluates `randomforest` although the section heading is
# "Decision Tree" — confirm which estimator was meant.
s_split = ShuffleSplit(n_splits=10, test_size=0.3, train_size=0.2, random_state=8)
scores = cross_val_score(randomforest, X, y, cv=s_split, scoring='f1')

In [None]:
# Per-split F1 scores from the cross-validation run above.
print(f"F1 Score :\n {scores}")

In [None]:
# Mean F1 over all splits. `mean` must be called: without the parentheses the
# f-string prints the bound-method repr instead of a number.
print(f"F1 Score :\n {scores.mean()}")
     

## Leave-p-out Cross Validation

In [None]:
# Leave-p-out splitter with p=2.
lpo=LeavePOut(p=2)

In [None]:
# Show how many train/test splits the leave-2-out iterator produces for X.
lpo.get_n_splits(X)

In [None]:
# Cross-validate the decision tree over every leave-2-out split, scoring with F1.
# NOTE(review): each test fold holds only two rows, so a fold may contain no
# positive sample, where F1 is presumably undefined or 0 — confirm that 'f1'
# is the intended metric here.
scores=cross_val_score(dt, X, y, cv=lpo, scoring='f1')

In [None]:
# Per-split F1 scores from the leave-2-out run.
print(f"F1 Score :\n {scores}")

In [None]:
# Mean F1 across all leave-2-out splits.
print(f"F1 Score :\n {scores.mean()}")

## Adding new Instances

In [61]:
# Work on an independent copy: plain assignment would only alias df, so the
# in-place column encoding applied to `data` afterwards would rewrite df too.
data = df.copy()

In [62]:
# Five hand-written instances used as an unseen test set. Columns follow the
# same order as the dataset; the index labels 80-84 are set explicitly.
test_data = pd.DataFrame(
    data=[
        [69, 167, 'no', 'short', 43, 'no', 'brown', 'female'],
        [62, 125, 'no', 'long', 37, 'yes', 'black', 'female'],
        [73, 181, 'yes', 'medium', 42, 'no', 'black', 'male'],
        [65, 102, 'no', 'long', 36, 'no', 'grey', 'female'],
        [77, 180, 'yes', 'short', 44, 'no', 'blue', 'male'],
    ],
    index=[80, 81, 82, 83, 84],
    columns=['height', 'weight', 'beard', 'hair_length', 'shoe_size', 'scarf','eye_color', 'gender'],
)




In [63]:
# Encode the training frame and the new instances with ONE shared mapping per
# categorical column.
#
# Fitting a separate LabelEncoder on the five test rows gives codes that do
# not match the training codes (e.g. a column with fewer distinct labels in
# the test rows is numbered differently), and label-encoding the numeric
# columns (height, weight, shoe_size) replaces real measurements with ranks
# that differ between the two frames. Here each encoder is fit on the training
# labels only and then reused for the test rows; numeric columns are left as
# measured.
CATEGORICAL_COLUMNS = ['beard', 'hair_length', 'scarf', 'eye_color', 'gender']

# df was integer-encoded in place earlier in the notebook, so the original
# string labels are reloaded to recover each column's label -> code mapping.
raw_data = pd.read_csv('gender-prediction.csv')
data = raw_data.copy()
test_data = test_data.copy()

# The new instances spell this colour 'grey' while the dataset uses 'gray';
# normalise it so the label is known to the encoder.
test_data['eye_color'] = test_data['eye_color'].replace({'grey': 'gray'})

for column in CATEGORICAL_COLUMNS:
    label_encoder = LabelEncoder().fit(raw_data[column])
    data[column] = label_encoder.transform(raw_data[column])
    # Skip columns that are already numeric so re-running this cell is harmless.
    # transform() raises ValueError for a label never seen in training, which
    # is preferable to silently inventing a code for it.
    if not pd.api.types.is_numeric_dtype(test_data[column]):
        test_data[column] = label_encoder.transform(test_data[column])


In [64]:
# Train on the whole dataset and test on the hand-written instances:
# 'gender' is the target in both frames, every other column is a feature.
y_train, y_test = data['gender'], test_data['gender']
X_train = data.drop('gender', axis=1)
X_test = test_data.drop('gender', axis=1)

In [65]:
# Gaussian Naive Bayes classifier with default settings.
nb = GaussianNB()


In [66]:
# Fit the classifier on the full dataset (X_train / y_train were built from `data`).
nb.fit(X_train, y_train)

GaussianNB()

In [67]:
# Predict the gender code for each of the new instances.
prediction = nb.predict(X_test)

In [68]:

# Report accuracy, precision and recall of the predictions on the new instances.
# NOTE(review): precision/recall are computed for the library's default
# positive label — presumably the code assigned to 'male'; verify.
print("Accuracy = ", metrics.accuracy_score(y_test, prediction))
print("Precision = ", metrics.precision_score(y_test, prediction))
print("Recall = ", metrics.recall_score(y_test, prediction))


Accuracy =  1.0
Precision =  1.0
Recall =  1.0


In [69]:
# Confusion matrix of the new instances' true labels versus the predictions.
metrics.confusion_matrix(y_test, prediction)

array([[3, 0],
       [0, 2]], dtype=int64)