# **Tugas UAS**

In [1]:
import pandas as pd

## Read Data

In [2]:
# Source CSV for the gender-classification dataset; kept in a named constant
# so the location is easy to spot and change.
DATASET_URL = "https://raw.githubusercontent.com/ABDHanifAzhari/dataset/main/gender_classification_v7%20(1).csv"

data = pd.read_csv(DATASET_URL)
data

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female
...,...,...,...,...,...,...,...,...
4996,1,13.6,5.1,0,0,0,0,Female
4997,1,11.9,5.4,0,0,0,0,Female
4998,1,12.9,5.7,0,0,0,0,Female
4999,1,13.2,6.2,0,0,0,0,Female


## Data Exploration

In [3]:
# Preview the first five rows of the dataset.
data.head(n=5)

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


In [4]:
# Value range (min / max) of the two continuous forehead measurements.
forehead_columns = ["forehead_width_cm", "forehead_height_cm"]
data[forehead_columns].agg(["min", "max"])

Unnamed: 0,forehead_width_cm,forehead_height_cm
min,11.4,5.1
max,15.5,7.1


In [5]:
# Number of rows per target class.
data["gender"].value_counts()

Female    2501
Male      2500
Name: gender, dtype: int64

## Data Preprocessing

In [6]:
# Split the frame into the target column and the remaining feature columns.
y = data["gender"]
X = data.drop(columns=["gender"])

In [7]:
from sklearn import preprocessing
# Encode the string labels as integers.
# The encoder is fitted on the raw `gender` column rather than on `y`, so that
# re-running this cell does not refit it on already-encoded integers (which
# would make `le.inverse_transform` return numbers instead of class names).
le = preprocessing.LabelEncoder()
y = le.fit_transform(data["gender"])
y

array([1, 0, 1, ..., 0, 0, 1])

In [8]:
# Sanity check: map the encoded integers back to their string labels.
decoded_labels = le.inverse_transform(y)
decoded_labels

array(['Male', 'Female', 'Male', ..., 'Female', 'Female', 'Male'],
      dtype=object)

In [9]:
# Class names, taken from the column headers of the one-hot encoded target.
gender_dummies = pd.get_dummies(data["gender"])
labels = list(gender_dummies.columns)
labels

['Female', 'Male']

## Normalize Data

In [10]:
from sklearn.preprocessing import MinMaxScaler
# Scale every feature to the [0, 1] range.
# The scaler is fitted on the raw feature columns of `data` rather than on `X`,
# so that re-running this cell never refits it on already-normalized values
# (which would turn the exported scaler into an identity transform).
scaler = MinMaxScaler()
X = scaler.fit_transform(data.drop(columns="gender"))
X

array([[1.        , 0.09756098, 0.5       , ..., 0.        , 1.        ,
        1.        ],
       [0.        , 0.63414634, 0.15      , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.09756098, 0.6       , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [1.        , 0.36585366, 0.3       , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.43902439, 0.55      , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.97560976, 0.15      , ..., 1.        , 1.        ,
        1.        ]])

In [11]:
# Feature matrix and label vector should have the same number of rows.
(X.shape, y.shape)

((5001, 7), (5001,))

## K-Fold Validation

In [12]:
# scikit-learn k-fold cross-validation
from numpy import array
from sklearn.model_selection import KFold

def cross_validation(model, X, y):
    """Run 4-fold cross-validation and return the model refit on the best fold.

    For each fold the model is fitted on the training split, its score on that
    training split is printed, and its score on the held-out split is
    recorded. Finally the model is refitted on the training split of the fold
    with the highest held-out score (the earliest fold wins ties).

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``. The same object
        is refitted in place on every fold.
    X : array
        Feature rows; must support indexing with an integer index array.
    y : array
        Targets, indexable the same way as ``X``.

    Returns
    -------
    object
        The return value of the final ``model.fit`` call on the best fold's
        training split.
    """
    # prepare cross validation
    kf = KFold(n_splits=4)

    # Start from "no best fold yet" instead of a score of 0, so a best fold is
    # always selected even when every held-out score is 0 or negative.
    best_score = None
    best_train_index = None

    # enumerate splits
    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print("fold-", fold)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        # Printed value is the score on the training split of this fold.
        print(model.score(X_train, y_train))
        score_test = model.score(X_test, y_test)
        if best_score is None or score_test > best_score:
            best_train_index = train_index
            best_score = score_test

    # The same row indices select both the features and the targets.
    return model.fit(X[best_train_index], y[best_train_index])

## Gaussian Naive Bayes

In [13]:
from sklearn.naive_bayes import GaussianNB

model1 = cross_validation(GaussianNB(), X, y)
# The model is trained on MinMax-scaled features, so a raw measurement row
# must go through the same fitted scaler before prediction.
sample = scaler.transform(array([[1,11.8,6.1,1,0,1,1]]))
y_pred = model1.predict(sample)
le.inverse_transform(y_pred)[0]

fold- 1
0.9728
fold- 2
0.9706744868035191
fold- 3
0.9685417222074113
fold- 4
0.9680085310583845


'Male'

## k-nearest neighbors

Digunakan k = 3 karena nilai k tersebut menghasilkan skor tertinggi.

In [14]:
from sklearn.neighbors import KNeighborsClassifier

model2 = KNeighborsClassifier(n_neighbors=3)
model2 = cross_validation(model2, X, y)
# The model is trained on MinMax-scaled features, so a raw measurement row
# must go through the same fitted scaler before prediction (k-NN distances
# are only meaningful when the query is on the training scale).
sample = scaler.transform(array([[1,11.8,6.1,1,0,1,1]]))
y_pred = model2.predict(sample)
le.inverse_transform(y_pred)[0]

fold- 1
0.9816
fold- 2
0.9792055451879499
fold- 3
0.9773393761663557
fold- 4
0.9778725673153825


'Male'

## Decision Tree

In [15]:
from sklearn import tree

model3 = tree.DecisionTreeClassifier(criterion="gini")
model3 = cross_validation(model3, X, y)
# The tree's split thresholds were learned on MinMax-scaled features, so a raw
# measurement row must go through the same fitted scaler before prediction.
sample = scaler.transform(array([[1,11.8,6.1,1,0,1,1]]))
y_pred = model3.predict(sample)
le.inverse_transform(y_pred)[0]

fold- 1
0.9984
fold- 2
0.9992002132764596
fold- 3
0.9989336177019461
fold- 4
0.9984004265529193


'Male'

## Export 



*   Label Encoder
*   Scaler
*   Model



In [16]:
# Import joblib directly: `sklearn.utils.validation.joblib` was only an
# incidental re-export and is not available in newer scikit-learn releases.
import joblib

# preprocessing objects (label encoder and feature scaler); both are needed
# to prepare inputs and decode predictions when the models are loaded later
joblib.dump(le, "le.save")
joblib.dump(scaler, "scaler.save")

# model
joblib.dump(model1, "nb.joblib")
joblib.dump(model2, "knn.joblib")
joblib.dump(model3, "tree.joblib")

['tree.joblib']