In [1]:
import numpy as np
import pandas as pd

In [2]:
raw_data = {
    'name' : ['kevin', 'sally', 'hoyeon', 'lux'],
    'height' : [178.2, 162.9, 160.6, 156.2],
    'gender' : ['male', 'female', 'female', None]
}

pd_data = pd.DataFrame(raw_data)
pd_data.head()

Unnamed: 0,name,height,gender
0,kevin,178.2,male
1,sally,162.9,female
2,hoyeon,160.6,female
3,lux,156.2,


In [3]:
filtered_data = pd_data.dropna()
filtered_data.head()

Unnamed: 0,name,height,gender
0,kevin,178.2,male
1,sally,162.9,female
2,hoyeon,160.6,female


- NULL값은 없애버리자

In [4]:
del filtered_data['name']
filtered_data.head()

Unnamed: 0,height,gender
0,178.2,male
1,162.9,female
2,160.6,female


- 변수 name은 필요없으니 지워버리자

In [8]:
female_mean = np.average(filtered_data[filtered_data['gender'] == 'female']['height'].values)
male_mean = np.average(filtered_data[filtered_data['gender'] == 'male']['height'].values)
print(female_mean, male_mean)

161.75 178.2


In [11]:
np.random.seed(0)

variance = 3
female_heights = variance * np.random.randn(200) + female_mean
male_heights = variance * np.random.randn(200) + male_mean

print(female_heights[:10])

[167.04215704 162.95047163 164.68621395 168.4726796  167.35267397
 158.81816636 164.60026525 161.29592838 161.44034344 162.98179551]


- 위에서 구한 성별 평균, variance는 3, Gaussian Dist로 값생성

In [12]:
generated_data = {'gender' : ['female']*200 + ['male']*200, 'height' : list(female_heights)+list(male_heights)}

filtered_data = filtered_data.append(pd.DataFrame(generated_data), ignore_index = True)
filtered_data.head()

Unnamed: 0,height,gender
0,178.2,male
1,162.9,female
2,160.6,female
3,167.042157,female
4,162.950472,female


In [13]:
filtered_data.shape

(403, 2)

In [14]:
from sklearn.preprocessing import LabelEncoder

le_gender = LabelEncoder()

final_data = filtered_data.copy()
final_data['gender'] = le_gender.fit_transform(filtered_data['gender'])

final_data.tail()

Unnamed: 0,height,gender
398,175.915523,1
399,180.773772,1
400,181.623306,1
401,182.599736,1
402,180.757656,1


- gender부분을 라벨 인코딩을 한다

In [19]:
# Linear Regression 적용

from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

features = ['height']

kf = KFold(n_splits = 5, shuffle = True)

accrs = []
fold_idx = 1
for train_idx ,test_idx in kf.split(final_data):
    print('Fold {}'.format(fold_idx))
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx]
    
    train_y = train_d['gender']
    train_x = train_d[features]
    
    test_y = test_d['gender']
    test_x = test_d[features]
    
    model = LinearRegression()
    model.fit(train_x, train_y)
    
    mean_accr = model.score(test_x, test_y)
    accrs.append(mean_accr)
    
    fold_idx +=1
    
print(np.average(accrs))

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
0.876311708251516


In [20]:
# Logistic Regression

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

features = ['height']

kf = KFold(n_splits = 5, shuffle = True)

accrs = []
fold_idx = 1
for train_idx ,test_idx in kf.split(final_data):
    print('Fold {}'.format(fold_idx))
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx]
    
    train_y = train_d['gender']
    train_x = train_d[features]
    
    test_y = test_d['gender']
    test_x = test_d[features]
    
    model = LogisticRegression(solver = 'lbfgs')
    model.fit(train_x, train_y)
    
    mean_accr = model.score(test_x, test_y)
    accrs.append(mean_accr)
    
    fold_idx +=1
    
print(np.average(accrs))

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
1.0
