In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Kaggle input directory for Playground Series S4E2. The trailing slash is
# kept so file names can be appended to it directly.
dir_path = "/kaggle/input/playground-series-s4e2/"
# Read the training and test CSVs from that directory.
train_data, test_data = (
    pd.read_csv(f"{dir_path}{file_name}") for file_name in ("train.csv", "test.csv")
)

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Print each training column next to its dtype, with the column name padded
# to 30 characters so the dtypes line up.
for col, col_dtype in train_data.dtypes.items():
    print(col.ljust(30), col_dtype)

In [None]:
# Summary statistics of the training set.
train_data.describe()

In [None]:
# Summary statistics of the test set, for comparison with the training set.
test_data.describe()

In [None]:
# Report, per column, whether the training set contains any missing value.
train_has_null = train_data.isnull().any()
for col, has_null in train_has_null.items():
    print("train_data.", col.ljust(30), "\tcontaining null:", has_null)
print()
# Same report for the test set.
test_has_null = test_data.isnull().any()
for col, has_null in test_has_null.items():
    print("test_data. ", col.ljust(30), "\tcontaining null:", has_null)

In [None]:
# For every object-dtype feature (the target NObeyesdad is skipped), show the
# levels seen in training and whether every test level also occurs in training.
object_cols = [
    name
    for name in train_data.columns
    if train_data[name].dtype == np.dtype('O') and name != "NObeyesdad"
]
for col in object_cols:
    train_levels = train_data[col].unique()
    test_levels = test_data[col].unique()
    print(col, "\n\t", train_levels)
    print("\ttest in train: ", set(test_levels) <= set(train_levels))

In [None]:
# Compare the CALC levels present in train and test, then count how many test
# rows fall into each test level.
test_calc_levels = test_data["CALC"].unique()
print("train_data.CALC:", train_data["CALC"].unique())
print("test_data.CALC :", test_calc_levels)
for cat in test_calc_levels:
    print(cat.ljust(10), (test_data["CALC"] == cat).sum())

In [None]:
# Distinct values of the target column.
print(train_data["NObeyesdad"].unique())

**データの出所**
　　「Obesity or CVD risk」のデータによって訓練された深層学習モデルが作り出したもの
**目的変数**  
　　NObeyesdad  
　　-Insufficient_Weight  
　　-Normal_Weight  
　　-Overweight_Level_I  
　　-Overweight_Level_II  
　　-Obesity_Type_I  
　　-Obesity_Type_II  
　　-Obesity_Type_III  

**データの要約**  
1. Gender  
　　性別  
　　-Male  
　　-Female  
2. Age  
　　年齢  
　　-14.0~61.0
3. Height  
　　身長  
　　-1.45~1.98
4. Weight  
　　体重  
　　-39.0~166.0
5. family_history_with_overweight  
　　過体重の家族歴  
　　-yes  
　　-no  
6. FAVC  
　　Frequent consumption of high caloric food = 高カロリーの食べ物を頻繁に摂取する  
　　-yes  
　　-no  
7. FCVC  
　　Frequency of consumption of vegetables = 野菜の摂取頻度  
　　-1.0~3.0  
8. NCP  
　　Number of main meals = 主な食事の数  
　　-1.0~4.0
9. CAEC  
　　Consumption of food between meals = 食間の食品の摂取  
　　-no  
　　-Sometimes  
　　-Frequently  
　　-Always  
10. SMOKE  
　　喫煙  
　　-yes  
　　-no  
11. CH2O  
　　Consumption of water daily = 1日の水の摂取量  
　　-1.0~3.0
12. SCC  
　　Calories consumption monitoring = カロリー消費量の管理  
　　-yes  
　　-no
13. FAF  
　　Physical activity frequency = 身体活動の頻度  
　　-0.0~3.0
14. TUE  
　　Time using technology devices = テクノロジーデバイスの使用時間  
　　-0.0~2.0
15. CALC  
　　Consumption of alcohol = アルコールの摂取  
　　-no  
　　-Sometimes  
　　-Frequently  
　　-Always
16. MTRANS  
　　Transportation used = 利用した交通機関  
　　-Walking  
　　-Bike  
　　-Public_Transportation  
　　-Motorbike  
　　-Automobile  

**参考**  
Obesity or CVD risk (Classify/Regressor/Cluster) : https://www.kaggle.com/datasets/aravindpcoder/obesity-or-cvd-risk-classifyregressorcluster

**特徴量の前処理**  
Gender: Female -> Male  
family_history_with_overweight: no -> yes  
FAVC: no -> yes  
SMOKE: no -> yes  
SCC: no -> yes  

CAEC: no -> Sometimes -> Frequently -> Always  
CALC: no -> Sometimes -> Frequently -> Always  
MTRANS: Walking -> Bike -> Public_Transportation -> Motorbike -> Automobile  

In [None]:
# Feature preprocessing.
# NOTE: this cell overwrites the raw string columns in place, so reload the
# data before running it a second time.

# Two-level columns. With no explicit category list, OrdinalEncoder orders the
# levels it sees during fit by sorting them (e.g. no -> 0, yes -> 1).
binary_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
boe = OrdinalEncoder()
train_data[binary_features] = pd.DataFrame(
    boe.fit_transform(train_data[binary_features]),
    index=train_data.index,
    columns=binary_features,
)
# Apply the mapping learned on the training set; re-fitting on the test set
# could assign different codes if a level were missing there.
test_data[binary_features] = pd.DataFrame(
    boe.transform(test_data[binary_features]),
    index=test_data.index,
    columns=binary_features,
)

# Ordered multi-level columns; the code of each level is its position in the
# list given here.
cat_features = ['CAEC', 'CALC', 'MTRANS']
coe = OrdinalEncoder(categories = [
    ['no', 'Sometimes', 'Frequently', 'Always'],
    ['no', 'Sometimes', 'Frequently', 'Always'],
    ['Walking', 'Bike', 'Public_Transportation', 'Motorbike', 'Automobile']
])
train_data[cat_features] = pd.DataFrame(
    coe.fit_transform(train_data[cat_features]),
    index=train_data.index,
    columns=cat_features,
)
test_data[cat_features] = pd.DataFrame(
    coe.transform(test_data[cat_features]),
    index=test_data.index,
    columns=cat_features,
)

# Target preprocessing: encode the obesity level as 0 (Insufficient_Weight)
# through 6 (Obesity_Type_III).
ooe = OrdinalEncoder(categories = [['Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III']])
train_data['NObeyesdad'] = ooe.fit_transform(train_data[['NObeyesdad']]).ravel()

In [None]:
#データの分布の確認
train_data.hist(bins = 14)
plt.tight_layout()

In [None]:
# フィルタ法
fig = plt.figure(figsize = (15, 12))
cor = train_data.corr()
sns.heatmap(abs(cor), annot=True, cmap = 'PiYG')
plt.show()

In [None]:
# Absolute correlation of every column with the target, strongest first.
cor_o = cor['NObeyesdad'].abs().to_frame()
cor_o.sort_values(by='NObeyesdad', ascending=False)

In [None]:
# Features whose absolute correlation with the target is 0.3 or below are
# treated as nearly unrelated and left out.
# Check the correlations among the remaining (correlated) features.
high_cor_features = ['Weight', 'family_history_with_overweight', 'CAEC', 'Age']
cor_f = train_data.loc[:, high_cor_features].corr()
sns.heatmap(cor_f.abs(), annot=True, cmap='PiYG')
plt.show()

In [None]:
# Partial correlation between family_history_with_overweight (1) and
# NObeyesdad (2), with Weight (3) as the controlled third variable:
#   r12.3 = (r12 - r13 * r23) / sqrt((1 - r13^2) * (1 - r23^2))
# The formula needs signed coefficients, so all three are read from the signed
# correlation matrix `cor` rather than from the absolute-value table `cor_o`.
r12 = cor.loc['family_history_with_overweight', 'NObeyesdad']
r13 = cor.loc['family_history_with_overweight', 'Weight']
r23 = cor.loc['Weight', 'NObeyesdad']
r12_3 = (r12 - r13 * r23) / np.sqrt((1 - r13**2) * (1 - r23**2))
r12_3

r12_3 = 0.1449776 と、Weightの影響を除いた偏相関係数は小さい。
したがって、family_history_with_overweightとNObeyesdadの相関は、Weightを介した疑似相関と考えられる。
フィルタ法によって選ばれた特徴量はWeight, CAEC, Ageの3つである。

In [None]:
# Add BMI = Weight / Height^2 to both data sets
# (presumably kg and m, judging by the value ranges; confirm against the data source).
for frame in (train_data, test_data):
    frame['BMI'] = frame['Weight'] / frame['Height'] ** 2
# Correlation between the new feature and the encoded target.
train_data['BMI'].corr(train_data['NObeyesdad'])

In [None]:
# Correlation between BMI and Weight, to check for redundancy.
train_data['BMI'].corr(train_data['Weight'])

BMIとWeightの相関が大きいため、Weightを除去する

In [None]:
# BMI distribution for each encoded obesity level (0-6).
# All levels share the same bin edges so the bars are directly comparable, and
# the histograms are semi-transparent and labelled so overlapping levels stay
# distinguishable.
bmi_bins = np.linspace(train_data['BMI'].min(), train_data['BMI'].max(), 41)
fig, ax = plt.subplots()
for i in range(7):
    ax.hist(
        train_data.loc[train_data['NObeyesdad'] == i, 'BMI'],
        bins=bmi_bins,
        alpha=0.5,
        label='NObeyesdad=' + str(i),
    )
# Reference lines at commonly used BMI category boundaries.
for threshold in (18.5, 25, 30, 35, 40):
    ax.axvline(threshold, lw=1, color='black')
ax.set_xlabel('BMI')
ax.set_ylabel('Count')
ax.set_title('BMI distribution by NObeyesdad')
ax.legend()
plt.show()

In [None]:
# Age vs. BMI scatter plots: one panel per encoded obesity level, with the two
# encoded Gender values (0 and 1) overlaid in each panel.
fig = plt.figure(figsize=(10, 20))
for i in range(7):
    ax = fig.add_subplot(4, 2, i + 1)
    # Fixed axis limits so all panels are directly comparable.
    ax.set_xlim(0, 70)
    ax.set_ylim(0, 60)
    in_level = train_data.NObeyesdad == i
    for j in range(2):
        rows = in_level & (train_data.Gender == j)
        ax.scatter(train_data.Age[rows], train_data.BMI[rows], s=3, alpha=0.2)
    ax.set_xlabel('Age')
    ax.set_ylabel('BMI')
    ax.set_title('NObeyesdad=' + str(i))
plt.show()