In [67]:
import os
import pandas as pd
import mglearn

In [68]:
# Column labels for adult.data, which is read without a header row (header=None below).
# NOTE(review): "hours-pre-week" looks like a typo for "hours-per-week"; it is
# kept unchanged here because the label is part of the frame's schema.
adult_columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "gender",
    "capital-gain", "capital-loss", "hours-pre-week", "native-country",
    "income",
]

# The file is located under mglearn's data directory.
adult_path = os.path.join(mglearn.datasets.DATA_PATH, "adult.data")
data = pd.read_csv(adult_path, header=None, names=adult_columns)

In [69]:
# Preview the first five rows to check that the assigned column names line up with the values.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-pre-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [70]:
# Count rows per workclass label; note the "?" entry (presumably a placeholder for unknown values).
data["workclass"].value_counts()

workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

In [71]:
# unique() returns the raw strings in order of first appearance; every label
# carries a leading space (e.g. ' ?'), which the value_counts display does not make obvious.
data["workclass"].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [72]:
# Label to be replaced and the label that replaces it, spelled out explicitly
# (including the leading space present in the raw values).
# Picking them by position in unique() is fragile: the positions depend on
# first-appearance order and shift once " ?" has been replaced, so re-running
# the cell would bind different labels.
replace_source = " ?"
replace_dist = " Private"


In [73]:
# Fold the " ?" rows into " Private" (the most frequent workclass label).
# The labels are written out instead of being looked up by position in
# unique(): after the first run " ?" is gone, the positions shift, and a
# second run would silently merge a different category into " Private".
# With explicit labels the cell is idempotent.
data["workclass"] = data["workclass"].replace({" ?": " Private"})

In [74]:
# Re-count the labels to verify the replacement: "?" should be gone and its rows added to "Private".
data["workclass"].value_counts()

workclass
Private             24532
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

In [75]:
# Preview the first five rows again after the workclass replacement.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-pre-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [76]:
# Remove the "race" column before encoding. errors="ignore" keeps this cell
# re-runnable: on a second run "race" is already gone and a plain drop would
# raise KeyError.
data = data.drop(columns=["race"], errors="ignore")

# One-hot encode the listed categorical columns; every other column is passed
# through unchanged (including the string columns not listed here).
data_dummies = pd.get_dummies(
    data,
    columns=["workclass", "education", "gender", "occupation", "income"],
)
data_dummies.head()

Unnamed: 0,age,fnlwgt,education-num,marital-status,relationship,capital-gain,capital-loss,hours-pre-week,native-country,workclass_ Federal-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,77516,13,Never-married,Not-in-family,2174,0,40,United-States,False,...,False,False,False,False,False,False,False,False,True,False
1,50,83311,13,Married-civ-spouse,Husband,0,0,13,United-States,False,...,False,False,False,False,False,False,False,False,True,False
2,38,215646,9,Divorced,Not-in-family,0,0,40,United-States,False,...,False,False,False,False,False,False,False,False,True,False
3,53,234721,7,Married-civ-spouse,Husband,0,0,40,United-States,False,...,False,False,False,False,False,False,False,False,True,False
4,28,338409,13,Married-civ-spouse,Wife,0,0,40,Cuba,False,...,False,False,False,True,False,False,False,False,True,False


In [77]:
# List the column labels after one-hot encoding (original column name + "_" + category value).
data_dummies.columns

Index(['age', 'fnlwgt', 'education-num', 'marital-status', 'relationship',
       'capital-gain', 'capital-loss', 'hours-pre-week', 'native-country',
       'workclass_ Federal-gov', 'workclass_ Local-gov',
       'workclass_ Never-worked', 'workclass_ Private',
       'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc',
       'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th',
       'education_ 11th', 'education_ 12th', 'education_ 1st-4th',
       'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th',
       'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors',
       'education_ Doctorate', 'education_ HS-grad', 'education_ Masters',
       'education_ Preschool', 'education_ Prof-school',
       'education_ Some-college', 'gender_ Female', 'gender_ Male',
       'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces',
       'occupation_ Craft-repair', 'occupation_ Exec-managerial',
       'occupation_ Farming-fishin

In [78]:
# Build the feature table: drop the string columns that were not one-hot
# encoded, and both income indicator columns (income is the prediction target).
features = data_dummies.drop(
    columns=["marital-status", "relationship", "native-country", "income_ <=50K", "income_ >50K"]
)

# Extract NumPy arrays.
# The remaining columns mix integer and boolean dtypes, for which a plain
# to_numpy() produces an object-dtype array; request float explicitly so X is
# a proper numeric matrix (True/False become 1.0/0.0).
X = features.to_numpy(dtype=float)
y = data_dummies["income_ >50K"].to_numpy()


print(f"X.shape: {X.shape}, y.shape: {y.shape}")


X.shape: (32561, 47), y.shape: (32561,)


In [85]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif

In [87]:
# Hold out a test split; random_state pins the shuffle so the score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the univariate selector on the training split only, then apply the same
# 10-feature mask to both splits.
selector = SelectKBest(f_classif, k=10)
selector.fit(X_train, y_train)
X_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Train on the selected training features and report accuracy on the held-out split.
logreg = LogisticRegression(max_iter=10000).fit(X_selected, y_train)
print(f"Test score: {logreg.score(X_test_selected, y_test)}")

Test score: 0.821152192605331


In [89]:
# Small toy frame with one integer-coded column and one string column,
# used to demonstrate how get_dummies treats each kind.
integer_values = [0, 1, 2, 1]
category_values = ["socks", "fox", "socks", "box"]
demo_df = pd.DataFrame(
    {
        "Integer Feature": integer_values,
        "Category Feature": category_values,
    }
)
demo_df.head()

Unnamed: 0,Integer Feature,Category Feature
0,0,socks
1,1,fox
2,2,socks
3,1,box


In [93]:
# Show the column labels of the toy frame.
demo_df.columns

Index(['Integer Feature', 'Category Feature'], dtype='object')

In [94]:
# Naming both columns in `columns` makes get_dummies one-hot encode the
# integer-coded column as well as the string column.
pd.get_dummies(demo_df, columns=["Integer Feature", "Category Feature"])

Unnamed: 0,Integer Feature_0,Integer Feature_1,Integer Feature_2,Category Feature_box,Category Feature_fox,Category Feature_socks
0,True,False,False,False,False,True
1,False,True,False,False,True,False
2,False,False,True,False,False,True
3,False,True,False,True,False,False
