### Step5. 來進行一些真正的預測吧
我們來輸入自己的資料，看看自己是不是人生贏家。
### **把自己未來可能的生活狀況餵給模型吧！**

In [2]:
import pandas as pd
from keras import models
from keras.optimizers import Adam

In [3]:
def preprocessing(alldata_df):
  """Clean the census table and return its one-hot encoded model input.

  The frame passed in is modified in place: missing values in the
  workclass, occupation and native-country columns are filled, and the
  derived columns ``age_group`` and ``salary_bin`` are added.

  Steps:
    1. Missing workclass becomes 'Private'.
    2. Missing occupation becomes the most frequent occupation among rows
       with the same workclass ('Never-worked' rows get the occupation
       'Never-worked'). If a workclass has no known occupation at all, its
       missing occupations are left missing instead of raising.
    3. Missing native country becomes 'United-States'.
    4. Age is bucketed into 7 groups labelled 0..6 using right-closed bins
       (16, 27], (27, 37], ..., (77, 90]; ages outside (16, 90] get no group.
    5. ``salary_bin`` is 1.0 where income is '>50K' or '>50K.', else 0.0.

  A summary of the selected columns is printed via ``DataFrame.info()``.

  Args:
    alldata_df: DataFrame holding the raw census columns used below
      (age, workclass, education, ..., native country, income bracket).

  Returns:
    A new DataFrame with the selected feature columns plus ``salary_bin``,
    where every categorical column has been expanded by ``pd.get_dummies``.
  """
  workclass_col = 'workclass 工作類別'
  occupation_col = 'occupation 職業'
  country_col = 'native_country 國籍'
  age_col = 'age 年齡'
  income_col = 'income_bracket 收入'
  label = 'salary_bin'

  # Assign the filled column back instead of calling fillna(inplace=True) on
  # a column selection: the chained in-place form is deprecated and does not
  # update the frame when pandas Copy-on-Write is active.
  alldata_df[workclass_col] = alldata_df[workclass_col].fillna('Private')

  # Fill missing occupations per workclass with that workclass's mode.
  occupation_missing = alldata_df[occupation_col].isnull()
  for workclass in alldata_df[workclass_col].unique():
    same_workclass = alldata_df[workclass_col] == workclass
    rows_to_fill = occupation_missing & same_workclass
    if not rows_to_fill.any():
      continue
    if workclass == "Never-worked":  # people who never worked have no occupation
      fill_value = "Never-worked"
    else:
      # mode() ignores missing values and returns the modes sorted, so the
      # first entry is a deterministic tie-break.
      modes = alldata_df.loc[same_workclass, occupation_col].mode()
      if modes.empty:
        continue  # nothing known for this workclass; leave values missing
      fill_value = modes.iloc[0]
    alldata_df.loc[rows_to_fill, occupation_col] = fill_value

  alldata_df[country_col] = alldata_df[country_col].fillna('United-States')

  # Age groups are stored in a separate column so the raw age stays intact.
  alldata_df["age_group"] = pd.cut(
      alldata_df[age_col],
      bins=[16, 27, 37, 47, 57, 67, 77, 90],
      labels=[0, 1, 2, 3, 4, 5, 6])

  # Binary label as float; the trailing-dot spelling '>50K.' is accepted too.
  alldata_df[label] = alldata_df[income_col].isin(['>50K', '>50K.']).astype(float)

  used_variables = ['age_group', 'workclass 工作類別', 'education 教育程度', 'education_num 教育時間', 'marital_status 婚姻', 'occupation 職業', 'relationship 社會角色', 'race 種族', 'capital_gain 資本收益', 'capital_loss 資本損失', 'hours_per_week 每週工作時間', 'native_country 國籍']
  useddata_df = alldata_df[used_variables + [label]].copy()

  useddata_df.info()

  # One-hot encoding of every categorical (object / category) column.
  return pd.get_dummies(useddata_df)


In [4]:
# used_variables = ['age_group', 'workclass 工作類別', 'education 教育程度', 'education_num 教育時間', 'marital_status 婚姻', 'occupation 職業', 'relationship 社會角色', 'race 種族', 'capital_gain 資本收益', 'capital_loss 資本損失', 'hours_per_week 每週工作時間', 'native_country 國籍']

In [5]:
#@title 選項
# ========== workclass 工作類別 ==========
# ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' 'Self-emp-inc' 'Without-pay' 'Never-worked']

# ========== education 教育程度 ==========
# ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
# 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
# '1st-4th' 'Preschool' '12th']

# ========== marital_status 婚姻 ==========
# ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
# 'Separated' 'Married-AF-spouse' 'Widowed']

# ========== occupation 職業 ==========
# ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
# 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
# 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' 'Protective-serv'
# 'Armed-Forces' 'Priv-house-serv' 'Never-worked']

# ========== relationship 社會角色 ==========
# ['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']

# ========== race 種族 ==========
# ['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']

# ========== gender 性別 ==========
# ['Male' 'Female']

# ========== native_country 國籍 ==========
# ['United-States' 'Cuba' 'Jamaica' 'India' 'Mexico' 'South' 'Puerto-Rico'
# 'Honduras' 'England' 'Canada' 'Germany' 'Iran' 'Philippines' 'Italy'
# 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador' 'Laos' 'Taiwan'
# 'Haiti' 'Portugal' 'Dominican-Republic' 'El-Salvador' 'France'
# 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru'
# 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece'
# 'Nicaragua' 'Vietnam' 'Hong' 'Ireland' 'Hungary' 'Holand-Netherlands']

In [6]:
#@title Predict My Income!

# Load the saved Keras model from Google Drive.
model = models.load_model('/content/drive/MyDrive/GDSC-AI/model')

# Column names for adult.data, which is read below without a header row.
COLUMNS = ["age 年齡",
           "workclass 工作類別",
           "fnlwgt 序號",
           "education 教育程度",
           "education_num 教育時間",
           "marital_status 婚姻",
           "occupation 職業",
           "relationship 社會角色",
           "race 種族",
           "gender 性別",
           "capital_gain 資本收益",
           "capital_loss 資本損失",
           "hours_per_week 每週工作時間",
           "native_country 國籍",
           "income_bracket 收入"]


# My data: a single row describing the person to predict for.
# It has no income column; after the concat below that cell is NaN.
mydata_df = pd.DataFrame({
    'age 年齡': 50,   # Must be 17-90: preprocessing bins ages into (16, 90]
    'workclass 工作類別': 'Private',
    'fnlwgt 序號': 0,   # Unused in demo model
    'education 教育程度': 'Bachelors',
    'education_num 教育時間': 16,    # years
    'marital_status 婚姻': 'Never-married',
    'occupation 職業': 'Tech-support',
    'relationship 社會角色': 'Unmarried',
    'race 種族': 'Asian-Pac-Islander',
    'gender 性別': 'Male',
    'capital_gain 資本收益': 0,
    'capital_loss 資本損失': 0,
    'hours_per_week 每週工作時間': 40,
    'native_country 國籍': 'Taiwan'
}, index=[0])

# Someone's data (alternative example input; uncomment to use it instead)
# mydata_df = pd.DataFrame({
#     'age 年齡': 52,   # Must be 17-90
#     'workclass 工作類別': 'Self-emp-inc',
#     'fnlwgt 序號': 287927,   # Unused in demo model
#     'education 教育程度': 'HS-grad',
#     'education_num 教育時間': 9,    # years
#     'marital_status 婚姻': 'Married-civ-spouse',
#     'occupation 職業': 'Exec-managerial',
#     'relationship 社會角色': 'Wife',
#     'race 種族': 'White',
#     'gender 性別': 'Female',
#     'capital_gain 資本收益': 15024,
#     'capital_loss 資本損失': 0,
#     'hours_per_week 每週工作時間': 40,
#     'native_country 國籍': 'United-States'
# }, index=[0])

# The separator regex swallows the spaces around each comma; "?" marks a
# missing value in this file.
train_df = pd.read_csv(
    "/content/drive/MyDrive/GDSC-AI/adult.data",
    names=COLUMNS,
    sep=r'\s*,\s*',
    engine='python',
    na_values="?")


# Append the new row to the training rows so that one-hot encoding sees every
# category present in the training file and produces the full set of columns.
# NOTE(review): this presumes the model was trained on columns derived from
# this same file, in this same order - confirm against the training notebook.
alldata_df = pd.concat([train_df, mydata_df], axis=0, ignore_index=True)



preprocessed_data = preprocessing(alldata_df)

# Rows after the training rows are the ones to predict. Derive the offset from
# the loaded data instead of hard-coding the row count, and convert explicitly
# to a float array: the encoded frame mixes bool/int/float columns, for which
# `.values` can yield an object array.
X_predict = (
    preprocessed_data.iloc[len(train_df):, :]
    .drop('salary_bin', axis=1)
    .to_numpy(dtype='float32')
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32562 entries, 0 to 32561
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   age_group              32562 non-null  category
 1   workclass 工作類別         32562 non-null  object  
 2   education 教育程度         32562 non-null  object  
 3   education_num 教育時間     32562 non-null  int64   
 4   marital_status 婚姻      32562 non-null  object  
 5   occupation 職業          32562 non-null  object  
 6   relationship 社會角色      32562 non-null  object  
 7   race 種族                32562 non-null  object  
 8   capital_gain 資本收益      32562 non-null  int64   
 9   capital_loss 資本損失      32562 non-null  int64   
 10  hours_per_week 每週工作時間  32562 non-null  int64   
 11  native_country 國籍      32562 non-null  object  
 12  salary_bin             32562 non-null  float64 
dtypes: category(1), float64(1), int64(4), object(7)
memory usage: 3.0+ MB


In [7]:
# Compile the loaded model with a default Adam optimizer, binary
# cross-entropy loss and accuracy as the reported metric.
optimizer = Adam()
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Run the model on the prepared input; the first value of the first output
# row is reported as the probability.
Y_predict = model.predict(X_predict)
income_probability = Y_predict[0][0]
print(f'The probability that your annual income >50K $ is {income_probability}')

The probability that your annual income >50K $ is 0.22769400477409363
