# XGBoostによる分類モデルのテスト
最適なパラメータを探す
## データ準備

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Seed shared by every random split/shuffle in this cell so that a
# Restart & Run All reproduces the same quiz / train / test partition.
SPLIT_SEED = 0

df = pd.read_csv("./fake_job_postings.csv")
df = df.drop("job_id", axis=1)  # job_id is a running number (0, 1, 2, ...): nothing to learn from it
df = df.fillna('null')          # missing values -> the literal string 'null'

# Fake postings vs. real postings
Fakedf = df[df['fraudulent'] == 1]
Realdf = df[df['fraudulent'] == 0]

# Hold out a random quiz set: 50 fake + 50 real rows (i.e. 50% fake).
# NOTE: the `detaset_*` spelling is kept on purpose; later cells may use these names.
detaset_Fake, quiz_Fake = train_test_split(Fakedf, test_size=50, random_state=SPLIT_SEED)
detaset_Real, quiz_Real = train_test_split(Realdf, test_size=50, random_state=SPLIT_SEED)

# Shuffle the quiz rows so the two classes are interleaved
quizdf = pd.concat([quiz_Real, quiz_Fake])
shuffled_df = quizdf.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
quiz = shuffled_df.drop("fraudulent", axis=1)         # quiz questions (features only)
quiz_solution = shuffled_df["fraudulent"].to_numpy()  # quiz answers (labels)

# Split the remaining rows into train / test per class
# (train_test_split's default keeps 25% for test), then recombine and shuffle.
train_Fake, test_Fake = train_test_split(detaset_Fake, random_state=SPLIT_SEED)
train_Real, test_Real = train_test_split(detaset_Real, random_state=SPLIT_SEED)
traindf = pd.concat([train_Real, train_Fake])
testdf = pd.concat([test_Real, test_Fake])
traindf = traindf.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
testdf = testdf.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Training features/labels must come from traindf. They were previously taken
# from testdf, which made the training set identical to the test set.
train_X = traindf.drop('fraudulent', axis=1)
train_y = traindf['fraudulent']
test_X = testdf.drop('fraudulent', axis=1)
test_y = testdf['fraudulent']

# Sanity checks: the two sets differ in size (75% / 25%) and together cover
# every row that was not held out for the quiz.
assert len(train_X) == len(train_y) and len(test_X) == len(test_y)
assert len(train_X) + len(test_X) == len(detaset_Fake) + len(detaset_Real)

## 訓練データ加工
データの傾向を確認し、適切な前処理を行う。

In [4]:
# Plain-text preview of the first five training rows, to eyeball the raw columns.
print(train_X.head(n=5))

                                   title                location  \
0                               Attorney  US, VA, Virginia Beach   
1     Senior QA Engineer (3-4 Years Exp)       IN, DL, New Delhi   
2  Project Administrator Project Support      PH, , QUEZON CITY    
3            Creative Digital Copywriter           GR, I, Athens   
4                         Marketing Lead       US, CA, San Diego   

       department   salary_range  \
0            null           null   
1            null  300000-600000   
2  Administrator     10000-30000   
3        Creative           null   
4            null           null   

                                     company_profile  \
0  Tidewater Finance Co. was established in 1992 ...   
1  Practical Fish was founded in June, 2012 with ...   
2  Collabera is a fast growing, end-to-end inform...   
3  Tribal Worldwide Athens is a digitally centric...   
4  Cashie Commerce is the fastest way to create a...   

                                     

In [6]:
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'matplotlib'

In [5]:
# Pie chart of the class balance in the training labels.
# - value_counts() first: a pie of the raw 0/1 column would draw one wedge per row.
# - kind= must be a keyword: Series.plot() rejects positional arguments with a TypeError.
train_y.value_counts().plot(kind='pie', autopct='%.1f%%', title='train_y class balance')

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

## モデル構築1 (データ加工なし)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Baseline: grid-search the number of trees and the tree depth of an
# XGBoost classifier on the training data as-is.
model = XGBClassifier(random_state=0)  # fixed seed

parameters = {
    "n_estimators": [10, 20, 50, 100, 200, 500],
    "max_depth": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
}

# Select by F1 on the positive (fraudulent) class. The previous
# "neg_mean_absolute_error" is a regression scorer; on hard 0/1 predictions it
# reduces to accuracy, which says little when one class is rare.
# NOTE(review): train_X is passed without any encoding; if it still holds raw
# string columns (as the head() preview suggests) the fit may be rejected -- confirm.
model_opt = GridSearchCV(model, parameters, scoring="f1")
model_opt.fit(train_X, train_y)