In [1]:
# set file path
filepath = '../fact/'
trainfile = 'train.csv'
testfile = 'test.csv'

In [3]:
# read train.csv
import pandas as pd
df_train = pd.read_csv(filepath+trainfile)
df_test = pd.read_csv(filepath+testfile)

In [123]:
#from sklearn.naive_bayes import MultinomialNB
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, classification_report, f1_score

def get_text_model(train, valid):    
    prefix = 'word_'
    c_word = [column for column in train.columns.tolist() if prefix == column[:len(prefix)]]
    prefix = 'url_'
    c_url = [column for column in train.columns.tolist() if prefix == column[:len(prefix)]]
    prefix = 'hashtag_'
    c_hashtag = [column for column in train.columns.tolist() if prefix == column[:len(prefix)]]
    
    # fill nan
    train.fillna(0, inplace=True)
    
    X_train, X_valid, X_test = train[c_word+c_url+c_hashtag].values, valid[c_word+c_url+c_hashtag].values, test[c_word+c_url+c_hashtag].values
    y_train, y_valid = train.target.values, valid.target.values
    
    # fit model
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    lgb_test = lgb.Dataset(X_test)
    
    lgbm_params = {
        'objective': 'binary',
        'metric':'binary_logloss', 
        'verbose': -1,
        'learning_rate': 0.01,
        'num_iterations': 1000
    }

    model = lgb.train(
        lgbm_params, 
        lgb_train, 
        valid_sets=lgb_valid,
        verbose_eval=False,
        early_stopping_rounds=10
    )
    
    def report(X, y):
        # print train report
        y_pred = model.predict(X, num_iteration=model.best_iteration)
        y_pred_cls = y_pred >= 0.5
        print('f1:{}'.format(f1_score(y, y_pred_cls, average=None)[0]))
        print(confusion_matrix(y, y_pred_cls))
        print(classification_report(y, y_pred_cls))
    
    report(X_train, y_train)
    report(X_valid, y_valid)
    
    # fit train and valid
    X = np.concatenate([X_train, X_valid], 0)
    y = np.concatenate([y_train, y_valid], 0)
    lgb_train_valid = lgb.Dataset(X, y)
    model = lgb.train(
        lgbm_params, 
        lgb_train_valid, 
        verbose_eval=False
    )
    
    report(X_test, y_test)
    
    # retrun proba
    return (
        model.predict(X_train, num_iteration=model.best_iteration), 
        model.predict(X_valid, num_iteration=model.best_iteration),
        model.predict(X_test, num_iteration=model.best_iteration)
    )

y_train_text_proba, y_test_text_proba = get_text_model(df_train, df_test)

f1:0.8554968795007201
[[2673  271]
 [ 632 2312]]
              precision    recall  f1-score   support

           0       0.81      0.91      0.86      2944
           1       0.90      0.79      0.84      2944

    accuracy                           0.85      5888
   macro avg       0.85      0.85      0.85      5888
weighted avg       0.85      0.85      0.85      5888

f1:0.823793490460157
[[367  68]
 [ 89 238]]
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       435
           1       0.78      0.73      0.75       327

    accuracy                           0.79       762
   macro avg       0.79      0.79      0.79       762
weighted avg       0.79      0.79      0.79       762

f1:0.8177934154310129
[[1627  234]
 [ 491  911]]
              precision    recall  f1-score   support

           0       0.77      0.87      0.82      1861
           1       0.80      0.65      0.72      1402

    accuracy                           0

In [126]:
from catboost import CatBoost
from catboost import Pool
from sklearn.metrics import confusion_matrix, classification_report, f1_score

def get_category_model(train, valid, test):
    c_text = ['keyword', 'location']
    X_train, X_valid, X_test = train[c_text].values, valid[c_text].values, test[c_text].values
    y_train, y_valid = train.target, valid.target
    
    # CatBoost が扱うデータセットの形式に直す
    train_pool = Pool(X_train, label=y_train)
    valid_pool = Pool(X_valid, label=y_valid)
    test_pool = Pool(X_test)

    # 学習用のパラメータ
    params = {
        # タスク設定と損失関数
        'loss_function': 'Logloss',
        # 学習ラウンド数
        'num_boost_round': 1000,
        'eval_metric': 'F1',
        'silent': False,
        'verbose': None,
        'early_stopping_rounds': 10
    }

    # モデルを学習する
    model = CatBoost(params)
    model.fit(train_pool, logging_level='Silent')
    
    def report(X_pool, y):
        y_pred = model.predict(X_pool, prediction_type='Class')
        print('f1:{}'.format(f1_score(y, y_pred, average=None)[0]))
        print(confusion_matrix(y, y_pred))
        print(classification_report(y, y_pred))
    
    report(train_pool, y_train)
    report(valid_pool, y_valid)
    
    # fit train and valid
    X = np.concatenate([X_train, X_valid], 0)
    y = np.concatenate([y_train, y_valid], 0)
    pool = Pool(X, label=y)
    model.fit(pool, logging_level='Silent')
    
    report(test_pool, y_test)
    
    # retrun proba
    return (
        model.predict(train_pool, prediction_type='Probability')[:, 1], 
        model.predict(valid_pool, prediction_type='Probability')[:, 1],
        model.predict(test_pool, prediction_type='Probability')[:, 1]
    )

y_train_cat_proba, y_valid_cat_proba, y_test_cat_proba = get_category_model(df_train, df_valid, df_test)

f1:0.7745313291673704
[[2293  651]
 [ 684 2260]]
              precision    recall  f1-score   support

           0       0.77      0.78      0.77      2944
           1       0.78      0.77      0.77      2944

    accuracy                           0.77      5888
   macro avg       0.77      0.77      0.77      5888
weighted avg       0.77      0.77      0.77      5888

f1:0.7765830346475507
[[325 110]
 [ 77 250]]
              precision    recall  f1-score   support

           0       0.81      0.75      0.78       435
           1       0.69      0.76      0.73       327

    accuracy                           0.75       762
   macro avg       0.75      0.76      0.75       762
weighted avg       0.76      0.75      0.76       762

f1:0.7518267929634641
[[1389  472]
 [ 445  957]]
              precision    recall  f1-score   support

           0       0.76      0.75      0.75      1861
           1       0.67      0.68      0.68      1402

    accuracy                           

In [127]:
from sklearn.linear_model import LogisticRegression
import numpy as np

X_train = np.stack([y_train_text_proba, y_train_cat_proba], 1)
X_valid = np.stack([y_valid_text_proba, y_valid_cat_proba], 1)
X_test = np.stack([y_test_text_proba, y_test_cat_proba], 1)
y_train, y_valid = df_train.target, df_valid.target

clf = LogisticRegression(
    class_weight = 'balanced',
    random_state = 0,
    penalty = 'elasticnet',
    l1_ratio = 0.0, 
    C = 0.001,
    solver='saga'
)

def report(X, y):
    y_pred = clf.predict(X)
    print('f1:{}'.format(f1_score(y, y_pred, average=None)[0]))
    print(confusion_matrix(y, y_pred))
    print(classification_report(y, y_pred))

# 再学習
X = np.concatenate([X_train, X_valid], 0)
y = np.concatenate([y_train, y_valid], 0)
clf.fit(X, y)

report(X_train, y_train)
report(X_valid, y_valid)
report(X_test, y_test)

f1:0.8547120418848166
[[2612  332]
 [ 556 2388]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.85      2944
           1       0.88      0.81      0.84      2944

    accuracy                           0.85      5888
   macro avg       0.85      0.85      0.85      5888
weighted avg       0.85      0.85      0.85      5888

f1:0.8849162011173185
[[396  39]
 [ 64 263]]
              precision    recall  f1-score   support

           0       0.86      0.91      0.88       435
           1       0.87      0.80      0.84       327

    accuracy                           0.86       762
   macro avg       0.87      0.86      0.86       762
weighted avg       0.87      0.86      0.86       762

f1:0.8153489569920165
[[1583  278]
 [ 439  963]]
              precision    recall  f1-score   support

           0       0.78      0.85      0.82      1861
           1       0.78      0.69      0.73      1402

    accuracy                           

In [96]:
df_submit.to_csv('../output/submit.csv', index=None)