In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def lgb_model(train_data, test_data):
    """Train a cross-validated LightGBM classifier on TF-IDF review features.

    Rows of ``train_data`` whose ``label`` is null are ignored. The remaining
    labels are encoded as ``Positive -> 1`` / ``Negative -> 0`` (the ``binary``
    objective expects labels in {0, 1}). A TF-IDF vocabulary is fitted on the
    ``review`` text of the labelled training rows and the test rows together,
    one model is trained per stratified fold with early stopping on the
    held-out fold's AUC, and the test-set predictions of all folds are
    averaged.

    Neither input frame is modified.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain a ``review`` column (text) and a ``label`` column whose
        non-null values are ``'Positive'`` or ``'Negative'``.
    test_data : pandas.DataFrame
        Must contain a ``review`` column (text).

    Returns
    -------
    numpy.ndarray
        Fold-averaged model output for every row of ``test_data``. The array
        is also printed, as are the per-fold and mean validation AUC.

    Raises
    ------
    ValueError
        If a non-null label is neither ``'Positive'`` nor ``'Negative'``.
    """
    n_splits = 5

    # dropna/reset_index return new objects, so the caller's frame is left
    # untouched and the function can be re-run on the same inputs.
    labelled = train_data.dropna(subset=['label']).reset_index(drop=True)
    label = labelled['label'].map({'Positive': 1, 'Negative': 0})
    unmapped = label.isnull()
    if unmapped.any():
        unexpected = labelled.loc[unmapped, 'label'].unique().tolist()
        raise ValueError('Unexpected label values: {}'.format(unexpected))
    label = label.astype(int)
    len_train = len(labelled)

    # Only the review text is needed for feature extraction. Null reviews are
    # treated as empty documents so the vectorizer does not fail on NaN.
    reviews = pd.concat([labelled['review'], test_data['review']], ignore_index=True)
    reviews = reviews.fillna('').astype(str)

    print('TF-IDF计算......')
    tfidf = TfidfVectorizer()
    # Keep the matrix sparse: a dense (n_docs x vocabulary) array can exhaust
    # memory, and LightGBM consumes scipy sparse matrices directly.
    # NOTE: the vocabulary/IDF weights are fitted on train + test text together.
    features = tfidf.fit_transform(reviews).tocsr()
    train_features = features[:len_train]
    test_features = features[len_train:]

    params = {
        'num_leaves': 32,
        'objective': 'binary',
        'learning_rate': 0.02,
        "boosting": "gbdt",
        "feature_fraction": 0.8,
        "bagging_freq": 1,
        "bagging_fraction": 0.85,
        # "bagging_seed": 23,
        "metric": 'auc',
        "nthread": 6,
        "verbose": -1
    }
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)
    lgb_pre = np.zeros(test_features.shape[0])
    fold_auc = []

    # StratifiedKFold only needs the number of samples and the labels, so a
    # placeholder array stands in for the feature matrix.
    splits = folds.split(np.zeros(len_train), label.values)
    for i, (train_index, valid_index) in enumerate(splits):
        print('Fold ', i+1, ' training......')
        train = lgb.Dataset(train_features[train_index], label=label.iloc[train_index])
        valid = lgb.Dataset(train_features[valid_index], label=label.iloc[valid_index])

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are the
        # pre-4.0 LightGBM keyword arguments; newer releases expect the
        # equivalent callbacks instead - confirm the installed version.
        model = lgb.train(params, train, num_boost_round=10000, valid_sets=[train, valid],
                          verbose_eval=-1, early_stopping_rounds=80)
        # 'valid_1' is the second entry of valid_sets, i.e. the held-out fold.
        fold_auc.append(model.best_score['valid_1']['auc'])
        lgb_pre += model.predict(test_features, num_iteration=model.best_iteration)

    print('Validation AUC per fold:', fold_auc)
    print('Mean validation AUC:', np.mean(fold_auc))

    predictions = lgb_pre / n_splits
    print(predictions)
    return predictions

In [3]:
# Load the training reviews and the test reviews from the working directory,
# then run the TF-IDF + LightGBM cross-validation pipeline on them.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', '20190506_test.csv')
)
lgb_model(train_data, test_data)

TF-IDF计算......
Fold  1  training......


KeyboardInterrupt: 