In [6]:
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# 数据加载

In [7]:
# Read the 14-day user/video action training table from its CSV file.
train_csv_path = "../dataset/traindata/user_video_14day_action_train_data.csv"
data = pd.read_csv(train_csv_path)

In [8]:
# Pull the watch_label column out as its own Series (it is the target passed to
# train_test_split later on) and display it.
watch_label = data.loc[:, 'watch_label']
watch_label

0          0
1          0
2          3
3          9
4          0
          ..
7308013    0
7308014    2
7308015    1
7308016    0
7308017    0
Name: watch_label, Length: 7308018, dtype: int64

In [9]:
# Keep the is_share column as its own Series and display it.
share_label = data.loc[:, 'is_share']
share_label

0          0
1          0
2          0
3          0
4          0
          ..
7308013    0
7308014    0
7308015    0
7308016    0
7308017    0
Name: is_share, Length: 7308018, dtype: int64

In [10]:
# Remove both label columns so that `data` holds only the columns fed to the model.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: running it a second time no longer raises KeyError for the
# already-removed columns.
data = data.drop(columns=['is_share', 'watch_label'], errors='ignore')

In [11]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,user_id,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,gender_0,...,class_5,class_6,class_7,class_8,class_9,da_0,da_1,da_2,da_3,da_4
0,17938.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.041772,0.349582,0.041779,0.04178,0.041772,0.324848,0.083544,0.083544,0.083544,0.424519
1,17938.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.05,0.05,0.05,0.05,0.05,0.1,0.1,0.6,0.1,0.1
2,17938.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.03548,0.035478,0.035479,0.035482,0.03548,0.432491,0.075271,0.073186,0.071487,0.347565
3,4263520.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.037601,0.037601,0.037601,0.037601,0.037601,0.075201,0.075202,0.699193,0.075201,0.075202
4,5181723.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034261,0.034301,0.034266,0.034261,0.034261,0.50428,0.072258,0.070551,0.069007,0.283903


# watch_label预测

In [12]:
# Split into training and test data: 20% of the rows are held out, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    watch_label,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Configure a multiclass LightGBM classifier.
# - `num_class` is intentionally not passed: the scikit-learn wrapper derives the
#   class count from the labels given to fit(), and the previously hard-coded 4
#   disagreed with the watch_label values displayed earlier (which include 9).
# - `verbosity=1` replaces `silent=False`, which was deprecated in LightGBM 3.3 and
#   removed in 4.0; both ask for LightGBM's normal info-level logging.
# NOTE(review): the training cell below builds a new LGBMClassifier and rebinds
# `gbm`, so this configuration is currently never fitted -- confirm which one is
# intended.
gbm = LGBMClassifier(
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=160,
    objective='multiclass',
    verbosity=1,
)


In [None]:
# Train the model.
# NOTE(review): this rebinding replaces the estimator configured in the previous
# cell (n_estimators=160, objective='multiclass') with a 20-tree one -- confirm
# which configuration is intended.
gbm = LGBMClassifier(num_leaves=31, learning_rate=0.05, n_estimators=20)
# Early stopping is requested through the callback API: the `early_stopping_rounds`
# keyword of fit() was deprecated in LightGBM 3.3 and removed in 4.0, where passing
# it raises TypeError. Training stops once the eval_set metric has not improved
# for 5 consecutive rounds.
# NOTE(review): the same (X_test, y_test) split drives early stopping here and the
# accuracy report afterwards, so that accuracy is optimistically biased; consider a
# separate validation split.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[early_stopping(stopping_rounds=5)],
)

In [None]:

# Persist the fitted classifier to disk, then load it back from the same file.
model_path = 'loan_model.pkl'
joblib.dump(gbm, model_path)
gbm = joblib.load(model_path)

# Predict on the held-out rows, limiting the model to gbm.best_iteration_ rounds.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)

# Report accuracy on the held-out rows.
holdout_accuracy = accuracy_score(y_test, y_pred)
print('The accuracy of prediction is:', holdout_accuracy)

# Report the per-feature importance values of the fitted model.
importance_values = list(gbm.feature_importances_)
print('Feature importances:', importance_values)