In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import plot_importance, plot_tree


# modin
# import os
# os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
# os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask
# import modin.pandas as pd

# 数据加载

In [None]:
# Read the 14-day user/video action training CSV (path is relative to the working directory).
data = pd.read_csv("../dataset/traindata/user_video_14day_action_train_data.csv")

In [None]:
# (rows, columns) of the loaded training frame.
data.shape

In [None]:
# Extract the watch_label target column; the bare name on the last line displays it.
watch_label = data['watch_label']
watch_label

In [None]:
# Extract the is_share target column; the bare name on the last line displays it.
share_label = data['is_share']
share_label

In [None]:
# Remove both target columns so that only feature columns remain in `data`.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['is_share', 'watch_label'], errors='ignore')

In [None]:
# Preview the first rows of the feature frame.
data.head()

# scikit-learn 实现XGBoost

## watch_label 预测

### 模型训练

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    watch_label,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Sanity check: the feature and label training splits have the same number of rows.
X_train.shape[0] == y_train.shape[0]

In [None]:
# Multi-class classifier for watch_label, configured for 10 classes.
# NOTE(review): `silent` was superseded by `verbosity` in newer XGBoost
# releases — confirm against the installed version.
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=10,
    max_depth=7,
    learning_rate=0.1,
    n_estimators=160,
    silent=False,
)

In [None]:
# Train the watch_label model. The held-out split is the only evaluation set,
# so it drives early stopping (early_stopping_rounds=20).
model.fit(
    X_train,
    y_train.values,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=20,
)

### 模型保存

In [None]:
# Save: persist the trained watch_label model to disk with joblib.
# (Old scikit-learn releases exposed this as `sklearn.externals.joblib`.)
import joblib
filename = 'watch_xgb_v1.pkl'
joblib.dump(model, filename)
# Alternative target inside a checkpoint folder:
# joblib.dump(model, './ckp/watch_xgb_v1.pkl')

# Load: restore the model later with
# model = joblib.load(filename)

In [None]:
# Predict labels for the held-out split.
ans = model.predict(X_test)

In [None]:
# Convert the held-out labels to a plain ndarray so later cells can compare
# them positionally against the predictions. Unlike `.values`, np.asarray
# also accepts an ndarray, so re-running this cell does not raise
# AttributeError.
y_test = np.asarray(y_test)

### 计算准确率

In [None]:
# Compute accuracy inputs: cnt1 = correct predictions, cnt2 = incorrect ones.
# A vectorized element-wise comparison replaces the per-row Python loop;
# np.asarray makes the comparison positional even if y_test is still a Series.
correct_mask = np.asarray(ans) == np.asarray(y_test)
cnt1 = int(correct_mask.sum())
cnt2 = int(correct_mask.size) - cnt1

In [None]:
# Share of correct predictions on the held-out split, as a percentage.
accuracy_pct = 100 * cnt1 / (cnt1 + cnt2)
print("Accuracy: %.2f %% " % accuracy_pct)

### 显示特征重要性

In [None]:
# Feature-importance chart for the watch_label model (at most 64 features).
fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(model, ax=ax, height=0.5, max_num_features=64)
# plt.savefig('fea_importance.png')  # uncomment to export the chart
plt.show()

### 预测test数据集 

In [43]:
# Load the test set that the trained model will score.
test_data = pd.read_csv("../dataset/testdata/user_video_test.csv")

In [None]:
# (rows, columns) of the test frame.
test_data.shape

In [None]:
# Predict watch_label for every row of the test set.
test_ans = model.predict(test_data)

In [None]:
# Wrap the predictions in a DataFrame (no column name is given, so the default label is used).
test_ans_df = pd.DataFrame(test_ans)

In [None]:
# Summary statistics of the predicted watch_label values.
test_ans_df.describe()

In [None]:
# Dtype, non-null count and memory usage of the prediction frame.
test_ans_df.info()

In [None]:
# Write the watch_label predictions to CSV without the row index.
watch_result_path = "../dataset/testdata/watch_label_result.csv"
test_ans_df.to_csv(watch_result_path, index=False)

## share_label 预测

In [None]:
# Re-split the same features against the share target: 20% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    share_label,
    test_size=0.2,
    random_state=666,
)

In [None]:
# Drop the reference to the watch_label model before the share model is built.
del model

### 模型训练

In [None]:
# Binary classifier for the share target.
# NOTE(review): `silent` was superseded by `verbosity` in newer XGBoost
# releases — confirm against the installed version.
share_params = dict(
    objective='binary:logistic',
    max_depth=7,
    learning_rate=0.1,
    n_estimators=160,
    silent=False,
)
model = xgb.XGBClassifier(**share_params)

In [None]:
    # Train the share model while recording both splits and both metrics, so
    # that evals_result() contains validation_0 (train) and validation_1
    # (held-out) histories for 'error' and 'auc' — the keys the learning-curve
    # cell reads. Early stopping uses the LAST eval_set entry and the LAST
    # eval_metric, i.e. AUC on the held-out split, the same criterion as before.
    model.fit(
        X_train,
        y_train.values,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric=['error', 'auc'],
        early_stopping_rounds=10,
        verbose=True,
    )

In [None]:
# Per-round evaluation history recorded during fit(); the bare name displays it.
results = model.evals_result()
results

In [None]:
# Learning curves for the share model.
# evals_result() returns {eval_set_name: {metric_name: [value per round]}};
# eval sets are named validation_0, validation_1, ... in eval_set order.
results = model.evals_result()

# Legend text per eval set. Two sets are assumed to be (train, test) in that
# order — TODO confirm against the eval_set passed to fit(); otherwise the
# raw eval-set name is shown.
if len(results) == 2:
    set_labels = dict(zip(results, ('Train', 'Test')))
else:
    set_labels = {name: name for name in results}

# (y-axis label, title) for each metric this cell knows how to draw.
# XGBoost's 'error' metric is the misclassification rate, not accuracy.
metric_text = {
    'auc': ('AUC', 'XGBoost AUC'),
    'error': ('Classification Error', 'XGBoost Classification Error'),
}

for metric, (ylabel, title) in metric_text.items():
    curves = {
        name: history[metric]
        for name, history in results.items()
        if metric in history
    }
    if not curves:
        # The metric was not requested via eval_metric; nothing to draw.
        print("Metric %r was not recorded during fit(); skipping its plot." % metric)
        continue
    fig, ax = plt.subplots()
    for name, values in curves.items():
        # Early stopping makes the round count vary between runs, so the
        # x-axis is derived from the history instead of a fixed range(0, 147).
        ax.plot(range(len(values)), values, label=set_labels[name])
    ax.legend()
    ax.set_xlabel('Boosting round')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    plt.show()

In [None]:
# Show the documentation of fit(). help() is plain Python, unlike the
# IPython-only `model.fit?` form, so the cell also runs when the notebook is
# exported as a script.
help(model.fit)

### 模型保存

In [None]:
# Save: persist the trained share model to disk with joblib.
import joblib
filename = 'share_xgb_v1.pkl'
joblib.dump(model, filename)

### 预测test数据集

In [44]:
# Load the test set for the share model.
# NOTE(review): this re-reads the same file that was already loaded into
# `test_data` for the watch_label prediction above.
test_data = pd.read_csv("../dataset/testdata/user_video_test.csv")

In [45]:
# Predict the share label for every test row.
# NOTE(review): the describe() output below reports mean ≈ 3.5e-07 over
# 2,822,180 rows, i.e. roughly one positive prediction. If the consumer of
# this file expects probabilities rather than hard labels, predict_proba
# may be what is wanted — confirm.
predictions = model.predict(test_data)

In [46]:
# Wrap the share predictions in a DataFrame (no column name is given, so the default label is used).
test_share_ans_df = pd.DataFrame(predictions)

In [47]:
# Summary statistics of the predicted share labels.
test_share_ans_df.describe()

Unnamed: 0,0
count,2822180.0
mean,3.54336e-07
std,0.0005952613
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [48]:
# Write the share predictions to CSV without the row index.
share_result_path = "../dataset/testdata/share_label_result.csv"
test_share_ans_df.to_csv(share_result_path, index=False)

In [None]:
from sklearn.metrics import accuracy_score

# Score the share model on the held-out split.
predictions = model.predict(X_test)
# Unlike `.values`, np.asarray also accepts an ndarray, so re-running this
# cell after y_test has already been converted does not raise AttributeError.
y_test = np.asarray(y_test)
accuracy = accuracy_score(y_test, predictions)
# Printed as a fraction in [0, 1], not as a percentage.
print("Accuracy: %.2f " % (accuracy))

In [None]:
# Feature-importance chart for the share model (at most 64 features),
# saved to disk before it is displayed.
fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(model, ax=ax, height=0.5, max_num_features=64)
plt.savefig('share_fea_importance.png')
plt.show()

In [None]:
# Draw a tree of the fitted share model on a 15x15-inch canvas
# (no tree index is passed, so plot_tree's default selection applies).
fig,ax = plt.subplots(figsize=(15,15))
plot_tree(model, ax=ax)
plt.show()