In [1]:
import sys
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import datetime
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from xgboost import XGBRegressor
import joblib

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive so files stored
# on Drive can be accessed through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
cd ./drive/MyDrive/Colab Notebooks/unilevel

/content/drive/MyDrive/Colab Notebooks/unilevel


In [6]:
# Load the raw sales table and drop exact duplicate rows.
# The index is reset afterwards: drop_duplicates keeps the original row labels,
# so removed rows would leave gaps that break any later index-aligned
# concatenation with a freshly built (0..n-1) frame.
predict_sales = (
    pd.read_csv('./train_sales.csv')
    .drop_duplicates()
    .reset_index(drop=True)
)

In [7]:
# Identifier columns used later to attach predictions to their rows.
# An explicit copy is taken so that adding columns to final_id later never
# operates on a slice of predict_sales (avoids SettingWithCopy issues).
final_id = predict_sales[['uuid','channel','sales_period_']].copy()

# Features: only `channel` and `sales_period_` are used

In [8]:
def channel(df_in):
    """One-hot encode the sales channel.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing a ``channel`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``channel`` has been replaced by one indicator
        column per distinct value, named ``channel_<value>`` and appended
        after the remaining original columns.
    """
    # With `columns=` given, get_dummies encodes only that column, removes it
    # from the result and appends the indicator columns at the end.
    return pd.get_dummies(df_in, columns=['channel'], prefix='channel')

In [9]:
# Feature engineering: the only step is one-hot encoding of the channel.
# Progress messages bracket the step.
print('Start processing...')
print('channel')
# channel() returns a new frame, so predict_sales itself stays unchanged.
df_proc = channel(predict_sales)
print('processing finish!')

Start processing...
channel
processing finish!


In [10]:
# Display the frame after channel one-hot encoding.
df_proc

Unnamed: 0,uuid,sales_period_,sales_value,channel_EC,channel_RT
0,112777,6,8.201381,1,0
1,112777,6,5.497663,0,1
2,157886,6,8.775864,1,0
3,157886,6,7.876226,0,1
4,162194,6,10.007262,1,0
...,...,...,...,...,...
157,962209,12,10.235670,1,0
158,962209,12,10.067857,0,1
159,982584,12,9.870377,1,0
160,982584,12,9.844962,0,1


# normalization

In [11]:
df_nor = df_proc.copy()

# Keep the identifier and the target aside; every remaining column is a feature.
Y = df_nor[['uuid','sales_value']]
X = df_nor.drop(['sales_value','uuid'],axis=1)

# Standardise each feature column to zero mean and unit variance.
ss = StandardScaler()
std_data = ss.fit_transform(X)
# Round-trip back to the original scale (kept as a sanity-check value).
origin_data = ss.inverse_transform(std_data)

# fit_transform returns a bare ndarray, so a DataFrame built from it would get
# a fresh 0..n-1 index. Reuse X's index so the axis=1 concat below lines up
# row-for-row with Y even when the index has gaps (e.g. after drop_duplicates);
# otherwise the concat would outer-join on labels and introduce NaN rows.
df_std_ = pd.DataFrame(std_data, index=X.index)
df_std = pd.concat([Y, df_std_], axis=1)

In [12]:
# Display the identifier/target columns next to the standardised features.
df_std

Unnamed: 0,uuid,sales_value,0,1,2
0,112777,8.201381,-1.0,1.206045,-1.206045
1,112777,5.497663,-1.0,-0.829156,0.829156
2,157886,8.775864,-1.0,1.206045,-1.206045
3,157886,7.876226,-1.0,-0.829156,0.829156
4,162194,10.007262,-1.0,1.206045,-1.206045
...,...,...,...,...,...
157,962209,10.235670,1.0,1.206045,-1.206045
158,962209,10.067857,1.0,-0.829156,0.829156
159,982584,9.870377,1.0,1.206045,-1.206045
160,982584,9.844962,1.0,-0.829156,0.829156


# 模型训练

In [13]:
df_model = df_std.copy()

# Target is the sales value; features are everything except the id and target.
y = df_model['sales_value']
x = df_model.drop(columns=['uuid', 'sales_value'])

# Fit an XGBoost regressor with default hyper-parameters on the full table.
xgb = XGBRegressor()
model = xgb.fit(x, y)

# Persist the fitted model next to the notebook.
# NOTE(review): only the model is saved here; the StandardScaler `ss` fitted in
# the normalization step is not persisted in the visible cells.
joblib.dump(model, './xgboost.pkl')



['./xgboost.pkl']

# 读取模型

In [14]:
# Reload the fitted model from disk; this rebinds `xgb` to the loaded estimator.
# joblib.load unpickles the file, so only load model files from a trusted source.
xgb = joblib.load('./xgboost.pkl')



In [15]:
# Predict with the reloaded model. Note that `x` is the same feature matrix the
# model was fitted on, so these are in-sample predictions.
predict_result = xgb.predict(x)
# Show the estimator configuration followed by the raw prediction array.
print(xgb)
print(predict_result)

XGBRegressor(missing=nan)
[9.402147 8.818059 9.402147 8.818059 9.402147 8.818059 9.402147 8.818059
 9.402147 8.818059 8.818059 8.818059 9.402147 8.818059 8.818059 9.402147
 8.818059 9.402147 8.818059 9.402147 8.818059 9.402147 8.818059 9.402147
 8.818059 8.818059 9.402147 8.818059 9.402147 8.818059 9.402147 9.402147
 8.818059 8.818059 9.402147 8.818059 9.402147 8.818059 9.402147 8.818059
 9.402147 8.818059 9.402147 8.818059 9.402147 8.818059 8.818059 9.402147
 8.818059 8.818059 9.402147 8.818059 9.402147 8.818059 9.402147 8.818059
 8.818059 8.818059 8.818059 8.818059 8.818059 9.402147 8.818059 9.402147
 8.818059 9.402147 8.818059 8.818059 9.402147 8.818059 8.818059 9.402147
 8.818059 8.818059 9.402147 8.818059 9.402147 8.818059 9.402147 8.818059
 8.818059 9.693809 9.12475  9.693809 9.12475  9.693809 9.12475  9.693809
 9.12475  9.693809 9.12475  9.12475  9.12475  9.693809 9.12475  9.12475
 9.693809 9.12475  9.693809 9.12475  9.693809 9.12475  9.693809 9.12475
 9.693809 9.12475  9.12475 

# 生成最终结果

In [16]:
# Value used for rows that end up without a prediction after the merge.
# NOTE(review): the origin of this constant is not shown in the notebook — confirm.
FALLBACK_SALES_VALUE = 9.206592402
KEY_COLS = ['uuid','channel','sales_period_']

# Attach the predictions to the identifier columns (positional, row-for-row).
# Rebuilding final_id from its key columns, instead of inserting into it in
# place, keeps this cell re-runnable: DataFrame.insert raises ValueError when
# the column already exists, and it would also mutate a slice of predict_sales.
final_id = final_id[KEY_COLS].assign(predict_result=predict_result)

# Join the predictions back onto the original rows and drop the observed target.
df_final = (
    pd.merge(predict_sales, final_id, on=KEY_COLS, how='outer')
    .drop(columns='sales_value')
)

# Assign the filled column back explicitly: calling fillna(inplace=True) on a
# column selection is a chained in-place operation, which is deprecated and
# has no effect under pandas Copy-on-Write.
df_final['predict_result'] = df_final['predict_result'].fillna(FALLBACK_SALES_VALUE)

# Present the prediction under the original target column name.
df_final = df_final.rename(columns={'predict_result': 'sales_value'})

In [17]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
df_final.info()
# Display the final table of predictions.
df_final

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162 entries, 0 to 161
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uuid           162 non-null    int64  
 1   channel        162 non-null    object 
 2   sales_period_  162 non-null    int64  
 3   sales_value    162 non-null    float32
dtypes: float32(1), int64(2), object(1)
memory usage: 5.7+ KB
None


Unnamed: 0,uuid,channel,sales_period_,sales_value
0,112777,EC,6,9.402147
1,112777,RT,6,8.818059
2,157886,EC,6,9.402147
3,157886,RT,6,8.818059
4,162194,EC,6,9.402147
...,...,...,...,...
157,962209,EC,12,9.693809
158,962209,RT,12,9.124750
159,982584,EC,12,9.693809
160,982584,RT,12,9.124750
