# 本notebook部分代码参考了论坛的0.872的baseline
https://tianchi.aliyun.com/notebook-ai/detail?spm=5176.12586969.1002.9.163c5cfdonMcBc&postId=89854

In [None]:
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import catboost as cab
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
#from featexp import get_univariate_plots#用于特征筛选，需要先安装featexp

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Use the SimHei font for sans-serif text (presumably so Chinese plot labels render - confirm
# the font is installed on the target machine).
plt.rcParams['font.sans-serif']=['Simhei']
# Draw minus signs on the axes with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus']=False

In [None]:
#Handle class imbalance: the less frequent gear types contribute extra sub-track samples
class get_features():
    """Turn per-vessel trajectory CSV files into one flat list of numeric features.

    Every sample contributes 44 feature values to ``self.features`` (in a fixed
    order), followed by an integer class label when ``train`` is truthy. The
    list is flat: callers reshape it into rows afterwards.

    In training mode a file yields one sample for '拖网', three samples for
    '围网' (whole track plus two halves) and five samples for every other
    type (whole track plus four quarters).

    Attributes:
        train: truthy when the files carry a "type" column that should be
            used for augmentation and labelling.
        features: flat list the extracted values are appended to.
        path: prefix that is concatenated with "<id>.csv" to locate a file.
    """

    #Integer class label per value of the "type" column; any other value maps to 0
    _LABELS={'拖网':2,'刺网':1}

    def __init__(self,path,train):
        self.train=train
        self.features=[]
        self.path=path

    def results(self,ids=None):
        """Extract features for a collection of file ids.

        Args:
            ids: iterable of file ids; each id ``i`` is read from
                ``self.path + str(i) + ".csv"``. Defaults to ``range(7000)``
                in training mode and ``range(7000, 9000)`` otherwise, which
                are the ranges this notebook was written for.
        """
        if ids is None:
            ids=range(7000) if self.train else range(7000,9000)     #(9000,11000)
        for i in ids:
            self.build(self.path+str(i)+".csv")

    def build(self,path):
        """Read one CSV file and append the features of all its samples.

        Args:
            path: location of a CSV file with the columns 'x', 'y', '速度',
                '方向', 'time' (format '%m%d %H:%M:%S') and, in training mode,
                'type'.
        """
        df_raw=pd.read_csv(path)
        if self.train:
            gear=df_raw["type"].iloc[0]
            label=self._LABELS.get(gear,0)
            dfs=self._segments(df_raw,gear)
        else:
            label=None
            dfs=[df_raw]

        for df in dfs:
            self.features.extend(self._track_features(df))
            if label is not None:
                self.features.append(label)

    @staticmethod
    def _segments(df_raw,gear):
        """Return the list of (sub-)tracks that become samples for one file."""
        if gear=='拖网':
            return [df_raw]
        Len=len(df_raw)
        #.loc slices by label and includes both end points, so with the default
        #integer index a boundary row can belong to two neighbouring segments
        if gear=='围网':
            return [df_raw,df_raw.loc[0:Len/2,:],df_raw.loc[Len/2:Len,:]]
        return [df_raw,
                df_raw.loc[0:Len/4,:],
                df_raw.loc[Len/4:Len/2,:],
                df_raw.loc[Len/2:Len*3/4,:],
                df_raw.loc[Len*3/4:Len,:]]

    @staticmethod
    def _track_features(df):
        """Compute the 44 feature values of one track without modifying ``df``."""
        x,y=df['x'],df['y']
        v,d=df['速度'],df['方向']
        feats=[]

        k=y/x
        feats+=[k.min(),k.max(),k.mean()]#k_min,k_max,k_mean

        b=y-k.mean()*x
        feats+=[b.quantile(0.25),b.max(),b.mean()]#b_1/4,b_max,b_mean

        #min,max,mean,1/4,3/4,std for x and then for y
        for s in (x,y):
            feats+=[s.min(),s.max(),s.mean(),s.quantile(0.25),s.quantile(0.75),s.std()]

        feats.append(x.cov(y))#xy_cov
        feats.append(v.cov(d))#vd_cov

        area=(x.quantile(0.95)-x.quantile(0.05))*(y.quantile(0.95)-y.quantile(0.05))
        feats.append(area)#area

        #Parse into local series instead of writing new columns into df, which
        #may be a slice of the frame that was read from disk
        time=pd.to_datetime(df['time'],format='%m%d %H:%M:%S')
        hour=time.dt.hour
        t_diff=time.diff().iloc[1:].dt.total_seconds()
        x_diff=x.diff().iloc[1:].abs()
        y_diff=y.diff().iloc[1:].abs()
        dis=np.sqrt(x_diff**2+y_diff**2).to_numpy().sum()
        x_a_mean=(x_diff/t_diff).mean()
        y_a_mean=(y_diff/t_diff).mean()

        feats.append(dis)#dis
        feats.append(np.sqrt(x_a_mean**2+y_a_mean**2))#a

        feats+=[v.mean(),v.std(),v.quantile(0.75)]#v_mean,v_std,v_3/4

        v_diff=v.diff().iloc[1:].abs()
        feats+=[v_diff.mean(),v_diff.std()]#a_mean,a_std

        feats.append(d.std())#d_std
        d_rate=d.diff().iloc[1:].abs()/t_diff
        feats+=[d_rate.mean(),d_rate.std()]#d_dif_mean,d_dif_std

        #skew,kurt for x,y,speed,direction,hour
        for s in (x,y,v,d,hour):
            feats+=[s.skew(),s.kurt()]

        feats.append(y.max()-x.min())#y_x
        feats.append(x.max()-y.min())#x_y
        #Ratio of the y range to (1 + x range). The previous expression lacked the
        #parentheses around the numerator and evaluated y.max-(y.min/(1+x range))
        feats.append((y.max()-y.min())/(1+(x.max()-x.min())))#x_y_k
        return feats



In [None]:
# Training data: read every training CSV and collect the flat feature list
# (feature values followed by the class label for each sample).
path_train="/kaggle/input/zhhy-data/hy_round1_train_20200102/hy_round1_train_20200102/"
feature_class=get_features(path_train,train=True)
feature_class.results()
train_features=feature_class.features

In [None]:
# Test data: read every test CSV and collect the flat feature list
# (no label is appended because train=False).
path_test="/kaggle/input/zhhy-data/hy_round1_testA_20200102/hy_round1_testA_20200102/"
feature_class=get_features(path_test,train=False)
feature_class.results()
test_features=feature_class.features

In [None]:
# Column names in the exact order in which the feature extractor appends values;
# the class label is the last value of every training sample.
train_columns=['k_min','k_max','k_mean','b_1/4','b_max',
               'b_mean','x_min','x_max','x_mean','x_1/4',
               'x_3/4','x_std','y_min','y_max','y_mean','y_1/4',
               'y_3/4','y_std','xy_cov','vd_cov','area','dis',
               'a','v_mean','v_std','v_3/4',
               'a_mean','a_std',
               'd_std','d_dif_mean','d_dif_std','x_skew','x_kurt','y_skew','y_kurt',
               'v_skew','v_kurt','d_skew','d_kurt','h_skew','h_kurt','y_x','x_y','x_y_k',
               'type']
# Let numpy infer the number of rows from the known number of columns, so the
# cell keeps working when the number of (augmented) training samples changes.
train_data=pd.DataFrame(np.array(train_features).reshape(-1,len(train_columns)),
                        columns=train_columns)


In [None]:
# Column names in the exact order in which the feature extractor appends values
# (test samples carry no label column).
test_columns=['k_min','k_max','k_mean','b_1/4','b_max',
              'b_mean','x_min','x_max','x_mean','x_1/4',
              'x_3/4','x_std','y_min','y_max','y_mean','y_1/4',
              'y_3/4','y_std','xy_cov','vd_cov','area','dis',
              'a','v_mean','v_std','v_3/4',
              'a_mean','a_std',
              'd_std','d_dif_mean','d_dif_std','x_skew','x_kurt','y_skew','y_kurt',
              'v_skew','v_kurt','d_skew','d_kurt','h_skew','h_kurt','y_x','x_y','x_y_k'
              ]
# Let numpy infer the number of rows from the known number of columns instead of
# hard-coding the size of the test set.
test_data=pd.DataFrame(np.array(test_features).reshape(-1,len(test_columns)),
                       columns=test_columns)

# featexp可以用来做特征筛选

In [None]:
#from featexp import get_univariate_plots

# Plots drawn for all features if nothing is passed in feature_list parameter.

#get_univariate_plots(data=train_data, target_col='type', features_list=['k_min'], bins=10)

In [None]:
# Move the label column out of the feature table; train_data keeps only features.
kind=train_data.pop('type')
# Show the share of each class among the training samples. normalize=True divides
# by the actual number of rows rather than by a fixed constant.
kind.map({0:'围网',1:'刺网',2:'拖网'}).value_counts(normalize=True)

In [None]:
# Hold out 10% of the training samples for validation (fixed seed for repeatability).
# NOTE(review): samples built from the same source file (a full track and its
# sub-segments) can end up on both sides of this random split, which presumably
# makes the validation score optimistic - confirm before trusting val_f1.
x_train,x_test,y_train,y_test=train_test_split(
    train_data,
    kind,
    test_size=0.1,
    random_state=8,
)

In [None]:
#x_train=np.log1p(x_train)
#x_test=np.log1p(x_test)

In [None]:
# CatBoost multi-class model, scored with macro F1 on the hold-out and training parts.
# NOTE(review): gpu_cat_features_storage=-1 is passed through unchanged; verify that
# this value is accepted by the installed catboost version.
clf=cab.CatBoostClassifier(
    iterations=125,
    learning_rate=0.3,
    depth=6,
    loss_function='MultiClass',
    silent=True,
    gpu_cat_features_storage=-1,
)
clf.fit(x_train,y_train)

clf_val_f1=f1_score(y_test,clf.predict(x_test),average='macro')
clf_train_f1=f1_score(y_train,clf.predict(x_train),average='macro')

print('val_f1:',clf_val_f1,'train_f1:',clf_train_f1)

In [None]:
# Pair every feature name with its CatBoost importance and list the pairs from the
# most to the least important feature.
tick_label=train_data.columns
clf_dic=dict(zip(tick_label,clf.feature_importances_))
sorted(clf_dic.items(),key=lambda pair: pair[1],reverse=True)

In [None]:
# XGBoost model, scored with macro F1 on the hold-out and training parts.
xlf=xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.09,
    n_estimators=90,
    reg_alpha=0.004,
    n_jobs=-1,
    importance_type='total_cover',
)
xlf.fit(x_train,y_train)

xlf_val_f1=f1_score(y_test,xlf.predict(x_test),average='macro')
xlf_train_f1=f1_score(y_train,xlf.predict(x_train),average='macro')

print('val_f1:',xlf_val_f1,'train_f1:',xlf_train_f1)

- val_f1: 0.9555585471895444 
- train_f1: 1.0

In [None]:
# LightGBM multi-class model, scored with macro F1 on the hold-out and training parts.
llf=lgb.LGBMClassifier(
    num_leaves=9,
    max_depth=5,
    learning_rate=0.31,
    n_estimators=120,
    objective='multiclass',
    n_jobs=-1,
    reg_alpha=0,
    reg_lambda=0,
)
llf.fit(x_train,y_train)

llf_val_f1=f1_score(y_test,llf.predict(x_test),average='macro')
llf_train_f1=f1_score(y_train,llf.predict(x_train),average='macro')

print('val_f1:',llf_val_f1,'train_f1:',llf_train_f1)

In [None]:
# Disabled random-forest experiment, kept as a bare string so it never runs.
# The snippet fills missing values with 0 before fitting, re-splits the data and
# reports macro F1 like the cells above.
# NOTE(review): inside the snippet the model is fitted with rlf.fit(clf_train,y_train),
# i.e. the full table against the labels of the training part only; it would need
# x_train there before it can be re-enabled.
'''
如果要实用随机森林，需要先填补缺省值
clf_train=train_data.fillna(value=0)
x_train,x_test,y_train,y_test=train_test_split(clf_train,kind,test_size=0.1,random_state=78)
rlf=RandomForestClassifier(n_estimators=100, oob_score=True) #随机森林模型
rlf.fit(clf_train,y_train)
rlf_val_f1=f1_score(y_test,rlf.predict(x_test),average='macro')
rlf_train_f1=f1_score(y_train,rlf.predict(x_train),average='macro')

print('val_f1:',rlf_val_f1,'train_f1:',rlf_train_f1)
'''


# 不使用交叉验证

In [None]:
class pre_submit():
    """Compare the test-set predictions of the three models and optionally save a submission.

    ``print_result`` reads the module-level names ``xlf``, ``clf``, ``llf``
    (fitted xgboost, catboost and lightgbm classifiers) and ``test_data``.

    Args:
        block: number of consecutive rows of ``test_data`` that belong to one
            test sample; 1 means every row is its own sample.
        csv: when truthy, the xgboost predictions are written to
            ``Submit_block<block>.csv``.
        start_id: id written next to the first prediction in the CSV file;
            following predictions get consecutive ids.
    """
    def __init__(self,block,csv=False,start_id=7000):
        self.block=block
        self.csv=csv
        self.start_id=start_id

    def print_result(self):
        """Print class ratios per model and pairwise disagreement counts; optionally write the CSV."""
        if self.block==1:
            xlf_result=xlf.predict(test_data)
            clf_result=clf.predict(test_data)
            # Flatten to one dimension whatever the number of test rows is
            llf_result=np.asarray(llf.predict(test_data)).reshape(-1)
        else:
            xlf_pre=xlf.predict_proba(test_data)#(samples*block,3)
            clf_pre=clf.predict_proba(test_data)
            llf_pre=llf.predict_proba(test_data)
            xlf_result=[]
            clf_result=[]
            llf_result=[]
            # Average the class probabilities of each group of `block` rows and
            # take the most probable class for the group
            for i in range(len(xlf_pre)//self.block):
                rows=slice(i*self.block,(i+1)*self.block)
                xlf_result.append(np.argmax(xlf_pre[rows].mean(axis=0)))
                clf_result.append(np.argmax(clf_pre[rows].mean(axis=0)))
                llf_result.append(np.argmax(llf_pre[rows].mean(axis=0)))

        print('xgb的预测比例：',self.cal_three(xlf_result))
        print('cat的预测比例：',self.cal_three(clf_result))
        print('lgb的预测比例：',self.cal_three(llf_result))
        print('xgb和cat的diff:',self.cal_diff(xlf_result,clf_result))
        print('xgb和lgb的diff:',self.cal_diff(xlf_result,llf_result))
        print('cat和lgb的diff:',self.cal_diff(clf_result,llf_result))

        # Optionally write the submission file (xgboost predictions only)
        if self.csv:
            sub=pd.DataFrame(np.arange(self.start_id,self.start_id+len(xlf_result),1))
            sub["type"]=pd.Series(xlf_result).map({0:'围网',1:'刺网',2:'拖网'})
            sub.to_csv('Submit_block'+str(self.block)+'.csv',index=None, header=None,encoding="utf-8")
            print(sub.head(5))

    def cal_diff(self,list1,list2):
        """Return the number of positions at which the two prediction sequences differ.

        The comparison runs over the common prefix of the two sequences.
        """
        return sum(1 for a,b in zip(list1,list2) if a!=b)

    def cal_three(self,pres):
        """Return ``(ratios, counts)`` of the three classes among the predictions.

        0 counts as '围网', 1 as '刺网' and every other value as '拖网'. Ratios
        are relative to ``len(pres)``; an empty input gives ratios of 0.0.
        """
        dic={'拖网':0,'围网':0,'刺网':0}
        for pre in pres:
            if pre==0:
                dic['围网']+=1
            elif pre==1:
                dic['刺网']+=1
            else:
                dic['拖网']+=1
        total=len(pres)
        re={name:(count/total if total else 0.0) for name,count in dic.items()}
        return (re,dic)

# Compare the three models on the unsplit test set (one row per sample) and only
# print the statistics; no submission file is written because csv=False.
pre_class=pre_submit(block=1,csv=False)
pre_class.print_result()

## **原始训练数据的分布：**
- 拖网：    0.623000
- 围网：    0.231571
- 刺网：    0.145429

## **将训练分为了９块（拖网１块，围网１＋２＝３块，刺网１＋４＝５块）,加上权重共14314条**
- 拖网    0.305
- 围网    0.340
- 刺网    0.356

## **测试不分块**：全块权重为１
- xgb的预测比例： ({'拖网': 0.617, '围网': 0.24, '刺网': 0.143}, {'拖网': 1234, '围网': 480, '刺网': 286})
- cat的预测比例： ({'拖网': 0.6205, '围网': 0.241, '刺网': 0.1385}, {'拖网': 1241, '围网': 482, '刺网': 277})
- lgb的预测比例： ({'拖网': 0.603, '围网': 0.242, '刺网': 0.155}, {'拖网': 1206, '围网': 484, '刺网': 310})
- xgb和cat的diff: 152
- xgb和lgb的diff: 145
- cat和lgb的diff: 148

## **训练刺网、围网分块，测试分９块**：全块权重为３/9
- xgb的预测比例： ({'拖网': 0.564, '围网': 0.284, '刺网': 0.152}, {'拖网': 1128, '围网': 568, '刺网': 304})
- cat的预测比例： ({'拖网': 0.552, '围网': 0.288, '刺网': 0.16}, {'拖网': 1104, '围网': 576, '刺网': 320})
- lgb的预测比例： ({'拖网': 0.5415, '围网': 0.287, '刺网': 0.1715}, {'拖网': 1083, '围网': 574, '刺网': 343})
- xgb和cat的diff: 91
- xgb和lgb的diff: 84
- cat和lgb的diff: 94


# 采取交叉验证的方式

In [None]:
# Flat per-fold log: best blended score, the three single-model scores and the
# three blend weights (7 values per fold, reshaped into a table afterwards).
details=[]
# One array of predicted test-set classes per fold.
answers=[]

sk=StratifiedKFold(n_splits=10,shuffle=True,random_state=2020)
for train,test in sk.split(train_data,kind):

    x_train=train_data.iloc[train]
    y_train=kind.iloc[train]
    x_test=train_data.iloc[test]
    y_test=kind.iloc[test]

    # Fit each model on the fold and record its own macro F1 on the held-out part
    xlf.fit(x_train,y_train)
    pred_xgb=xlf.predict(x_test)
    weight_xgb=f1_score(y_test,pred_xgb,average='macro')

    llf.fit(x_train,y_train)
    pred_llf=llf.predict(x_test)
    weight_lgb=f1_score(y_test,pred_llf,average='macro')

    clf.fit(x_train,y_train)
    pred_cab=clf.predict(x_test)
    weight_cab=f1_score(y_test,pred_cab,average='macro')

    prob_xgb=xlf.predict_proba(x_test)
    prob_lgb=llf.predict_proba(x_test)
    prob_cab=clf.predict_proba(x_test)

    # Grid search over all blend weights (xgb, lgb, cat) in steps of 0.1 that sum
    # to 1. The grid is enumerated with integers (tenths) because comparing the
    # floats of np.arange(0,1.05,0.1) against 1-w silently skips boundary
    # combinations such as (0.3,0.7,0) and can yield tiny negative weights.
    scores=[]
    ijk=[]
    for tenths_xgb in range(11):
        for tenths_lgb in range(11-tenths_xgb):
            item1=tenths_xgb/10
            item2=tenths_lgb/10
            item3=(10-tenths_xgb-tenths_lgb)/10
            prob_end=prob_xgb*item1+prob_lgb*item2+prob_cab*item3
            score=f1_score(y_test,np.argmax(prob_end,axis=1),average='macro')
            scores.append(score)
            ijk.append((item1,item2,item3))

    # First combination that reaches the best blended score
    best=int(np.argmax(scores))
    ii,jj,kk=ijk[best]

    details.extend([scores[best],weight_xgb,weight_lgb,weight_cab,ii,jj,kk])

    print(scores[best])

    # Blend the test-set probabilities with this fold's best weights and keep the
    # predicted class of every test sample for the final vote
    test_xgb=xlf.predict_proba(test_data)
    test_lgb=llf.predict_proba(test_data)
    test_cab=clf.predict_proba(test_data)
    ans=test_xgb*ii+test_lgb*jj+test_cab*kk

    answers.append(np.argmax(ans,axis=1))

In [None]:
# One row per fold: best blended score, single-model scores and chosen blend weights.
detail_columns=['test_end_score','xgboost','lightgbm','catboost'
                ,'weight_xgboost','weight_lightgbm','weight_catboost']
df=pd.DataFrame(np.array(details).reshape(-1,7),columns=detail_columns)
df

In [None]:
# Column-wise average over all folds of the scores and blend weights.
df.mean()

In [None]:
# Majority vote over the folds: rows of `votes` are folds, columns are test samples.
votes=np.array(answers,dtype='int')
# For every test sample take the class predicted by most folds (the lowest class
# index wins a tie). The number of samples comes from the vote matrix itself.
fina=[np.argmax(np.bincount(votes[:,i])) for i in range(votes.shape[1])]
# Ids start at 7000 and run consecutively, one per test sample.
end=pd.DataFrame(np.arange(7000,7000+len(fina),1))
end["type"]=pd.Series(fina).map({0:'围网',1:'刺网',2:'拖网'})
end.to_csv('result.csv',index=None, header=None,encoding="utf-8")
end.head()

# 特征重要性结合featexp的特征筛选

In [None]:
# Grouped bar chart with the feature importances of the three models side by side.
width=0.25
address=np.arange(len(train_data.columns))
tick_label=train_data.columns
plt.figure(dpi=600,figsize=(3.6,1.5))

# (x positions, bar heights, colour, legend label) per model. The lightgbm and
# catboost importances are divided by their sum; the xgboost ones are plotted as is.
bar_groups=[
    (address,xlf.feature_importances_,'y','xgboost'),
    (address+width,llf.feature_importances_/sum(llf.feature_importances_),'c','lightgbm'),
    (address+2*width,clf.feature_importances_/sum(clf.feature_importances_),'pink','catboost'),
]
for positions,heights,colour,name in bar_groups:
    plt.bar(positions,heights,width,color=colour,label=name)

plt.legend(fontsize=4)
# Put the feature name under the middle bar of each group
plt.xticks(address+width,tick_label,fontsize=4,rotation=90,horizontalalignment='center')
plt.yticks(fontsize=4,rotation=0)
plt.title('xgboost&lightgbm&catboost——feature_importances_',fontsize=4)