In [1]:
import datetime
import gc
import math
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from tqdm.notebook import tqdm

In [17]:
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', 20)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

## 拼接文件

In [2]:
new_train_path = '../../input/train_time'
new_test_path = '../../input/test_time'

new_train_files = os.listdir(new_train_path)
new_test_files = os.listdir(new_test_path)

In [4]:
ret = []
for file in tqdm(new_train_files):
    df_train = pd.read_csv(f'{new_train_path}/{file}')
    df_train = df_train.drop_duplicates(['x', 'y'], keep='last')
    ret.append(df_train) 
df_train = pd.concat(ret)
df_train.to_hdf('../../input/train_id_all.h5', 'df', mode='w')

HBox(children=(FloatProgress(value=0.0, max=7000.0), HTML(value='')))




In [5]:
del ret
gc.collect()

64

In [6]:
ret = []
for file in tqdm(new_test_files):
    df_test = pd.read_csv(f'{new_test_path}/{file}')
    ret.append(df_test)
df_test = pd.concat(ret)
df_test.to_hdf('../../input/test_id_all.h5', 'df', mode='w')

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [7]:
del ret
gc.collect()

64

## 读取文件

In [8]:
# 读取文件
df_train = pd.read_hdf('../../input/train_id_all.h5')
df_test = pd.read_hdf('../../input/test_id_all.h5')

In [9]:
df_train.shape

(1173557, 120)

In [10]:
df_test.shape

(782378, 119)

In [12]:
df_train = df_train.drop(['Unnamed: 0'], axis=1)
df_test = df_test.drop(['Unnamed: 0'], axis=1)

In [13]:
df_train.head(3)

Unnamed: 0,渔船ID,x,y,速度,方向,time,type,sin_angle,cos_angle,month,...,long_stop_1000_count,long_stop_600_count,long_stop_1000_rate,long_stop_600_rate,short_report_count,short_report_rete,no_move_distance_count,no_move_distance_rate,daytime_count,daytime_rate
0,0,6152038.0,5124873.0,2.59,102,1900-11-10 11:58:19,拖网,0.978148,-0.207912,11,...,18,80,0.043478,0.193237,2,0.004831,391,0.944444,228,0.550725
1,0,6151230.0,5125218.0,2.7,113,1900-11-10 11:48:19,拖网,0.920505,-0.390731,11,...,18,80,0.043478,0.193237,2,0.004831,391,0.944444,228,0.550725
2,0,6150421.0,5125563.0,2.7,116,1900-11-10 11:38:19,拖网,0.898794,-0.438371,11,...,18,80,0.043478,0.193237,2,0.004831,391,0.944444,228,0.550725


In [14]:
def drop_unnecessary_col(df):
    unneccessary = [# 'x', 'y', 'time', '速度', 'time',  # 结束时候的原始数据
                    'time',
                    'sin_angle', 'cos_angle',
                    'month', 'day', 'hour', 'minute', 'second', 'time_past_sec', 'time_step_sec',
                    'x_past', 'y_past', 'x_step', 'y_step', 
                    'xy_distance_step', 'xy_velocity_step', 
                    'sin_hour', 'cos_hour', 
                    'sin_minute', 'cos_minute',
                    'sin_second', 'cos_second',
                    # 'x_max', 'x_min', 'x_mean', 'x_median', 'x_std', 'x_skew', 'x_mode',
                    # 'y_max', 'y_min', 'y_mean', 'y_median', 'y_std', 'y_skew', 'y_mode', 
                    'day_max', 'day_min', 'day_mean', 'day_median', 'day_std', 'day_skew', 'day_mode',
                    #'速度_max', '速度_min', '速度_mean', '速度_median', '速度_std', '速度_skew', '速度_mode',
                    'xy_velocity_step_max', 'xy_velocity_step_min', 
                    'xy_velocity_step_mean', 
                    'xy_velocity_step_median', 'xy_velocity_step_std', 'xy_velocity_step_skew', 'xy_velocity_step_mode',
                    'xy_distance_step_max', 'xy_distance_step_min', 
                    'xy_distance_step_mean', 
                    'xy_distance_step_median', 'xy_distance_step_std', 'xy_distance_step_skew', 'xy_distance_step_mode',
                    'time_step_sec_max', 
                    'time_step_sec_min', 
                    'time_step_sec_mean', 
                    'time_step_sec_median', 'time_step_sec_std', 'time_step_sec_skew', 'time_step_sec_mode',
                    'x_step_max', 
                    'x_step_min', 
                    'x_step_mean', 'x_step_median', 'x_step_std', 'x_step_skew', 'x_step_mode', 
                    'y_step_max', 
                    'y_step_min', 
                    'y_step_mean', 'y_step_median', 'y_step_std', 'y_step_skew', 'y_step_mode', 
                    'sin_angle_max','sin_angle_min', 'sin_angle_mean', 'sin_angle_median', 'sin_angle_std', 'sin_angle_skew', 'sin_angle_mode',
                    'cos_angle_max','cos_angle_min', 'cos_angle_mean', 'cos_angle_median', 'cos_angle_std', 'cos_angle_skew', 'cos_angle_mode',
                    'day_past',
                    'x_past_max', 'y_past_max', 
                    'square_area', 
                    'long_stop_1000_count', 
                    'long_stop_1000_rate', 
                    'long_stop_600_count', 
                    'long_stop_600_rate',
                    'short_report_count', 'short_report_rete',
                    'no_move_distance_count', 
                    'no_move_distance_rate',
                    'daytime_count', 
                    'daytime_rate',
                    ]
    df = df.drop(unneccessary, axis=1)
    
    return df

In [15]:
df_train_clean = drop_unnecessary_col(df_train.copy())
df_test_clean = drop_unnecessary_col(df_test.copy())

In [None]:
df_train_clean = df_train.drop_duplicates('渔船ID')

### 训练

In [19]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn import preprocessing

In [20]:
label_dict = {"拖网": 0,
             "围网": 1,
             "刺网": 2}

df_train_clean['type'] = df_train_clean['type'].apply(lambda x: label_dict[x])

In [22]:
df_train_clean = df_train_clean.drop_duplicates(['x', 'y'], keep='last')

In [23]:
df_train_clean.head(3)

Unnamed: 0,渔船ID,x,y,速度,方向,type,x_max,x_min,x_mean,x_median,x_std,x_skew,x_mode,y_max,y_min,y_mean,y_median,y_std,y_skew,y_mode,速度_max,速度_min,速度_mean,速度_median,速度_std,速度_skew,速度_mode
1,0,6151230.0,5125218.0,2.7,113,0,6152038.0,6118352.0,6119351.0,6118352.0,5037.320747,5.255558,,5130781.0,5124873.0,5130494.0,5130672.0,850.264541,-4.762308,,9.39,0.0,0.265966,0.0,1.321248,5.520205,
2,0,6150421.0,5125563.0,2.7,116,0,6152038.0,6118352.0,6119351.0,6118352.0,5037.320747,5.255558,,5130781.0,5124873.0,5130494.0,5130672.0,850.264541,-4.762308,,9.39,0.0,0.265966,0.0,1.321248,5.520205,
3,0,6149612.0,5125907.0,3.29,95,0,6152038.0,6118352.0,6119351.0,6118352.0,5037.320747,5.255558,,5130781.0,5124873.0,5130494.0,5130672.0,850.264541,-4.762308,,9.39,0.0,0.265966,0.0,1.321248,5.520205,


In [24]:
X = df_train_clean.drop(['type', '渔船ID'], axis=1)
y = df_train_clean['type']

In [25]:
clf = xgb.XGBClassifier(max_depth=6, 
                          learning_rate=0.01, 
                          n_estimators=1000, 
                          objective='multi:softmax',
                          n_jobs=6, 
                          )


fold = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)
models = []

for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):
    clf.fit(X.iloc[train_idx], y.iloc[train_idx])
    # val_pred = model.predict_proba(X.iloc[val_idx])
    val_pred = clf.predict(X.iloc[val_idx])
    val_y = y.iloc[val_idx]
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))

0 val f1 0.9900348157883424


KeyboardInterrupt: 