In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import xgboost
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import warnings
import argparse
warnings.filterwarnings('ignore')

In [2]:
def _get_parser(args=[]):
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_path', type=str, default='D:\自我学习\天池比赛\hy_round1_train_20200102', help='Directory of train file.')
    parser.add_argument('--test_path', type=str, default='D:\自我学习\天池比赛\hy_round1_testA_20200102', help='Directory of test file.')
    parser.add_argument('--max_size', type=int, default=3969, help='number of a ship record.')
#     parser.add_argument('--stop_words', type=str, default='./data/stop_words', help='Directory of stop words.')
#     parser.add_argument('--sample_number', type=int, default=5, choices=[Range(1)], help='Sample number for each bucket.')
#     parser.add_argument('--threshold', type=float, default=0.3, choices=[Range(0.0, 1.0)], help='Threshold for matching.')
#     parser.add_argument('--name_len', type=int, default=9, choices=[Range(2)], help='Filename length.')
#     parser.add_argument('--name_len_update', type=bool, default=False, help='To update file name length.')
#     parser.add_argument('--lang', type=str, choices=['cn', 'en'], default='cn', help='Segmentor language setting.')
    args = parser.parse_args(args=[])
    return args
args = _get_parser(args=[])

In [None]:
args.train_path

In [3]:
#导入数据
train_files = os.listdir(args.train_path)
test_files = os.listdir(args.test_path)
print(len(train_files), len(test_files))

#将多个Dataframe拼接成一个，速度超级快
ret = []
train_labels = []
for file in tqdm(train_files):
    df = pd.read_csv(f'{args.train_path}/{file}')
    train_labels.append(df['type'][0])
    ret.append(df)
df_train = pd.concat(ret)
df_train.columns = ['ship','x','y','v','d','time','type']
#删除无用的两列
del df_train['time']
del df_train['type']

  0%|▍                                                                              | 34/7000 [00:00<00:20, 337.67it/s]

7000 2000


100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:19<00:00, 355.23it/s]


In [None]:
len(train_labels)

In [4]:
#得到测试数据
ret = []
for file in tqdm(test_files):
    df = pd.read_csv(f'{args.test_path}/{file}')
    ret.append(df)
df_test = pd.concat(ret)
df_test.columns = ['ship','x','y','v','d','time']
#删除无用的time
del df_test['time']

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:04<00:00, 401.81it/s]


In [5]:
df_train_ = df_train.drop(['ship'], axis=1)
df_train_.head()

Unnamed: 0,x,y,v,d
0,6152038.0,5124873.0,2.59,102
1,6151230.0,5125218.0,2.7,113
2,6150421.0,5125563.0,2.7,116
3,6149612.0,5125907.0,3.29,95
4,6148803.0,5126252.0,3.18,108


In [6]:
# df_train.shape
#做归一化
std = MinMaxScaler(feature_range=(0, 1))
df_train_ = df_train.drop(['ship'], axis=1)
scaled_train = std.fit_transform(df_train_)
print(scaled_train.shape)

df_test_ = df_test.drop(['ship'], axis=1)
scaled_test = std.fit_transform(df_test_)
print(scaled_test.shape)

(2699638, 4)
(782378, 4)


In [7]:
#将ship序号重新放入到数据中
scaled_train_df = pd.DataFrame(scaled_train)
scaled_train_df['ship'] = np.array(df_train['ship'])
# scaled_train_df.head()

scaled_test_df = pd.DataFrame(scaled_test)
scaled_test_df['ship'] = np.array(df_test['ship'])
print(scaled_test_df.head())

          0         1         2         3  ship
0  0.997602  0.740036  0.001211  0.000000  7000
1  0.997647  0.740039  0.003523  0.961111  7000
2  0.997651  0.740006  0.001211  0.000000  7000
3  0.997651  0.740006  0.001211  0.197222  7000
4  0.997651  0.740006  0.001211  0.083333  7000


In [8]:
#换个列名
scaled_train_df.columns = ['x','y','v','d','ship']
scaled_test_df.columns = ['x','y','v','d','ship']

In [None]:
#查看每艘船的采集点数
num_point_ship = pd.DataFrame(df_train['ship'].value_counts())
num_point_ship = num_point_ship.reset_index(drop=False)
num_point_ship.columns = ['ship','num_point']
num_point_ship.head()

In [None]:
num_point_ship.tail()

In [None]:
num_point_ship[num_point_ship['num_point']>500].shape

In [None]:
#求采集点数的众数和平均数
#众数
counts = np.bincount(np.array(num_point_ship['num_point']))
zs = np.argmax(counts)
print('众数: ',zs)

print('平均数: ',num_point_ship['num_point'].mean())

In [None]:
df_train[df_train['ship']==254]['type'][0]

In [None]:
a = [254,0,0,0,0,0,'刺网']*10
b = pd.DataFrame(np.array(a).reshape([-1,7]))
b.shape

In [None]:
7000*3969

In [9]:
#先按照最大的进行取padding,将每一艘船的采集点数都padding到相同的数，该数为最大的一个
def convert_ship_to_same(df,limit_size = args.max_size):
    all_ = []
    ship_codes = df['ship'].unique()
    if 'type' in df.columns:
        del df['type']
    for ship_code in tqdm(ship_codes):
        temp = df[df['ship']==ship_code].copy()
        #pad
        if temp.shape[0] < limit_size:
            rest_num = limit_size - temp.shape[0]
            a = [0,0,0,0,ship_code]*rest_num
            pad_ = pd.DataFrame(np.array(a).reshape([-1,5]))
            pad_.columns = ['x','y','v','d','ship']
            temp_ = pd.concat([temp,pad_],axis=0)
            #print(temp_.shape)
       #Trunc
        else:
            temp_ = temp[:limit_size]
        all_.append(temp_.values)
    all_data = np.array(all_)
    print(all_data.shape)
    return all_data

In [10]:
df_train_pad = convert_ship_to_same(scaled_train_df)

100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:47<00:00, 148.67it/s]


(7000, 3969, 5)


In [11]:
df_test_pad = convert_ship_to_same(scaled_test_df)

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:07<00:00, 267.24it/s]


(2000, 3969, 5)


In [12]:
#删除ship的序号
def delete_ship(df_arr):
    a = []
    for i in tqdm(range(len(df_arr))):
        b = np.delete(df_arr[i], [4], axis=1) 
        a.append(b)
    end = np.array(a)
    print(end.shape)
    return end
    

In [13]:
train_input = delete_ship(df_train_pad)
test_input = delete_ship(df_test_pad)

100%|███████████████████████████████████████████████████████████████████████████| 7000/7000 [00:00<00:00, 10182.80it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 10610.24it/s]

(7000, 3969, 4)





(2000, 3969, 4)


In [14]:
#保存好输入
import pickle

with open('train_input.pkl', 'wb') as file:
    pickle.dump(train_input, file)

with open('test_input.pkl', 'wb') as file:
    pickle.dump(test_input, file)

In [15]:
#将标签转化成数字类别
dict_label = {'拖网':0,'围网':1,'刺网':2}
label = []
for i in train_labels:
    a = dict_label[i]
    label.append(a)
y = np.array(label)
print(y.shape)
print(y[:10])

(7000,)
[0 0 0 0 1 0 1 1 0 0]


In [16]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
y = np.reshape(y,newshape=[7000,-1])
y_new = enc.fit_transform(y)
y_new = y_new.toarray()
print(y_new.shape)


(7000, 3)


In [17]:
from keras.models import Sequential
from keras.layers import Conv2D, Activation, MaxPool2D, Flatten, Dense
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [18]:
X_train,X_test, y_train, y_test  = train_test_split(train_input,y_new,test_size=0.25, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5250, 3969, 4)
(1750, 3969, 4)
(5250, 3)
(1750, 3)


In [20]:
X_train = X_train.reshape([5250, 3969, 4,-1])
print(X_train.shape)

(5250, 3969, 4, 1)


In [21]:
X_test = X_test.reshape([1750, 3969, 4,-1])

In [25]:
# 初始化一个模型
model = Sequential()

# 模型卷积层设计
model.add(Conv2D(
    nb_filter=128,  # 第一层设置32个滤波器
    nb_row=5,
    nb_col=5,  # 设置滤波器的大小为5*5
    padding='same',  # 选择滤波器的扫描方式，即是否考虑边缘
    input_shape=(3969, 4,1)  # 设置输入的形状
))

# 选择激活函数
model.add(Activation('relu'))

# 设置下采样(池化层）
model.add(MaxPool2D(
    pool_size=(2,2),  # 下采样格为2*2
    strides=(2,2),  # 向右向下的步长
    padding='same', # padding mode is 'same'
))

model.add(Conv2D(64, (5, 5), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool2D(strides=(2, 2), padding='same'))


# 使用Flatten函数，将输入数据扁平化（因为输入数据是一个多维的形式，需要将其扁平化）
model.add(Flatten())  # 将多维的输入一维化
model.add(Dense(128))  # 全连接层1024个点
model.add(Activation('relu'))

# 在建设一层
model.add(Dense(3))  # 输入是个类别
model.add(Activation('softmax'))  # 用于分类的softmax函数

adam = Adam()  # 学习速率lr=0.0001

model.compile(optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'])

print("training ==========~~~~~~~~=======")
model.fit(X_train, y_train,validation_split=0.2,epochs=15, batch_size=64,verbose=1)  # 全部训练次数epochs=1次，每次训练批次大小batch_size=64

print("Testing ==========~~~~~~~~~~~~======")
loss, accuracy = model.evaluate(X_test, y_test)

print("\nloss:", loss)
print("\nTest:", accuracy)

Train on 4200 samples, validate on 1050 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15

KeyboardInterrupt: 

In [None]:
test_input = test_input.reshape([test_input.shape[0],test_input.shape[1],test_input.shape[2],-1])
results = model.predict(test_input)
print(results.shape)

In [None]:
label_index = np.array([np.argmax(x) for x in results])
label_index

In [None]:
ship_index = df_test['ship'].unique()

label_index = label_index.reshape([len(label_index),-1])
ship_index = ship_index.reshape([len(ship_index),-1])
ss = pd.DataFrame(np.concatenate([ship_index,label_index],axis=1))
ss.columns = ['ship','label']
print(ss.head())

In [None]:
reverse_label = {0:'拖网', 1:'围网', 2:'刺网'}
ss['y'] = ss['label'].map(reverse_label)
print(ss)

In [None]:
ss.to_csv('sub_2_0110.csv',index=False,header=None)