In [1]:
import os
import csv
import pandas as pd
import numpy as np 
import lightgbm as lgb 
import gc 
from sklearn.model_selection import train_test_split, cross_val_score

# Run mode is decided by whether the local demo file is present:
# demo file found -> local exploration (PREDICT is False);
# demo file missing -> use the /data/dm paths and produce predictions.
PREDICT = not os.path.exists("./data/PINGAN-2018-train_demo.csv")
if PREDICT:
    path_train = "/data/dm/train.csv"  # training file
    path_test = "/data/dm/test.csv"  # test file
    # Prediction output goes to model/xx.csv; there must be exactly one file
    # there and it must be in CSV format.
    path_test_out = "model/"
else:
    # path_test / path_test_out are intentionally left undefined in demo mode.
    path_train = "./data/PINGAN-2018-train_demo.csv"


def read_csv():
    """
    Load the training CSV and, in prediction mode, stack the test CSV below it.

    Reads the module-level settings ``path_train`` and ``PREDICT``, plus
    ``path_test`` when ``PREDICT`` is true. The column labels of the result are
    overwritten positionally with the fixed ten-name header below, so the
    input must contain exactly ten columns in that order.

    :return: tuple ``(tempdata, nrow_train)``: the combined frame and the
        number of rows that came from the training file. Rows from position
        ``nrow_train`` onward (if any) come from the test file.
    """
    train = pd.read_csv(path_train)
    nrow_train = train.shape[0]
    if PREDICT:
        test = pd.read_csv(path_test)
        # `axis` must be passed by keyword: pandas 2.x rejects the positional
        # form `pd.concat(objs, 0)`. Row labels are kept as read from each
        # file, so they repeat across the two parts; split them by position
        # (iloc) using nrow_train rather than by label.
        tempdata = pd.concat([train, test], axis=0)
    else:
        # No test file in demo mode; skip concatenating an empty placeholder
        # frame (newer pandas warns about empty entries in concat).
        tempdata = train
    tempdata.columns = ["TERMINALNO", "TIME", "TRIP_ID", "LONGITUDE", "LATITUDE", "DIRECTION", "HEIGHT",
                        "SPEED", "CALLSTATE", "Y"]
    return tempdata, nrow_train

In [2]:
# Load the combined frame and preview its first rows, last rows and the
# number of training rows, one item per output block.
train, nrow_train = read_csv()
print(train.head(), train.tail(), nrow_train, sep="\n")

   TERMINALNO        TIME  TRIP_ID   LONGITUDE   LATITUDE  DIRECTION  \
0           1  1476923580        1  122.985168  41.103741         12   
1           1  1476923640        1  122.984398  41.104904         24   
2           1  1476923700        1  122.986496  41.106388         74   
3           1  1476923760        1  122.989769  41.106884        115   
4           1  1476923820        1  122.991089  41.105442        151   

      HEIGHT  SPEED  CALLSTATE    Y  
0  39.402588   2.15          0  0.0  
1  39.311157   4.11          0  0.0  
2  34.178955   2.99          0  0.0  
3  37.765381   7.59          0  0.0  
4  36.049194   0.24          0  0.0  
       TERMINALNO        TIME  TRIP_ID   LONGITUDE   LATITUDE  DIRECTION  \
69301         100  1476939540       85  120.333733  36.103889         29   
69302         100  1476939480       85  120.333778  36.103771        353   
69303         100  1476939420       85  120.334412  36.103283        317   
69304         100  1476939360      

## 速度处理

In [3]:
# Per-user (TERMINALNO) summaries, first ten users each: max / min / mean of
# SPEED, then the mean of Y.
# NOTE(review): the printed minimum SPEED is -1 for several users, presumably
# a missing-value marker that also feeds into the mean; confirm.
groupby_userid = train.groupby(by='TERMINALNO')
print(
    groupby_userid['SPEED'].max().head(10),
    groupby_userid['SPEED'].min().head(10),
    groupby_userid['SPEED'].mean().head(10),
    groupby_userid['Y'].mean().head(10),
    sep="\n",
)

TERMINALNO
1     32.779999
2     36.119999
3     25.440001
4     33.310001
5     53.480000
6     29.570000
7     35.680000
8     42.720001
9     35.680000
10    32.330002
Name: SPEED, dtype: float64
TERMINALNO
1    -1.0
2     0.0
3     0.0
4    -1.0
5    -1.0
6    -1.0
7     0.0
8    -1.0
9     0.0
10    0.0
Name: SPEED, dtype: float64
TERMINALNO
1     17.489840
2      9.287734
3      7.987331
4      6.312753
5      7.695846
6     11.456653
7      5.705086
8      9.100117
9     16.274006
10     7.297735
Name: SPEED, dtype: float64
TERMINALNO
1     0.00000
2     0.00000
3     0.00000
4     0.00000
5     0.00000
6     0.00000
7     0.00000
8     2.36856
9     0.00000
10    0.00000
Name: Y, dtype: float64


## 方向处理
计算每个用户（TERMINALNO）每段行程（TRIP_ID）内方向（DIRECTION）的方差；只有一条记录的行程方差为 NaN，这里填充为 0

In [4]:
# Group rows by (user, trip) and show the sample variance of DIRECTION within
# each trip. A trip with a single row has NaN variance, which is shown as 0.
# NOTE(review): DIRECTION looks like a heading in degrees (preview values
# range from 12 to 353). If so, plain variance treats 359 and 1 as far
# apart; confirm whether a circular statistic is wanted.
groupby_userid_tripid = train.groupby(by=['TERMINALNO', 'TRIP_ID'])
print(groupby_userid_tripid['DIRECTION'].var().fillna(value=0))

TERMINALNO  TRIP_ID
1           1           3486.700000
            2              0.000000
            3          14384.944444
            4           4952.700000
            5           4743.700000
            6             88.700000
            7           2686.093407
            8          22732.800000
            9           1753.000000
            10         23254.333333
            11           882.000000
            12         21329.200000
            13          1697.333333
            14         11083.277778
            15          1159.500000
            16         22819.700000
            17             8.000000
            18            40.500000
            19           196.700000
            20         28723.066667
            21            68.446970
            22            59.563636
            23            72.500000
            24             3.000000
            25             0.000000
            26          8601.360000
            27           315.743590
        

按用户（TERMINALNO）汇总各行程方向方差的 mean、min、max

In [5]:
# Per-trip DIRECTION variance (NaN -> 0 for single-row trips), regrouped by
# user. It is built once and reused below instead of recomputing the whole
# variance pipeline for every statistic.
direction_var_by_user = (
    groupby_userid_tripid['DIRECTION']
    .var()
    .fillna(0)
    .groupby('TERMINALNO')  # groups on the TERMINALNO level of the (user, trip) index
)
# Mean, min and max of the per-trip variances for the first ten users.
print(direction_var_by_user.mean().head(10))
print(direction_var_by_user.min().head(10))
print(direction_var_by_user.max().head(10))

TERMINALNO
1      5859.356044
2      4480.411525
3     11222.736912
4      5365.872172
5      6896.961786
6      6374.002252
7      6576.589573
8      7903.142782
9      3035.523867
10     7969.677782
Name: DIRECTION, dtype: float64
TERMINALNO
1       0.000000
2       0.000000
3       9.250000
4       0.000000
5       0.000000
6       0.000000
7       0.000000
8       0.000000
9     695.928571
10      0.000000
Name: DIRECTION, dtype: float64
TERMINALNO
1     30583.142857
2     26149.666667
3     34105.410714
4     64440.500000
5     52812.500000
6     28849.666667
7     52164.500000
8     33135.500000
9     12340.700000
10    35100.333333
Name: DIRECTION, dtype: float64


统计电话状态为2和3的个数

In [14]:
# Rows whose CALLSTATE equals 2 and rows whose CALLSTATE equals 3, each
# previewed with its first five rows.
call2train = train.loc[train['CALLSTATE'].eq(2)]
call3train = train.loc[train['CALLSTATE'].eq(3)]
print(call2train.head(), call3train.head(), sep="\n")

      TERMINALNO        TIME  TRIP_ID   LONGITUDE   LATITUDE  DIRECTION  \
1207           3  1471863360        6  112.978920  28.161814        216   
2010           4  1482069900        1  116.852028  33.885815        298   
2011           4  1482069840        1  116.857452  33.883640        281   
2015           4  1482069600        1  116.902390  33.860420        332   
4318           7  1480904160       20  108.367073  30.740995        140   

          HEIGHT      SPEED  CALLSTATE    Y  
1207   63.192932   0.000000          2  0.0  
2010   17.766174   9.080000          2  0.0  
2011   34.483704  26.870001          2  0.0  
2015   25.543945  30.160000          2  0.0  
4318  282.113525  10.880000          2  0.0  
     TERMINALNO        TIME  TRIP_ID   LONGITUDE   LATITUDE  DIRECTION  \
58            1  1482488700       12  122.988625  41.100342        325   
562           2  1479977280        6  113.341125  23.011570        159   
570           2  1479959100        8  113.344711  2