In [1]:
# -*- coding:utf8 -*-
import os
import csv
import pandas as pd
import numpy as np
import math
import time
import random
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
path_train = 'train.csv'

In [3]:
train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

In [4]:
train.describe()

Unnamed: 0,TERMINALNO,TIME,TRIP_ID,LONGITUDE,LATITUDE,DIRECTION,HEIGHT,SPEED,CALLSTATE,Y
count,69306.0,69306.0,69306.0,69306.0,69306.0,69306.0,69306.0,69306.0,69306.0,69306.0
mean,52.104233,1478202000.0,38.330996,114.547988,32.039129,178.369405,172.926933,9.192463,1.635832,0.221681
std,29.131318,4355266.0,33.879349,6.484526,4.960202,105.717157,365.8454,8.779666,1.948844,1.65045
min,1.0,1467605000.0,1.0,82.823676,21.574338,-1.0,-271.488617,-1.0,0.0,0.0
25%,28.0,1475554000.0,12.0,109.156754,29.102076,88.0,6.0,1.52,0.0,0.0
50%,54.0,1479284000.0,29.0,114.539589,31.264475,178.0,28.115479,7.15,0.0,0.0
75%,78.0,1482121000.0,55.0,120.184822,34.734129,267.0,105.432938,14.011726,4.0,0.0
max,100.0,1483199000.0,223.0,127.403597,44.000393,360.0,3411.894775,53.48,4.0,20.742695


In [5]:
def timestamp_datetime(value):
    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(value)
    dt = time.strftime(format, value)
    return dt

def conver_time(data):
    data['Conver_TIME'] = data.TIME.apply(timestamp_datetime)
#     data['month'] = data.Conver_TIME.apply(lambda x: int(x[5:7]))
    data['hour'] = data.Conver_TIME.apply(lambda x: int(x[11:13]))
#     data = data.drop('TIME',axis=1)
#     data = data.drop('Conver_TIME',axis=1)
    return data

def label_process(data):
    pre_label = data.drop_duplicates()
    return pre_label['Y']

In [6]:
def feature_process(data):
    set_data = set(data['TERMINALNO'])
    columns=['p_id',
             'maxTime','phonerisk','dir_risk','height_risk',
             'speed_max','speed_mean','speed_var','height_max','height_mean','height_var','sp_he_mean',
             'zao','wan','shenye',
             'weizhi_ratio','huchu_ratio','huru_ratio','liantong_ratio','duanlian_ratio'
            ]
    feature = pd.DataFrame(columns=columns)
    
    for p_id in set_data:
        tempData = data.loc[data['TERMINALNO'] == p_id]
        tempData = tempData.sort_values(["TIME"])

        tempTime = tempData["TIME"].iloc[0]
        tempSpeed = tempData["SPEED"].iloc[0]
        tempDir = tempData["DIRECTION"].iloc[0]
        tempHeight = tempData["HEIGHT"].iloc[0]

        maxTime = 0
        maxTimelist = []

        phonerisk = 0

        dir_risk = 0

        height_risk = 0
        zao=0
        wan=0
        shenye=0
        
        weizhi = 0
        huchu = 0
        huru = 0
        liantong = 0
        duanlian = 0

        for index, row in tempData.iterrows():
            
            hour = row['hour']
            if 7 <= hour <= 9:
                zao = 1
            elif 17 <= hour <= 19:
                wan = 1
            elif 0 <= hour < 7:
                shenye = 1

            if tempSpeed > 0 and row['CALLSTATE'] != 4:
                if row["CALLSTATE"] == 0:
                    phonerisk += math.exp(tempSpeed / 10) * 0.02
                else:
                    phonerisk += math.exp(tempSpeed / 10)
       
            if row["TIME"] - tempTime == 60:
                maxTime += 60
                tempTime = row["TIME"]

                dir_change = (min(abs(row["DIRECTION"] - tempDir), abs(360 + tempDir - row["DIRECTION"])) / 90.0)
                if tempSpeed != 0 and row["SPEED"] > 0:
                    dir_risk += math.pow((row["SPEED"] / 10), dir_change)

                height_risk += math.pow(abs(row["SPEED"] - tempSpeed) / 10,(abs(row["HEIGHT"] - tempHeight) / 100))
                
                tempHeight = row["HEIGHT"]

            elif row["TIME"] - tempTime > 60:
                maxTimelist.append(maxTime)
                maxTime = 0
                tempTime = row["TIME"]

                tempDir = row["DIRECTION"]
                tempHeight = row["HEIGHT"]
                tempSpeed = row["SPEED"]
                
            if row["CALLSTATE"] == 0:
                weizhi += 1
            elif row["CALLSTATE"] == 1:
                huchu += 1
            elif row["CALLSTATE"] == 2:
                huru += 1
            elif row["CALLSTATE"] == 3:
                liantong += 1
            elif row["CALLSTATE"] == 4:
                duanlian += 1

        speed_max = tempData["SPEED"].max()
        speed_mean = tempData["SPEED"].mean()
        speed_var = tempData["SPEED"].var()
        
        height_max = tempData["HEIGHT"].max()
        height_mean = tempData["HEIGHT"].mean()
        height_var = tempData['HEIGHT'].var()

        sp_he_mean = speed_mean * height_mean

        maxTimelist.append(maxTime)
        maxTime = max(maxTimelist)
        
        total_callstate = len(tempData["CALLSTATE"])
        weizhi_ratio = weizhi / float(total_callstate)
        huchu_ratio = huchu / float(total_callstate)
        huru_ratio = huru / float(total_callstate)
        liantong_ratio = liantong / float(total_callstate)
        duanlian_ratio = duanlian / float(total_callstate)
        
        tempfeature = pd.DataFrame([[p_id,
                                     maxTime,phonerisk,dir_risk,height_risk,
                                     speed_max,speed_mean,speed_var,height_max,height_mean,height_var,sp_he_mean,
                                     zao,wan,shenye,
                                     weizhi_ratio,huchu_ratio,huru_ratio,liantong_ratio,duanlian_ratio
                                    ]],
                                    index=['0'],
                                    columns=columns)
        feature = feature.append(tempfeature,ignore_index=True)

    return feature


In [7]:
def norm_feature(df,feature_name):
    for name in feature_name:
#         df[name] = df[name].map(lambda x:(x-df[name].min())/(df[name].max()-df[name].min()))
        df[name] = pd.DataFrame(MinMaxScaler().fit_transform(pd.DataFrame(df[name])),columns=[name])

# def stan_feature(df,feature_name):
#     for name in feature_name:
#         df[name] = preprocessing.scale(df[name])
#         df[name] = df[name].map(lambda x: preprocessing)

norm_feature_name = ['speed_max','speed_mean','speed_var','height_max','height_var','height_mean','sp_he_mean',
                    ]

In [8]:
def data_process(data):
    data = conver_time(data)
    feature_data = feature_process(data)
    feature_data = feature_data.fillna(method='pad')
    # feature_data = feature_data.dropna(axis=0, how="any")
#     norm_feature(feature_data, norm_feature_name)
    return feature_data

In [9]:
# test_len = len(test)
# train = train.sample(test_len*1.3)
# pre_label = label_process(train[["TERMINALNO","Y"]])

# train = train.drop('Y',axis=1)

In [10]:
feature_train = data_process(train)

In [11]:
feature_train.shape

(100, 20)

In [12]:
feature_train

Unnamed: 0,p_id,maxTime,phonerisk,dir_risk,height_risk,speed_max,speed_mean,speed_var,height_max,height_mean,height_var,sp_he_mean,zao,wan,shenye,weizhi_ratio,huchu_ratio,huru_ratio,liantong_ratio,duanlian_ratio
0,1.0,3540.0,22.698271,759.505736,253.833059,0.393851,0.493323,0.760889,0.064576,0.016106,1.403423e-03,0.016615,1.0,1.0,1.0,0.489796,0.000000,0.000000,0.003401,0.506803
1,2.0,3180.0,67.556593,3646.161361,376.753980,0.491654,0.186809,0.850271,0.153262,0.024443,1.630378e-02,0.013110,0.0,1.0,1.0,0.396978,0.000000,0.000000,0.038462,0.564560
2,3.0,3600.0,41.291745,597.444549,840.111283,0.178917,0.138213,0.158747,0.035725,0.014724,4.559916e-04,0.006810,1.0,1.0,0.0,0.870197,0.000000,0.001038,0.000000,0.128764
3,4.0,2700.0,110.310492,659.554444,656.575946,0.409370,0.075634,0.170058,0.032831,0.010376,2.561429e-04,0.003760,1.0,1.0,0.0,0.228535,0.007576,0.003788,0.008838,0.751263
4,5.0,5520.0,39.709307,616.796195,776.837669,1.000000,0.127320,0.337637,0.033364,0.009598,3.208416e-04,0.004302,1.0,1.0,0.0,1.000000,0.000000,0.000000,0.000000,0.000000
5,6.0,2700.0,24.592781,571.754381,428.472839,0.299854,0.267862,0.474974,0.037098,0.002493,2.947094e-04,0.001897,1.0,1.0,0.0,1.000000,0.000000,0.000000,0.000000,0.000000
6,7.0,2460.0,78.806705,515.198829,627.895442,0.478770,0.052925,0.304126,0.125709,0.086392,5.485380e-03,0.028097,1.0,1.0,0.0,0.220922,0.002753,0.001376,0.009635,0.765313
7,8.0,3600.0,28.676366,567.978879,556.496038,0.684920,0.179798,0.508446,0.054044,0.012115,1.098151e-03,0.006443,1.0,1.0,1.0,0.948424,0.000000,0.000000,0.001433,0.050143
8,9.0,5340.0,26.787776,339.889983,290.027412,0.478770,0.447887,0.749851,0.008278,0.001143,1.093164e-04,0.001558,1.0,1.0,0.0,1.000000,0.000000,0.000000,0.000000,0.000000
9,10.0,2760.0,223.813791,663.019670,511.592672,0.380674,0.112443,0.243831,0.116244,0.107328,2.779563e-03,0.044729,1.0,1.0,1.0,0.004658,0.027950,0.007764,0.099379,0.860248


In [13]:
feature_train.head()

Unnamed: 0,p_id,maxTime,phonerisk,dir_risk,height_risk,speed_max,speed_mean,speed_var,height_max,height_mean,height_var,sp_he_mean,zao,wan,shenye,weizhi_ratio,huchu_ratio,huru_ratio,liantong_ratio,duanlian_ratio
0,1.0,3540.0,22.698271,759.505736,253.833059,0.393851,0.493323,0.760889,0.064576,0.016106,0.001403,0.016615,1.0,1.0,1.0,0.489796,0.0,0.0,0.003401,0.506803
1,2.0,3180.0,67.556593,3646.161361,376.75398,0.491654,0.186809,0.850271,0.153262,0.024443,0.016304,0.01311,0.0,1.0,1.0,0.396978,0.0,0.0,0.038462,0.56456
2,3.0,3600.0,41.291745,597.444549,840.111283,0.178917,0.138213,0.158747,0.035725,0.014724,0.000456,0.00681,1.0,1.0,0.0,0.870197,0.0,0.001038,0.0,0.128764
3,4.0,2700.0,110.310492,659.554444,656.575946,0.40937,0.075634,0.170058,0.032831,0.010376,0.000256,0.00376,1.0,1.0,0.0,0.228535,0.007576,0.003788,0.008838,0.751263
4,5.0,5520.0,39.709307,616.796195,776.837669,1.0,0.12732,0.337637,0.033364,0.009598,0.000321,0.004302,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [14]:
feature_train.describe()

Unnamed: 0,p_id,maxTime,phonerisk,dir_risk,height_risk,speed_max,speed_mean,speed_var,height_max,height_mean,height_var,sp_he_mean,zao,wan,shenye,weizhi_ratio,huchu_ratio,huru_ratio,liantong_ratio,duanlian_ratio
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,50.5,3459.0,78.495711,627.319222,531.612678,0.383113,0.243328,0.34876,0.103307,0.065985,0.023241,0.044516,0.9,0.91,0.45,0.634783,0.002828,0.001542,0.01536,0.345486
std,29.011492,1872.215508,174.317182,469.379451,211.447543,0.180433,0.182496,0.232188,0.170066,0.145312,0.10808,0.124709,0.301511,0.287623,0.5,0.395661,0.005574,0.00593,0.028534,0.373129
min,1.0,900.0,1.580266,214.867458,162.413532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25.75,2160.0,28.363466,453.219205,382.246475,0.26003,0.1133,0.161849,0.016646,0.001448,0.000121,0.001288,1.0,1.0,0.0,0.211501,0.0,0.0,0.0,0.0
50%,50.5,3090.0,42.805348,529.768872,509.024135,0.392387,0.203245,0.306601,0.031532,0.007573,0.000278,0.004287,1.0,1.0,0.0,0.842496,0.0,0.0,0.0,0.152999
75%,75.25,4035.0,65.691064,682.041173,634.660305,0.494363,0.34872,0.481951,0.116445,0.041345,0.00142,0.022054,1.0,1.0,1.0,1.0,0.002724,0.001598,0.017564,0.754751
max,100.0,11820.0,1685.436788,3646.161361,1177.407773,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.028986,0.057911,0.157971,0.961224


In [15]:
# feature_train.isnull().sum()

In [16]:
# tempData.isnull().sum()

In [17]:
# tempData[tempData['SPEED_DIFF'].isnull()!=False]

In [18]:
# tempData.head()

In [19]:
# tempData[tempData['SPEED_DIFF']<0].count()