# 計程車時間預測
---
[連結](https://www.kaggle.com/c/pkdd-15-taxi-trip-time-prediction-ii)

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
import datetime

## 資料說明
---
Each data sample corresponds to one completed trip. It contains a total of 
9 (nine) features, described as follows:

1. TRIP_ID: (String) It contains a unique identifier for each trip;  
2. CALL_TYPE: (char) It identifies the way used to demand this service. It may contain one of three possible values:  
a. ‘A’ if this trip was dispatched from the central;  
b. ‘B’ if this trip was demanded directly to a taxi driver on a specific stand;  
c. ‘C’ otherwise (i.e. a trip demanded on a random street).
3. ORIGIN_CALL: (integer) It contains a unique identifier for each phone number which was used to demand, at least, one service. It identifies the trip’s customer if CALL_TYPE=‘A’. Otherwise, it assumes a NULL value;
4. ORIGIN_STAND: (integer): It contains a unique identifier for the taxi stand. It identifies the starting point of the trip if CALL_TYPE=‘B’. Otherwise, it assumes a NULL value;
5. TAXI_ID: (integer): It contains a unique identifier for the taxi driver that performed each trip;
6. TIMESTAMP: (integer) Unix Timestamp (in seconds). It identifies the trip’s start; 
7. DAYTYPE: (char) It identifies the daytype of the trip’s start. It assumes one of three possible values:
a. ‘B’ if this trip started on a holiday or any other special day (i.e. extending holidays, floating holidays, etc.);
b. ‘C’ if the trip started on a day before a type-B day;
c. ‘A’ otherwise (i.e. a normal day, workday or weekend).
8. MISSING_DATA: (Boolean) It is FALSE when the GPS data stream is complete and TRUE whenever one (or more) locations are missing.
9. POLYLINE: (String): It contains a list of GPS coordinates (i.e. WGS84 format) mapped as a string. The beginning and the end of the string are identified with brackets (i.e. [ and ], respectively). Each pair of coordinates is also identified by the same brackets as [LONGITUDE, LATITUDE]. This list contains one pair of coordinates for each 15 seconds of trip. The last list item corresponds to the trip’s destination while the first one represents its start;

**The total travel time of the trip (the prediction target of this competition) is defined as the (number of points-1) x 15 seconds. For example, a trip with 101 data points in POLYLINE has a length of (101-1) * 15 = 1500 seconds. Some trips have missing data points in POLYLINE, indicated by MISSING_DATA column, and it is part of the challenge how you utilize this knowledge.** 

## 讀取資料
---

In [2]:
# Parsed rows are accumulated into these lists by later cells.
TrainData = []
TestData = []

# Read both CSV files completely. The context managers guarantee that each
# handle is closed even when reading raises.
with open("./Data/train.csv", "r", encoding="utf-8") as TrainFile:
    TrainFileLine = TrainFile.read().splitlines()
with open("./Data/test.csv", "r", encoding="utf-8") as TestFile:
    TestFileLine = TestFile.read().splitlines()

# Drop the first line of each file (presumably the CSV header row).
TrainFileLine = TrainFileLine[1:]
TestFileLine = TestFileLine[1:]

print("Train Size => " + format(len(TrainFileLine)))
print("Test Size =>" + format(len(TestFileLine)))

Train Size => 1710670
Test Size =>320


In [3]:
# Show the first remaining training line to inspect the raw row format.
print(TrainFileLine[0])

"1372636858620000589","C","","","20000589","1372636858","A","False","[[-8.618643,41.141412],[-8.618499,41.141376],[-8.620326,41.14251],[-8.622153,41.143815],[-8.623953,41.144373],[-8.62668,41.144778],[-8.627373,41.144697],[-8.630226,41.14521],[-8.632746,41.14692],[-8.631738,41.148225],[-8.629938,41.150385],[-8.62911,41.151213],[-8.629128,41.15124],[-8.628786,41.152203],[-8.628687,41.152374],[-8.628759,41.152518],[-8.630838,41.15268],[-8.632323,41.153022],[-8.631144,41.154489],[-8.630829,41.154507],[-8.630829,41.154516],[-8.630829,41.154498],[-8.630838,41.154489]]"


## 為資料做處理
---

In [4]:
# One pair of lists per call type (A, B, C): the *TempList lists receive the
# raw parsed rows, the *DataList lists receive the feature rows built from them.
ATempList, ADataList = [], []
BTempList, BDataList = [], []
CTempList, CDataList = [], []

In [5]:
# Break one raw line into its list of field strings
def splitInPart(data):
    """Split a fully double-quoted, comma-separated line into its fields.

    The first and the last character of ``data`` (the outermost double
    quotes) are discarded, and the remainder is cut at every ``","``
    separator.

    Returns the list of field strings.
    """
    unquoted = data[1:-1]
    return unquoted.split("\",\"")

In [6]:
# Parse one raw training line and keep it when its GPS stream is complete
def ParseDataInList(string):
    """Append the parsed fields of ``string`` to the global ``TrainData``.

    Empty strings are ignored. A row is kept only when its field at
    index 7 (the MISSING_DATA column) equals ``"False"``.
    """
    if string == "":
        return

    fields = splitInPart(string)
    # Discard rows flagged as having missing GPS data
    if fields[7] != "False":
        return
    TrainData.append(fields)

In [7]:
# Parse every training line; the rows that are kept accumulate in TrainData
for rawLine in TrainFileLine:
    ParseDataInList(rawLine)
print(len(TrainData))

1710660


拆解資料  
讓資料會根據 Call Type  
來做出三種不同的 Model 來預測

In [8]:
# Distribute every parsed row into the list that matches its call type
_bucketByCallType = {'A': ATempList, 'B': BTempList}
for row in TrainData:
    # Field 1 is CALL_TYPE; anything other than 'A' or 'B' goes to the C list
    _bucketByCallType.get(row[1], CTempList).append(row)
# The rows now live in the per-type lists, so release the combined list
TrainData = []
print("A type size => " + format(len(ATempList)))
print("B type size => " + format(len(BTempList)))
print("C type size => " + format(len(CTempList)))

A type size => 364769
B type size => 817878
C type size => 528013


In [9]:
# Derive the trip duration and the start-to-end offset from a POLYLINE string
def CountTripTime(polyLine):
    """Return ``(dx, dy, seconds)`` for one POLYLINE string.

    The literal ``"[]"`` (no GPS points) yields ``(0, 0, 0)``. Otherwise
    the string is split into coordinate pairs and:

    * ``dx`` / ``dy`` are the last pair minus the first pair, taken on the
      first and on the second coordinate respectively;
    * ``seconds`` is ``(number of pairs - 1) * 15``.
    """
    if polyLine == "[]":
        return 0, 0, 0

    # Strip the leading "[[" and the trailing "]]", then cut between pairs
    points = polyLine[2:-2].split("],[")

    first = points[0].split(",")
    last = points[-1].split(",")
    originX, originY = float(first[0]), float(first[1])
    finalX, finalY = float(last[0]), float(last[1])

    elapsed = 15 * (len(points) - 1)
    return (finalX - originX), (finalY - originY), elapsed

In [10]:
def ReduceUnixTime(stamp):
    """Reduce a Unix timestamp to ``(weekday, hour)`` strings.

    Parameters
    ----------
    stamp : int or str
        Unix timestamp in seconds; anything accepted by ``int()``.

    Returns
    -------
    tuple of (str, str)
        ``weekday`` is ``"0"`` for Sunday and ``"1"`` .. ``"6"`` for Monday ..
        Saturday. ``hour`` is the zero-padded 24-hour clock hour
        (``"00"`` .. ``"23"``).

    Notes
    -----
    The timestamp is converted with ``datetime.fromtimestamp`` without a
    ``tz`` argument, so the result follows the local time zone of the
    machine running the notebook.
    """
    moment = datetime.datetime.fromtimestamp(int(stamp))

    # isoweekday() is Monday=1 .. Sunday=7; modulo 7 folds Sunday onto 0.
    # Working on the number instead of comparing strftime('%a') against
    # English abbreviations keeps the result independent of the process locale.
    weekday = str(moment.isoweekday() % 7)
    hour = "{:02d}".format(moment.hour)
    return weekday, hour

### 將轉成 Feature 的資料保存
---

In [11]:
def WriteAllToFile(data, FileName, Title):
    """Write rows as comma-separated lines to ``./Transform Data/<FileName>``.

    Parameters
    ----------
    data : iterable of iterables
        Rows to write. Every value is rendered with ``format(value)`` and the
        values of one row are joined with ``","``.
    FileName : str
        Name of the file created (or overwritten) inside ``./Transform Data/``.
    Title : str
        Text written verbatim before the rows; the caller supplies the
        trailing newline.
    """
    outputDir = "./Transform Data/"
    # exist_ok=True creates the folder only when needed, without a separate
    # existence check that could race with another process.
    os.makedirs(outputDir, exist_ok=True)

    # The context manager closes the file even if a write raises.
    with open(outputDir + FileName, "w") as file:
        file.write(Title)
        for lineData in data:
            file.write(",".join(format(value) for value in lineData))
            file.write("\n")

### 處理 A Type 資料
---

In [12]:
# Keep only the fields used as features for type-A trips
def ThrowUselessThingInA():
    """Fill the global ``ADataList`` with feature rows built from ``ATempList``.

    Each kept row is ``[weekday, hour, dx, dy, seconds]``: the first two come
    from ``ReduceUnixTime`` applied to field 5 (TIMESTAMP), the last three
    from ``CountTripTime`` applied to field 8 (POLYLINE), rendered with
    ``format``. Rows whose trip time is not positive are skipped.
    """
    for record in ATempList:
        # Original author's note: DAYTYPE appeared to be 'A' (normal day) for
        # every type-A trip, so it is not used as a feature.
        weekday, hour = ReduceUnixTime(record[5])
        diffX, diffY, seconds = CountTripTime(record[8])

        if seconds <= 0:
            continue
        ADataList.append([weekday, hour, format(diffX), format(diffY), format(seconds)])

In [13]:
# Build the type-A feature rows, then save them as A.csv
ThrowUselessThingInA()
WriteAllToFile(
    data=ADataList,
    FileName="A.csv",
    Title="\"Weekday\",\"Hour\",\"DiffLong\",\"DiffLat\",\"Time\"\n",
)

### 處理 B Type 的資料
---

In [14]:
# Keep only the fields used as features for type-B trips
def ThrowUselessThingInB():
    """Fill the global ``BDataList`` with feature rows built from ``BTempList``.

    Each kept row is ``[weekday, hour, dx, dy, seconds]``: the first two come
    from ``ReduceUnixTime`` applied to field 5 (TIMESTAMP), the last three
    from ``CountTripTime`` applied to field 8 (POLYLINE), rendered with
    ``format``. Rows whose trip time is not positive are skipped.
    """
    for record in BTempList:
        # Original author's note: as for type A, DAYTYPE appeared to always
        # be 'A' (normal day), so it is not used as a feature.
        weekday, hour = ReduceUnixTime(record[5])
        diffX, diffY, seconds = CountTripTime(record[8])

        if seconds <= 0:
            continue
        BDataList.append([weekday, hour, format(diffX), format(diffY), format(seconds)])

In [15]:
# Build the type-B feature rows, then save them as B.csv
ThrowUselessThingInB()
WriteAllToFile(
    data=BDataList,
    FileName="B.csv",
    Title="\"Weekday\",\"Hour\",\"DiffLong\",\"DiffLat\",\"Time\"\n",
)

### 處理 C Type 的資料
---

In [16]:
# Keep only the fields used as features for type-C trips
def ThrowUselessThingInC():
    """Fill the global ``CDataList`` with feature rows built from ``CTempList``.

    Each kept row is ``[weekday, hour, dx, dy, seconds]``: the first two come
    from ``ReduceUnixTime`` applied to field 5 (TIMESTAMP), the last three
    from ``CountTripTime`` applied to field 8 (POLYLINE), rendered with
    ``format``. Rows whose trip time is not positive are skipped.
    """
    for record in CTempList:
        # Original author's note: as for type A, DAYTYPE appeared to always
        # be 'A' (normal day), so it is not used as a feature.
        weekday, hour = ReduceUnixTime(record[5])
        diffX, diffY, seconds = CountTripTime(record[8])

        if seconds <= 0:
            continue
        CDataList.append([weekday, hour, format(diffX), format(diffY), format(seconds)])

In [17]:
# Build the type-C feature rows, then save them as C.csv
ThrowUselessThingInC()
WriteAllToFile(
    data=CDataList,
    FileName="C.csv",
    Title="\"Weekday\",\"Hour\",\"DiffLong\",\"DiffLat\",\"Time\"\n",
)

## 轉換 Test Data
---

In [18]:
# Feature rows derived from the test set are collected here
TestDataList = list()

In [19]:
def ParseDataInTestList(string):
    """Append the parsed fields of one test line to the global ``TestData``.

    Parameters
    ----------
    string : str
        One raw line of the test file. Empty strings are ignored.

    A row is kept only when its field at index 7 (the MISSING_DATA column)
    equals ``"False"``.
    """
    # Return early on blank lines: the flag check below needs a parsed row,
    # and running it without one raised UnboundLocalError.
    if string == "":
        return

    dataVector = splitInPart(string)

    # Discard rows flagged as having missing GPS data
    if dataVector[7] == "False":
        TestData.append(dataVector)

In [20]:
# Parse every test line; the rows that are kept accumulate in TestData
for rawLine in TestFileLine:
    ParseDataInTestList(rawLine)
print(len(TestData))

320


### 處理 Test Data
---

In [21]:
# Keep only the identifying fields and the features for the test trips
def ThrowUselessThingInTest():
    """Fill the global ``TestDataList`` with rows built from ``TestData``.

    Each row is ``[field 0, field 1, weekday, hour, dx, dy, seconds]``:
    fields 0 and 1 (TRIP_ID and CALL_TYPE) are copied unchanged, weekday and
    hour come from ``ReduceUnixTime`` applied to field 5 (TIMESTAMP), and the
    last three come from ``CountTripTime`` applied to field 8 (POLYLINE),
    rendered with ``format``. Every row is kept, including those whose trip
    time is zero.
    """
    for record in TestData:
        weekday, hour = ReduceUnixTime(record[5])
        diffX, diffY, seconds = CountTripTime(record[8])

        TestDataList.append([
            record[0],
            record[1],
            weekday,
            hour,
            format(diffX),
            format(diffY),
            format(seconds),
        ])

In [22]:
# Build the test-set rows, then save them as Test.csv
ThrowUselessThingInTest()
WriteAllToFile(
    data=TestDataList,
    FileName="Test.csv",
    Title="\"TRIP_ID\",\"CALL_TYPE\",\"Weekday\",\"Hour\",\"DiffLong\",\"DiffLat\",\"Time\"\n",
)