In [6]:
%%HTML
<style>
    div#notebook-container    { width: 100%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>

In [7]:
import numpy as np
import pandas as pd
import urllib
import os
import time
import glob
import collections
import lhafile
import datetime
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

class RaceResults:
    """Download, unpack, parse and pre-process race result files.

    Typical use: ``download(start, end)`` to fetch and unpack the daily
    archives, ``load()`` to parse the unpacked text files into
    ``self.results``, then ``get_results_pd()`` or ``get_prepared_data()``
    to obtain DataFrames.
    """

    # First-column tokens that mark a row as a recorded finishing position.
    # Rows starting with anything else are counted but not recorded.
    _FINISH_POSITIONS = ('01', '02', '03', '04', '05', '06')

    def __init__(self):
        self.baseuri = "http://www1.mbrace.or.jp/od2/K/%s/k%s.lzh" # http://www1.mbrace.or.jp/od2/K/201612/k161201.lzh
        self.results = [] # List of (RACE_TIME, RACE_ID, RACER_NO, RACER_NAME, RACE_DATE, DAY_COUNT_FROM_1995, PLACE, COURSE, EXHIBITION_TIME)

    def download(self, start, end):
        """Fetch and unpack one result archive per day in ``[start, end]``.

        Archives are saved to ``./data/results/lzh/<yymmdd>.lzh`` and the
        contained ``K<yymmdd>.TXT`` is decoded from Shift-JIS and written to
        ``./data/results/``. Files that already exist on disk are neither
        downloaded nor unpacked again.

        Args:
            start: First day of the period; anything ``pd.date_range`` accepts.
            end: Last day of the period (inclusive).
        """
        # `import urllib` alone does not guarantee that the `request`
        # submodule is loaded, so import it explicitly before use.
        import urllib.request

        # Creating the archive directory also creates ./data/results, where
        # the unpacked text files are written.
        os.makedirs("./data/results/lzh", exist_ok=True)

        for date in pd.date_range(start, end):
            # Get file from the website
            dirname = date.strftime("%Y%m")
            lzhname = date.strftime("%y%m%d")
            uri = self.baseuri % (dirname, lzhname)
            savename = "./data/results/lzh/%s.lzh" % lzhname
            if not os.path.exists(savename):
                print("Send request to", uri)
                urllib.request.urlretrieve(uri, savename)
                # Pause between requests so the server is not hammered.
                time.sleep(3)

            unpackedpath = "./data/results/K%s.TXT" % lzhname
            unpackedname = os.path.basename(unpackedpath)
            if not os.path.exists(unpackedpath):
                print("Unpacking", savename)
                archive = lhafile.Lhafile(savename)
                datastr = archive.read(unpackedname).decode(encoding='shift-jis')
                # NOTE(review): written with the platform default encoding and
                # read back the same way in load(); left unchanged so files
                # that were unpacked earlier stay readable.
                with open(unpackedpath, "w") as fileobj:
                    fileobj.write(datastr)

    def load(self):
        """Parse every ``./data/results/K16*.TXT`` file into ``self.results``.

        Files are processed in sorted name order so that RACE_ID values are
        reproducible. Each file is scanned line by line:

        * the line after one containing ``BGN`` supplies the place name
          (the text before ``［成績］``, ideographic spaces removed);
        * a line starting with ``----`` opens a table, and the next six lines
          are treated as its rows;
        * a row is recorded only if its first token is ``01``..``06``;
          blank rows and rows with any other first token are skipped;
        * the line following the six rows closes the table and increments
          the race id.

        Rows are appended to ``self.results``; calling ``load()`` again adds
        to what is already there.
        """
        base_date = datetime.date(1995, 1, 1)
        race_id = 1.0
        # glob order is arbitrary; sort so race ids do not depend on the filesystem.
        for filename in sorted(glob.glob("./data/results/K16*.TXT")):
            # "K160101.TXT" -> "20160101". Working from the basename keeps
            # this independent of the path separator glob returns.
            stem = os.path.splitext(os.path.basename(filename))[0]
            race_date = datetime.datetime.strptime("20" + stem[1:], '%Y%m%d').date()
            day_count_from_1995 = float((race_date - base_date).days)

            place = ''
            get_place = -1       # 1 while the next line is expected to hold the place name
            get_racer_info = -1  # 1 while inside a table opened by a "----" line
            count = 1            # 1-based index of the current row; 7 means the table is done
            race_time = 0        # last parsed time; carried across rows and races in a file

            with open(filename, "r") as f:
                for line in f:
                    if 'BGN' in line:
                        get_place = 1
                    elif get_place == 1:
                        stripped = line.replace("\u3000", "")
                        place = stripped[0:stripped.find('［成績］')]
                        get_place = -1
                    elif line.startswith("----"):
                        get_racer_info = 1
                    elif get_racer_info == 1 and count != 7:
                        elems = line.replace("\u3000", "").split()
                        # A blank row used to raise IndexError; count it like
                        # any other row that has no finishing position.
                        if not elems or elems[0] not in self._FINISH_POSITIONS:
                            count += 1
                            continue
                        if elems[9] != '.':
                            # "a.b.c" -> a*60 + b + c/10
                            # (presumably minutes.seconds.tenths -- TODO confirm)
                            elems_time = elems[9].split('.')
                            race_time = float(elems_time[0]) * 60 + float(elems_time[1]) + float(elems_time[2]) / 10
                        else:
                            # No time recorded: use the previous time plus one.
                            race_time = race_time + 1
                        racer_no = elems[2]
                        racer_name = elems[3]
                        course = float(elems[1])
                        exhibition_time = float(elems[6])
                        self.results.append((race_time, race_id, racer_no, racer_name, race_date,
                                             day_count_from_1995, place, course, exhibition_time))
                        count += 1
                    elif count == 7:
                        # Six rows consumed: close the table and move on to the next race.
                        count = 1
                        get_racer_info = -1
                        race_id += 1

    def get_results_pd(self):
        """Return ``self.results`` as a DataFrame with one row per recorded racer."""
        columns = ['RACE_TIME', 'RACE_ID', 'RACER_NO', 'RACER_NAME', 'RACE_DATE',
                   'DAY_COUNT_FROM_1995', 'PLACE', 'COURSE', 'EXHIBITION_TIME']
        return pd.DataFrame(self.results, columns=columns)

    def get_prepared_data(self):
        """Build the feature matrix and target from the loaded results.

        RACER_NAME and PLACE are one-hot encoded (one column per distinct
        value, named after the value), DAY_COUNT_FROM_1995 and
        EXHIBITION_TIME are min-max scaled, COURSE is kept as is, and
        RACE_ID, RACER_NO and RACE_DATE are dropped.

        Returns:
            (features, target): ``features`` is a DataFrame laid out as
            COURSE, racer-name columns, place columns, then the two scaled
            numeric columns; ``target`` is the RACE_TIME Series.
        """
        # Build the frame once; the target and features are both derived from it.
        race_results = self.get_results_pd()
        race_results_y = race_results["RACE_TIME"]
        race_results_x = race_results.drop(["RACE_TIME"], axis=1)

        # Integer-code the categorical columns; the categories become column names.
        racer_name_encoded, racer_name_categories = race_results_x["RACER_NAME"].factorize()
        place_encoded, place_categories = race_results_x["PLACE"].factorize()

        race_results_x_dropped = race_results_x.drop(
            ["RACE_ID", "RACER_NO", "RACE_DATE", "RACER_NAME", "PLACE"], axis=1)

        encoder_racer_name = OneHotEncoder(categories="auto")
        racer_name_1hot = encoder_racer_name.fit_transform(racer_name_encoded.reshape(-1, 1))
        racer_name_pd = pd.DataFrame(racer_name_1hot.toarray(), columns=racer_name_categories)

        encoder_place = OneHotEncoder(categories="auto")
        place_1hot = encoder_place.fit_transform(place_encoded.reshape(-1, 1))
        place_pd = pd.DataFrame(place_1hot.toarray(), columns=place_categories)

        race_results_1hot = pd.concat([race_results_x_dropped, racer_name_pd, place_pd], axis=1)

        numeric_columns = ["DAY_COUNT_FROM_1995", "EXHIBITION_TIME"]
        min_max_scaler = preprocessing.MinMaxScaler()
        race_results_num_scaled = min_max_scaler.fit_transform(race_results_1hot[numeric_columns])
        race_results_num_scaled_pd = pd.DataFrame(race_results_num_scaled, columns=numeric_columns)

        # Replace the raw numeric columns with their scaled versions (appended last).
        race_results_1hot_dropped = race_results_1hot.drop(numeric_columns, axis=1)
        race_results_prepared = pd.concat([race_results_1hot_dropped, race_results_num_scaled_pd], axis=1)

        return race_results_prepared, race_results_y

In [15]:
# Driver cell: make sure the result files for the requested period exist
# locally, then parse them into r.results.
if __name__ == "__main__":
    r = RaceResults()
    # download() skips any archive or text file that is already on disk, so
    # re-running this cell only fetches what is missing.
    r.download("2016-01-01","2016-1-03")
    # load() reads every ./data/results/K16*.TXT present, not just the files
    # for the period requested above.
    r.load()