In [1]:
import pandas as pd
import numpy as np
import re, os

## Load files

In [2]:
# Work from the submissions directory so that every relative path used in this
# notebook ('track1.csv', 'track2.csv', 'blend/...') resolves against it.
SUBMISSIONS_DIR = '/tmp2/b06902021/ML/learning2read/submissions'
os.chdir(SUBMISSIONS_DIR)
df_track1, df_track2 = (pd.read_csv(name) for name in ('track1.csv', 'track2.csv'))

In [3]:
# Preview 10 randomly chosen rows of the track-1 table (no random_state is
# passed, so a different set of rows is shown on every run).
df_track1.sample(10)

Unnamed: 0,1248213-25,1251481-26,1254969-27,1257047-28,1257740-29,1289243-30,1318852-31,1321601-32,1371927-33,1391016-34,...,1286506-9,1291784-10,1309036-11,1310260-12,1379040-13,1379329-14,1379768-15,1385392-16,1406629-17,1461343-18
122720,7,7,7,8,8,8,7,7,6,6,...,8,8,7,7,8,8,8,8,8,7
13428,9,9,9,8,9,9,8,8,8,8,...,8,9,8,8,8,9,9,9,8,8
151060,8,8,8,8,8,8,7,7,8,8,...,7,8,7,7,8,8,8,8,8,7
65514,5,6,6,7,7,6,6,6,6,6,...,6,7,6,6,8,8,8,8,8,8
81716,8,7,8,8,8,7,7,7,8,7,...,7,7,7,7,9,9,9,9,9,8
148122,8,7,8,8,8,7,8,7,8,8,...,8,8,8,7,9,9,9,9,9,9
65813,5,5,6,6,6,6,5,5,6,5,...,5,7,5,5,7,7,7,6,8,6
22722,8,7,8,8,8,7,7,7,8,8,...,7,8,7,7,8,8,8,8,8,7
140465,8,8,8,8,8,8,8,8,7,8,...,8,8,8,8,8,8,8,8,8,8
32444,8,8,8,7,7,7,7,7,7,7,...,7,8,7,7,8,8,8,8,8,7


In [4]:
# Column labels of each track's table, as plain Python lists.
col1 = list(df_track1.columns)
col2 = list(df_track2.columns)

def FindID(ls, id):
    """Look up the first name in ``ls`` that ends with ``-<id>``.

    ls : iterable of strings shaped like ``'<number>-<id>'``.
    id : value whose ``str()`` form is matched after the dash at the end of
         each name (it is inserted into a regular expression as-is).

    Returns ``(name, number)`` where ``number`` is the text before the first
    ``'-'`` of the matching name converted to ``int``, or ``None`` when no
    name matches.
    """
    pattern = '-' + str(id) + '$'
    matches = [name for name in ls if re.search(pattern, name)]
    if not matches:
        return None
    first = matches[0]
    dash = first.find('-')
    return (first, int(first[:dash]))

# For every id in 0 .. len(col2) - 1, pair the FindID result for track 1 with
# the one for track 2.  An entry is None where FindID finds no column whose
# name ends in that id.
cols = [(FindID(col1, model_id), FindID(col2, model_id)) for model_id in range(len(col2))]
cols[0]

(('1239182-0', 1239182), ('22702152-0', 22702152))

## Simple weighted median

In [5]:
def WeightedMedian(ls): # ls : list of (val, w)
    """Return the weighted median of ``(val, w)`` pairs.

    The pairs are ordered by value, the weights are accumulated, and the value
    returned is the first one whose cumulative weight reaches half of the
    total weight.

    ls : iterable of ``(val, w)`` pairs.  It is NOT modified; a sorted copy is
         used internally.

    Raises ValueError if ``ls`` is empty.

    NOTE(review): np.searchsorted expects the cumulative weights to be
    ascending, which holds when all weights are non-negative -- confirm that
    callers never produce negative weights.
    """
    ordered = sorted(ls)  # sorted copy: leave the caller's list untouched
    if not ordered:
        raise ValueError('WeightedMedian() needs at least one (val, w) pair')
    cum = np.cumsum([pair[1] for pair in ordered])
    ind = np.searchsorted(cum, cum[-1] / 2.)
    return ordered[ind][0]

In [6]:
from multiprocessing import cpu_count, Pool

class GetPredict(object):
    """Callable that reduces every row of a frame to one weighted median.

    w : sequence of entries whose item 0 is a column label and item 1 is the
        weight given to that column's value.

    Written as a class rather than a closure; it is passed to ``pool.map`` by
    ApplyPara in this notebook.
    """
    def __init__(self, w):
        self.w = w

    def __call__(self, df):
        def blend_row(row):
            # Pair each selected column's value in this row with its weight.
            pairs = [(row[entry[0]], entry[1]) for entry in self.w]
            return WeightedMedian(pairs)
        return df.apply(blend_row, axis = 1)

def ApplyPara(data, func):
    """Apply ``func`` to row-wise chunks of ``data`` in parallel and concatenate.

    data : object sliceable along its rows; ``.iloc`` is used when present
           (DataFrame / Series), plain slicing otherwise.
    func : picklable callable taking one chunk and returning a pandas object.

    Returns ``pd.concat`` of the per-chunk results, in the original row order.
    The worker pool is always closed and joined, even when ``func`` raises.
    """
    n_rows = len(data)
    # Never use more chunks (or workers) than rows, so func is not handed
    # empty chunks when the input is shorter than the number of cores.
    n_chunks = max(1, min(cpu_count(), n_rows))
    # Same chunk layout as np.array_split: the first `extra` chunks get one
    # additional row.  Slicing directly avoids np.array_split's reliance on
    # DataFrame.swapaxes, which recent pandas versions deprecate.
    base, extra = divmod(n_rows, n_chunks)
    sizes = [base + 1] * extra + [base] * (n_chunks - extra)
    bounds = np.cumsum([0] + sizes)
    rows = data.iloc if hasattr(data, 'iloc') else data
    data_split = [rows[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
    pool = Pool(n_chunks)
    try:
        return pd.concat(pool.map(func, data_split))
    finally:
        # Release the worker processes whether or not the map succeeded.
        pool.close()
        pool.join()

def Blend(base1, base2, id):
    """Write weighted-median blends of both tracks to ``blend/blend<id>-{1,2}.csv``.

    Each column found in ``cols`` gets the weight ``1 / (number - base)``,
    where ``number`` is the integer parsed from the column name and ``base``
    is ``base1`` for track 1 and ``base2`` for track 2.  Entries of ``cols``
    that are ``None`` for a track are skipped for that track.

    base1, base2 : offsets subtracted from the parsed numbers; a base equal to
                   one of those numbers raises ZeroDivisionError.
    id           : label inserted into the output file names.

    The files are written without index and without a header row, because the
    readers in this notebook load them with ``header=None``.
    """
    w_track1 = [(found[0], 1 / (found[1] - base1)) for found, _ in cols if found]
    w_track2 = [(found[0], 1 / (found[1] - base2)) for _, found in cols if found]
    # header=False is spelled out: current pandas writes a header row for a
    # Series by default, which the header=None readers would take as data.
    ApplyPara(df_track1, GetPredict(w_track1)).to_csv('blend/blend' + str(id) + '-1.csv', index = False, header = False)
    ApplyPara(df_track2, GetPredict(w_track2)).to_csv('blend/blend' + str(id) + '-2.csv', index = False, header = False)

In [87]:
# Blends 1-4; each tuple is (base1, base2, id) exactly as passed to Blend.
for blend_args in [
    (1150000, 21000000, 1),
    (1160000, 21600000, 2),
    (1200000, 22000000, 3),
    (1180000, 21800000, 4),
]:
    Blend(*blend_args)

In [89]:
# Blend 5 -> blend/blend5-1.csv and blend/blend5-2.csv
Blend(base1 = 1220000, base2 = 22100000, id = 5)

In [90]:
# Blend 6 -> blend/blend6-1.csv and blend/blend6-2.csv
Blend(base1 = 1225000, base2 = 22050000, id = 6)

In [92]:
# Blend 7 -> blend/blend7-1.csv and blend/blend7-2.csv
Blend(base1 = 1230000, base2 = 21950000, id = 7)

In [95]:
# Blend 8 -> blend/blend8-1.csv and blend/blend8-2.csv
Blend(base1 = 1231000, base2 = 22000000, id = 8)

## Conditional blending
Simple rule: for each row, select one of two models' predictions by comparing against a threshold.

In [7]:
def Select(a, b, thresh, mode):
    """Pick one of two predictions according to a threshold rule.

    a, b   : the two candidate values; both are converted with ``float()``.
    thresh : threshold the rule compares against.
    mode   : which rule to apply
             1 -- ``a`` if ``a < thresh`` else ``b``
             2 -- ``b`` if ``b > thresh`` else ``a``
             3 -- ``a`` if the mean of ``a`` and ``b`` is below ``thresh`` else ``b``

    Returns the selected value as a float.
    Raises ValueError for any other ``mode`` instead of silently returning
    ``None``.
    """
    a, b = float(a), float(b)
    if mode == 1:
        return a if a < thresh else b
    if mode == 2:
        return b if b > thresh else a
    if mode == 3:
        return a if (a + b) / 2 < thresh else b
    raise ValueError('unknown mode: %r (expected 1, 2 or 3)' % (mode,))
class GetCondPredict(object):
    """Callable that applies Select to the 'low' and 'high' columns of each row.

    thresh, mode : stored at construction and forwarded unchanged to Select
                   for every row.
    """
    def __init__(self, thresh, mode):
        self.thresh = thresh
        self.mode = mode

    def __call__(self, df):
        def choose(row):
            low, high = row['low'], row['high']
            return Select(low, high, self.thresh, self.mode)
        return df.apply(choose, axis = 1)

def CondBlend(df, thresh, mode, id):
    """Run the conditional selection over ``df`` and save it as a CSV.

    df           : frame handed to GetCondPredict, which reads its 'low' and
                   'high' columns.
    thresh, mode : forwarded to GetCondPredict.
    id           : label inserted into the output name
                   ``blend/cond_blend<id>.csv``.

    The file is written without index and without a header row, because
    ToInteger in this notebook reads it back with ``header=None``.
    """
    selected = ApplyPara(df, GetCondPredict(thresh, mode))
    # header=False is spelled out: current pandas writes a header row for a
    # Series by default, which a header=None reader would take as data.
    selected.to_csv('blend/cond_blend' + str(id) + '.csv', index = False, header = False)

def Merge(file1, file2):
    """Load two header-less CSV files and place them side by side.

    file1, file2 : paths of the CSV files; each is read with ``header=None``.

    Returns a DataFrame whose columns are renamed to ``'low'`` (from
    ``file1``) and ``'high'`` (from ``file2``).  The assignment of the two
    names fails unless the concatenation has exactly two columns.
    """
    frames = [pd.read_csv(path, header=None) for path in (file1, file2)]
    merged = pd.concat(frames, axis = 1)
    merged.columns = ['low', 'high']
    return merged

# Preview 10 randomly chosen rows of the merged 'low' / 'high' frame (no
# random_state is passed, so the rows differ between runs).
Merge('blend/blend3-2.csv', 'blend/blend7-1.csv').sample(10)

Unnamed: 0,low,high
81853,7.4318,8
169235,6.0,6
146219,7.0,7
138010,8.0,8
112379,7.0,8
19281,9.52048,10
21895,9.0,10
94331,7.0,8
74609,8.0,8
154577,6.0,6


In [141]:
# Mode 1 with thresholds 6 and 5 -> cond_blend1 / cond_blend2.
for thresh, out_id in [(6, 1), (5, 2)]:
    CondBlend(Merge('blend/blend3-2.csv', 'blend/blend7-1.csv'), thresh, 1, out_id)

In [142]:
# Mode 1 with thresholds 8 and 8.5 -> cond_blend3 / cond_blend4.
for thresh, out_id in [(8, 3), (8.5, 4)]:
    CondBlend(Merge('blend/blend3-2.csv', 'blend/blend7-1.csv'), thresh, 1, out_id)

In [144]:
# Mode 2 with thresholds 8 and 9 -> cond_blend5 / cond_blend6.
for thresh, out_id in [(8, 5), (9, 6)]:
    CondBlend(Merge('blend/blend3-2.csv', 'blend/blend7-1.csv'), thresh, 2, out_id)

In [145]:
# (thresh, mode, id): mode 1 at threshold 9 -> cond_blend7, mode 3 at threshold 8 -> cond_blend8.
for thresh, mode, out_id in [(9, 1, 7), (8, 3, 8)]:
    CondBlend(Merge('blend/blend3-2.csv', 'blend/blend7-1.csv'), thresh, mode, out_id)

In [8]:
class GetInt(object):
    """Callable that converts the first column of each row to an integer.

    Every row's first value has 0.5 added and is then passed to ``int()``,
    which truncates toward zero.
    """
    def __init__(self):
        # Stateless: there is nothing to configure.
        pass

    def __call__(self, df):
        def to_int(row):
            shifted = row.iloc[0] + 0.5
            return int(shifted)
        return df.apply(to_int, axis = 1)

def ToInteger(infile, outfile):
    """Convert a header-less CSV of predictions to integers and save the result.

    infile  : path of the CSV to read (loaded with ``header=None``).
    outfile : path of the CSV to write.

    The conversion itself is done by GetInt via ApplyPara.  The output is
    written without index and without a header row, matching the header-less
    layout of the input.
    """
    values = pd.read_csv(infile, header=None)
    # header=False is spelled out: current pandas writes a header row for a
    # Series by default, which would add a spurious first line to the file.
    ApplyPara(values, GetInt()).to_csv(outfile, index = False, header = False)

# Integer versions of cond_blend1 .. cond_blend8.
for blend_no in range(1, 9):
    suffix = str(blend_no) + '.csv'
    ToInteger('blend/cond_blend' + suffix, 'blend/cond_blend_int' + suffix)

In [9]:
# Integer version of the track-2 output of blend 3.
ToInteger(infile = 'blend/blend3-2.csv', outfile = 'blend/blend_int3-2.csv')

In [11]:
# Mode 2 at threshold 4 -> cond_blend9, followed by its integer version.
merged_9 = Merge('blend/blend3-2.csv', 'blend/blend7-1.csv')
CondBlend(merged_9, 4, 2, 9)
ToInteger(infile = 'blend/cond_blend9.csv', outfile = 'blend/cond_blend_int9.csv')