In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import k_means
import community
import networkx as nx
from tqdm import tqdm
import gc

In [2]:
# Load the array stored in 'mat.npy' and add 1 to every entry.
# NOTE(review): presumably missing observations are stored as -1, so the shift
# turns them into 0 (later cells count `> 0` entries as observed) - confirm.
mat = np.load('mat.npy') + 1

In [3]:
# In-place: every entry that is <= 2 after the +1 shift is reset to 0.
# NOTE(review): this mutates `mat`; re-running the cell is harmless, but the
# unmasked values can only be recovered by reloading 'mat.npy'.
mat[mat <= 2] = 0

In [4]:
# Lay the first 31 slices of `mat` side by side with a single concatenation.
# Calling np.hstack inside a loop would re-copy the growing array each time.
# The explicit range keeps the IndexError if `mat` has fewer than 31 slices.
flat_mat = np.hstack([mat[d] for d in range(31)])

In [5]:
def coo_cosine_similarity(input_coo_matrix):
    """Return the row-by-row cosine similarity of a scipy sparse matrix.

    Parameters
    ----------
    input_coo_matrix : scipy sparse matrix
        Any object exposing ``tocsr()``; it is not modified.

    Returns
    -------
    scipy sparse matrix (CSR)
        Square matrix whose entry (i, j) is the cosine similarity between
        rows i and j.  Rows that are entirely zero get similarity 0 with
        every row, including themselves.
    """
    # astype() copies, so the caller's matrix is untouched, and the float
    # dtype lets the in-place division below work for integer input too.
    csr = input_coo_matrix.tocsr().astype(np.float64)

    # Euclidean norm of each row: sqrt of the sum of squared entries.
    row_norms = np.sqrt(np.asarray(csr.multiply(csr).sum(axis=1)).ravel())
    # Avoid 0/0 for empty rows; they have no non-zero data to scale anyway.
    row_norms[row_norms == 0] = 1.0

    # csr.data is ordered row by row and indptr gives the number of stored
    # entries per row, so this lines every stored value up with its row norm.
    entries_per_row = np.diff(csr.indptr)
    csr.data /= np.repeat(row_norms, entries_per_row)

    # Dot products of unit-length rows are the cosine similarities.
    return csr.dot(csr.T)

In [6]:
def csr_cosine_similarity(input_csr_matrix):
    """Return the pairwise cosine similarity between the rows of a matrix.

    Despite the name, the only call site in this notebook passes a dense
    2-D NumPy array, and the broadcasting used below relies on that.

    Parameters
    ----------
    input_csr_matrix : array-like, shape (n_rows, n_features)

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_rows)
        Entry (i, j) is the cosine similarity of rows i and j.  A row with
        zero magnitude gets similarity 0 with every row.
    """
    similarity = np.dot(input_csr_matrix, input_csr_matrix.T)

    # The diagonal of the Gram matrix holds each row's squared magnitude.
    square_mag = np.asarray(similarity.diagonal(), dtype=np.float64)

    # Take the reciprocal only where it is defined, so all-zero rows map to
    # 0 without triggering a divide-by-zero RuntimeWarning.
    inv_mag = np.zeros_like(square_mag)
    has_magnitude = square_mag > 0
    inv_mag[has_magnitude] = np.sqrt(1.0 / square_mag[has_magnitude])

    # Scale the columns, transpose, scale the columns again: every entry
    # (i, j) ends up divided by |row_i| * |row_j|.
    return (similarity * inv_mag).T * inv_mag

In [7]:
## Generation graph based on the time series similarity
cos = csr_cosine_similarity(flat_mat)

G = nx.Graph()
# Keep only pairs whose similarity is above the 75th percentile of the
# strictly positive similarities.
thres = np.percentile(cos[cos > 0], 75)

# Walk the upper triangle one row at a time: the threshold test is vectorised
# per row instead of one Python-level lookup per (i, j) pair, and the matrix
# size comes from `cos` itself rather than a hard-coded constant.
n_links = cos.shape[0]
for i in tqdm(range(n_links)):
    row_tail = cos[i, i + 1:]
    above = np.flatnonzero(row_tail > thres)
    G.add_weighted_edges_from(
        (i, i + 1 + int(offset), row_tail[offset]) for offset in above
    )

# Louvain communities: {node id -> community id}.  Nodes that have no edge
# above the threshold are not in G and therefore get no group.
partition = community.best_partition(G)

partition = pd.DataFrame({'link': list(partition.keys()), 'sim_group': list(partition.values())})

# map.csv translates the matrix row ids used as graph nodes back to link ids.
mapp = pd.read_csv('map.csv')
mapp = dict(zip(mapp['id'], mapp['link']))

partition['link'] = partition['link'].apply(lambda x: mapp[x])

partition.to_csv('./data/partition_group.csv', index=False)

  after removing the cwd from sys.path.


In [None]:
# Generation graph based on the latent factor similarity

# Load the inputs here instead of relying on variables left behind by other
# cells: `P` is not defined by any earlier cell, and the `mapp` built by the
# similarity-graph cell maps id -> link, so inverting it would give the wrong
# direction for the lookup below.
P = np.load('finalP.npy')

link_map = pd.read_csv('map.csv')
mapp_re = dict(zip(link_map['id'], link_map['link']))  # row id -> link id

# k_means returns (centroids, labels, inertia, ...); a fixed seed makes the
# grouping reproducible across kernel restarts.
group = k_means(P, 24, random_state=0)

group_latent = group[1]

# Row x of P belongs to the link with id x in map.csv.
latent_group = pd.DataFrame({
    'link': [mapp_re[x] for x in range(len(P))],
    'latent_group': group_latent,
})

latent_group.to_csv('./data/latent_group.csv', index=False)

In [None]:
# Generation graph based on the spatial topology

# Each non-blank line is "<link>\t<neighbour>,<neighbour>,...".  Blank lines
# (for example the one produced by a trailing newline) are skipped so they do
# not raise an IndexError on row[1].
with open('./data/topo.txt') as f:
    content = [line.split('\t') for line in f.read().split('\n') if line.strip()]

G = nx.Graph()

for row in content:
    inr = row[0]
    # Drop empty ids so a dangling comma or an empty neighbour list does not
    # create a spurious '' node.
    outrs = [outr for outr in row[1].split(',') if outr]
    for outr in outrs:
        G.add_edge(inr, outr)

# Louvain communities: {link id (as read from the file) -> community id}.
partition = community.best_partition(G)

topo_group = pd.DataFrame({
    'link': list(partition.keys()),
    'topo_group': list(partition.values()),
})

topo_group.to_csv('./data/topo_group.csv', index=False)

In [None]:
#### Feature 1205
for date in range(0, 32):
    train = pd.read_csv('./data/traffic/train_table_%02d.gz'%date)

    ## 根据原始信息做的特征

    train['date'] = date - 1

    train['time_diff'] = train['predict_time'] - train['current_time']

    train['predict_hour'] = train['predict_time'] // 30

    train['week'] = train['date'] % 7

    def check_workday(x):
        if x in [5,6,12,13,19,20,26,27]:
            return 0
        else:
            return 1

    train['workday'] = train['date'].apply(check_workday)

    ## 加入路段特征

    with open('./data/attr.txt') as f:
        content = f.read()

    attr = pd.DataFrame([x.split('\t') for x in content.split('\n')])[:-1]

    attr.columns = ['attr_%d'%i for i in range(9)]

    attr = attr.astype(np.float32)

    train = pd.merge(left=train, right=attr, left_on = 'link', right_on='attr_0',how='left')

    train = train.drop(columns='attr_0')

    ## 加入隐含因子

    P = np.load('finalP.npy')

    Q = np.load('finalQ_pred.npy')

    Q_local = Q[720 * (date - 1): 720 * date]

    mapp = pd.read_csv('map.csv')
    mapp = dict(zip(mapp['link'], mapp['id']))

    features = []
    pred = []
    pmean, qmean = [],[]
    pstd, qstd = [],[]
    for i,eachrow in train.iterrows():
        pred.append(np.dot(P[mapp[int(eachrow['link'])]], Q_local[int(eachrow['predict_time'])]))
        pmean.append(np.average(P[mapp[int(eachrow['link'])]]))
        qmean.append(np.average(Q_local[int(eachrow['predict_time'])]))
        pstd.append(np.std(P[mapp[int(eachrow['link'])]]))
        qstd.append(np.std(Q_local[int(eachrow['predict_time'])]))
        features.append(np.hstack([P[mapp[int(eachrow['link'])]], Q_local[int(eachrow['predict_time'])]]))

    features = pd.DataFrame(features)
    features.columns = ['P%d'%x for x in range(64)] + ['Q%d'%x for x in range(64)]
    train = pd.concat([train, features], axis = 1)
    train['mf_pred'] = pred
    train['p_mean'] = pmean
    train['q_mean'] = qmean
    train['p_std'] = pstd
    train['q_std'] = qstd


    topo_group = pd.read_csv('./data/topo_group.csv')

    latent_group = pd.read_csv('./data/latent_group.csv')

    train = pd.merge(left=train, right = topo_group, how='left', on='link')

    train = pd.merge(left=train, right = latent_group, how='left', on='link')

    ## 抽取昨天的矩阵特征

    mat = np.load('mat.npy') + 1

    mat_last = mat[date - 1 - 1]

    time_average = np.sum(mat_last, axis = 0) / np.sum(mat_last > 0, axis = 0)

    link_average = np.sum(mat_last, axis = 1) / np.sum(mat_last > 0, axis = 1)

    ## 昨天所有统计值
    features = []
    for i,eachrow in train.iterrows():
        link = mapp[eachrow['link']]
        time = int(eachrow['predict_time'])
        features.append([time_average[time], link_average[link]])
    features = pd.DataFrame(features)
    features.columns = ['time_avg_lastday','link_avg_lastday']
    train = pd.concat([train, features], axis = 1)

    ## 昨天目标区域附近统计值
    features = []
    for i,eachrow in train.iterrows():
        link = mapp[eachrow['link']]
        time = int(eachrow['predict_time'])
        average5 = mat_last[link, time - 5: time+5]
        average10 = mat_last[link, time - 10: time+10]
        average15 = mat_last[link, time - 15: time+15]
        average20 = mat_last[link, time - 20: time+20]
        average30 = mat_last[link, time - 30: time+30]
        features.append([np.sum(average5) / np.sum(average5 > 0),
                         np.sum(average10) / np.sum(average10 > 0),
                         np.sum(average15) / np.sum(average15 > 0),
                         np.sum(average20) / np.sum(average20 > 0),
                         np.sum(average30) / np.sum(average30 > 0),
                        ])
    features = pd.DataFrame(features)
    features.columns = ['target_avg_lastday_5','target_avg_lastday_10',
                        'target_avg_lastday_15','target_avg_lastday_20','target_avg_lastday_30']
    train = pd.concat([train, features], axis = 1)

    ## 抽取前天的特征
    mat = np.load('mat.npy') + 1

    mat_last = mat[date - 1 - 2]

    time_average = np.sum(mat_last, axis = 0) / np.sum(mat_last > 0, axis = 0)

    link_average = np.sum(mat_last, axis = 1) / np.sum(mat_last > 0, axis = 1)

    ## 前天所有统计值
    features = []
    for i,eachrow in train.iterrows():
        link = mapp[eachrow['link']]
        time = int(eachrow['predict_time'])
        features.append([time_average[time], link_average[link]])
    features = pd.DataFrame(features)
    features.columns = ['time_avg_beforelastday','link_avg_beforelastday']
    train = pd.concat([train, features], axis = 1)

    ## 前天目标区域附近统计值
    features = []
    for i,eachrow in train.iterrows():
        link = mapp[eachrow['link']]
        time = int(eachrow['predict_time'])
        average5 = mat_last[link, time - 5: time+5]
        average10 = mat_last[link, time - 10: time+10]
        average15 = mat_last[link, time - 15: time+15]
        average20 = mat_last[link, time - 20: time+20]
        average30 = mat_last[link, time - 30: time+30]
        features.append([np.sum(average5) / np.sum(average5 > 0),
                         np.sum(average10) / np.sum(average10 > 0),
                         np.sum(average15) / np.sum(average15 > 0),
                         np.sum(average20) / np.sum(average20 > 0),
                         np.sum(average30) / np.sum(average30 > 0),
                        ])
    features = pd.DataFrame(features)
    features.columns = ['target_avg_beforelastday_5','target_avg_beforelastday_10',
                        'target_avg_beforelastday_15','target_avg_beforelastday_20','target_avg_beforelastday_30']
    train = pd.concat([train, features], axis = 1)

    ## 抽取一周前的矩阵特征

    mat = np.load('mat.npy') + 1

    mat_last = mat[date - 1 - 7]

    time_average = np.sum(mat_last, axis = 0) / np.sum(mat_last > 0, axis = 0)

    link_average = np.sum(mat_last, axis = 1) / np.sum(mat_last > 0, axis = 1)

    ## 上周所有统计值
    features = []
    for i,eachrow in train.iterrows():
        link = mapp[eachrow['link']]
        time = int(eachrow['predict_time'])
        features.append([time_average[time], link_average[link]])
    features = pd.DataFrame(features)
    features.columns = ['time_avg_lastweek','link_avg_lastweek']
    train = pd.concat([train, features], axis = 1)

    ## 上周目标区域附近统计值
    features = []
    for i,eachrow in train.iterrows():
        link = mapp[eachrow['link']]
        time = int(eachrow['predict_time'])
        average5 = mat_last[link, time - 5: time+5]
        average10 = mat_last[link, time - 10: time+10]
        average15 = mat_last[link, time - 15: time+15]
        average20 = mat_last[link, time - 20: time+20]
        average30 = mat_last[link, time - 30: time+30]
        features.append([np.sum(average5) / np.sum(average5 > 0),
                         np.sum(average10) / np.sum(average10 > 0),
                         np.sum(average15) / np.sum(average15 > 0),
                         np.sum(average20) / np.sum(average20 > 0),
                         np.sum(average30) / np.sum(average30 > 0),
                        ])
    features = pd.DataFrame(features)
    features.columns = ['target_avg_lastweek_5','target_avg_lastweek_10',
                        'target_avg_lastweek_15','target_avg_lastweek_20','target_avg_lastweek_30']
    train = pd.concat([train, features], axis = 1)

    ## 提取翌日特征
    mat = np.load('mat.npy') + 1

    week_index = []
    for i in range(-10, 10):
        week_index.append(date - 1 + i * 7)
    week_index = [x for x in week_index if 0 <= x <= 31]
    mat_last = mat[week_index]
    mat_last = np.sum(mat_last, axis = 0) / np.sum(mat_last > 0, axis = 0)

    time_average = np.sum(mat_last, axis = 0) / np.sum(mat_last > 0, axis = 0)

    link_average = np.sum(mat_last, axis = 1) / np.sum(mat_last > 0, axis = 1)

    ## 所有该翌日统计值
    features = []
    for i,eachrow in train.iterrows():
        link = mapp[eachrow['link']]
        time = int(eachrow['predict_time'])
        features.append([time_average[time], link_average[link]])
    features = pd.DataFrame(features)
    features.columns = ['time_avg_week','link_avg_week']
    train = pd.concat([train, features], axis = 1)

    ## 翌日目标区域附近统计值
    features = []
    for i,eachrow in train.iterrows():
        link = mapp[eachrow['link']]
        time = int(eachrow['predict_time'])
        average5 = mat_last[link, time - 5: time+5]
        average10 = mat_last[link, time - 10: time+10]
        average15 = mat_last[link, time - 15: time+15]
        average20 = mat_last[link, time - 20: time+20]
        average30 = mat_last[link, time - 30: time+30]
        features.append([np.sum(average5) / np.sum(average5 > 0),
                         np.sum(average10) / np.sum(average10 > 0),
                         np.sum(average15) / np.sum(average15 > 0),
                         np.sum(average20) / np.sum(average20 > 0),
                         np.sum(average30) / np.sum(average30 > 0),
                        ])
    features = pd.DataFrame(features)
    features.columns = ['target_avg_week_5','target_avg_week_10',
                        'target_avg_week_15','target_avg_week_20','target_avg_week_30']
    train = pd.concat([train, features], axis = 1)

    train.to_csv('./data/traffic/train_table_%02d_1205.gz'%date)
    gc.collect()

In [None]:
date = 32

train = pd.read_csv('./test_table.gz')

## 根据原始信息做的特征

train['date'] = date - 1

train['time_diff'] = train['predict_time'] - train['current_time']

train['predict_hour'] = train['predict_time'] // 30

train['week'] = train['date'] % 7

def check_workday(x):
    if x in [5,6,12,13,19,20,26,27]:
        return 0
    else:
        return 1

train['workday'] = train['date'].apply(check_workday)

## 加入路段特征

with open('./data/attr.txt') as f:
    content = f.read()

attr = pd.DataFrame([x.split('\t') for x in content.split('\n')])[:-1]

attr.columns = ['attr_%d'%i for i in range(9)]

attr = attr.astype(np.float32)

train = pd.merge(left=train, right=attr, left_on = 'link', right_on='attr_0',how='left')

train = train.drop(columns='attr_0')

## 加入隐含因子

P = np.load('finalP.npy')

Q = np.load('finalQ_pred.npy')

Q_local = Q[720 * (date - 1): 720 * date]

mapp = pd.read_csv('map.csv')
mapp = dict(zip(mapp['link'], mapp['id']))

features = []
pred = []
pmean, qmean = [],[]
pstd, qstd = [],[]
for i,eachrow in train.iterrows():
    pred.append(np.dot(P[mapp[int(eachrow['link'])]], Q_local[int(eachrow['predict_time'])]))
    pmean.append(np.average(P[mapp[int(eachrow['link'])]]))
    qmean.append(np.average(Q_local[int(eachrow['predict_time'])]))
    pstd.append(np.std(P[mapp[int(eachrow['link'])]]))
    qstd.append(np.std(Q_local[int(eachrow['predict_time'])]))
    features.append(np.hstack([P[mapp[int(eachrow['link'])]], Q_local[int(eachrow['predict_time'])]]))

features = pd.DataFrame(features)
features.columns = ['P%d'%x for x in range(64)] + ['Q%d'%x for x in range(64)]
train = pd.concat([train, features], axis = 1)
train['mf_pred'] = pred
train['p_mean'] = pmean
train['q_mean'] = qmean
train['p_std'] = pstd
train['q_std'] = qstd


## 加入link组信息

# group = k_means(P, 24)

# group_latent = group[1]

# mapp_re = {y:x for x,y in mapp.items()}

# latent_group = pd.DataFrame(np.array([[mapp_re[x] for x in range(len(P))],group_latent]).T)

# latent_group.columns =  ['link','latent_group']

# with open('./data/topo.txt') as f:
#     content = f.read()

# content = [x.split('\t') for x in content.split('\n')]

# G = nx.Graph()

# for row in content:
#     inr = row[0]
#     outrs = row[1].split(',')
#     for outr in outrs:
#         G.add_edge(inr, outr)

# topo_group = pd.DataFrame(np.array([list(partition.keys()), list(partition.values())]).T)

# topo_group.columns = ['link','topo_group']

# latent_group.to_csv('./data/latent_group.csv', index=False)

# topo_group.to_csv('./data/topo_group.csv', index=False)

topo_group = pd.read_csv('./data/topo_group.csv')

latent_group = pd.read_csv('./data/latent_group.csv')

train = pd.merge(left=train, right = topo_group, how='left', on='link')

train = pd.merge(left=train, right = latent_group, how='left', on='link')

## 抽取昨天的矩阵特征

mat = np.load('mat.npy') + 1

mat_last = mat[date - 1 - 1]

time_average = np.sum(mat_last, axis = 0) / np.sum(mat_last > 0, axis = 0)

link_average = np.sum(mat_last, axis = 1) / np.sum(mat_last > 0, axis = 1)

## 昨天所有统计值
features = []
for i,eachrow in train.iterrows():
    link = mapp[eachrow['link']]
    time = int(eachrow['predict_time'])
    features.append([time_average[time], link_average[link]])
features = pd.DataFrame(features)
features.columns = ['time_avg_lastday','link_avg_lastday']
train = pd.concat([train, features], axis = 1)

## 昨天目标区域附近统计值
features = []
for i,eachrow in train.iterrows():
    link = mapp[eachrow['link']]
    time = int(eachrow['predict_time'])
    average5 = mat_last[link, time - 5: time+5]
    average10 = mat_last[link, time - 10: time+10]
    average15 = mat_last[link, time - 15: time+15]
    average20 = mat_last[link, time - 20: time+20]
    average30 = mat_last[link, time - 30: time+30]
    features.append([np.sum(average5) / np.sum(average5 > 0),
                     np.sum(average10) / np.sum(average10 > 0),
                     np.sum(average15) / np.sum(average15 > 0),
                     np.sum(average20) / np.sum(average20 > 0),
                     np.sum(average30) / np.sum(average30 > 0),
                    ])
features = pd.DataFrame(features)
features.columns = ['target_avg_lastday_5','target_avg_lastday_10',
                    'target_avg_lastday_15','target_avg_lastday_20','target_avg_lastday_30']
train = pd.concat([train, features], axis = 1)

## 抽取前天的特征
mat = np.load('mat.npy') + 1

mat_last = mat[date - 1 - 2]

time_average = np.sum(mat_last, axis = 0) / np.sum(mat_last > 0, axis = 0)

link_average = np.sum(mat_last, axis = 1) / np.sum(mat_last > 0, axis = 1)

## 前天所有统计值
features = []
for i,eachrow in train.iterrows():
    link = mapp[eachrow['link']]
    time = int(eachrow['predict_time'])
    features.append([time_average[time], link_average[link]])
features = pd.DataFrame(features)
features.columns = ['time_avg_beforelastday','link_avg_beforelastday']
train = pd.concat([train, features], axis = 1)

## 前天目标区域附近统计值
features = []
for i,eachrow in train.iterrows():
    link = mapp[eachrow['link']]
    time = int(eachrow['predict_time'])
    average5 = mat_last[link, time - 5: time+5]
    average10 = mat_last[link, time - 10: time+10]
    average15 = mat_last[link, time - 15: time+15]
    average20 = mat_last[link, time - 20: time+20]
    average30 = mat_last[link, time - 30: time+30]
    features.append([np.sum(average5) / np.sum(average5 > 0),
                     np.sum(average10) / np.sum(average10 > 0),
                     np.sum(average15) / np.sum(average15 > 0),
                     np.sum(average20) / np.sum(average20 > 0),
                     np.sum(average30) / np.sum(average30 > 0),
                    ])
features = pd.DataFrame(features)
features.columns = ['target_avg_beforelastday_5','target_avg_beforelastday_10',
                    'target_avg_beforelastday_15','target_avg_beforelastday_20','target_avg_beforelastday_30']
train = pd.concat([train, features], axis = 1)

## 抽取一周前的矩阵特征

mat = np.load('mat.npy') + 1

mat_last = mat[date - 1 - 7]

time_average = np.sum(mat_last, axis = 0) / np.sum(mat_last > 0, axis = 0)

link_average = np.sum(mat_last, axis = 1) / np.sum(mat_last > 0, axis = 1)

## 上周所有统计值
features = []
for i,eachrow in train.iterrows():
    link = mapp[eachrow['link']]
    time = int(eachrow['predict_time'])
    features.append([time_average[time], link_average[link]])
features = pd.DataFrame(features)
features.columns = ['time_avg_lastweek','link_avg_lastweek']
train = pd.concat([train, features], axis = 1)

## 上周目标区域附近统计值
features = []
for i,eachrow in train.iterrows():
    link = mapp[eachrow['link']]
    time = int(eachrow['predict_time'])
    average5 = mat_last[link, time - 5: time+5]
    average10 = mat_last[link, time - 10: time+10]
    average15 = mat_last[link, time - 15: time+15]
    average20 = mat_last[link, time - 20: time+20]
    average30 = mat_last[link, time - 30: time+30]
    features.append([np.sum(average5) / np.sum(average5 > 0),
                     np.sum(average10) / np.sum(average10 > 0),
                     np.sum(average15) / np.sum(average15 > 0),
                     np.sum(average20) / np.sum(average20 > 0),
                     np.sum(average30) / np.sum(average30 > 0),
                    ])
features = pd.DataFrame(features)
features.columns = ['target_avg_lastweek_5','target_avg_lastweek_10',
                    'target_avg_lastweek_15','target_avg_lastweek_20','target_avg_lastweek_30']
train = pd.concat([train, features], axis = 1)

## 提取翌日特征
mat = np.load('mat.npy') + 1

week_index = []
for i in range(-10, 10):
    week_index.append(date - 1 + i * 7)
week_index = [x for x in week_index if 0 <= x <= 31]
mat_last = mat[week_index]
mat_last = np.sum(mat_last, axis = 0) / np.sum(mat_last > 0, axis = 0)

time_average = np.sum(mat_last, axis = 0) / np.sum(mat_last > 0, axis = 0)

link_average = np.sum(mat_last, axis = 1) / np.sum(mat_last > 0, axis = 1)

## 所有该翌日统计值
features = []
for i,eachrow in train.iterrows():
    link = mapp[eachrow['link']]
    time = int(eachrow['predict_time'])
    features.append([time_average[time], link_average[link]])
features = pd.DataFrame(features)
features.columns = ['time_avg_week','link_avg_week']
train = pd.concat([train, features], axis = 1)

## 翌日目标区域附近统计值
features = []
for i,eachrow in train.iterrows():
    link = mapp[eachrow['link']]
    time = int(eachrow['predict_time'])
    average5 = mat_last[link, time - 5: time+5]
    average10 = mat_last[link, time - 10: time+10]
    average15 = mat_last[link, time - 15: time+15]
    average20 = mat_last[link, time - 20: time+20]
    average30 = mat_last[link, time - 30: time+30]
    features.append([np.sum(average5) / np.sum(average5 > 0),
                     np.sum(average10) / np.sum(average10 > 0),
                     np.sum(average15) / np.sum(average15 > 0),
                     np.sum(average20) / np.sum(average20 > 0),
                     np.sum(average30) / np.sum(average30 > 0),
                    ])
features = pd.DataFrame(features)
features.columns = ['target_avg_week_5','target_avg_week_10',
                    'target_avg_week_15','target_avg_week_20','target_avg_week_30']
train = pd.concat([train, features], axis = 1)

train.to_csv('./data/test_table_1205.gz')