In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/11/08 15:54
# @Author  : Wang Yujia
# @File    : basicinfo_GT_models.ipynb
# @Description : 关于GT模型的计算结果的基本信息

# 0. What for
1. 关于GT模型的计算结果的基本信息

# 1. Preparations
## 1.1 全局设置

In [7]:
import numpy as np
import csv
import pandas as pd
from visdom import Visdom
from tqdm.notebook import tqdm

# 2. Info about 'P' which is the input of NNs
## 2.1 Is the length of ‘P’ the same across different GT models? Yes
1. 比较2个model的P的长度是否一样（一样的话方便画图）：
2. 注意这里的P是**未经过K筛选**的。

In [3]:
# Result files that hold the 'P' column of each GT model.
filename_P_0 = "../data/SA_PT/results/PT_all1303_P.csv"  # GT model-0
filename_P_1 = "../data/info_asymm/results/asc_symmetry/GT_asc_symmetry_2_woKP.csv"  # GT model-1
filename_P_2 = "../data/SA_PT/results/PT_all1303_oneforall_P.csv"  # GT model-2

# Read the three tables in file order; every CSV is decoded as UTF-8.
P_0, P_1, P_2 = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (filename_P_0, filename_P_1, filename_P_2)
)


In [1]:
def transform(str):
    """Parse a stringified numeric array into a flat float64 ndarray.

    Parameters
    ----------
    str : str
        Text form of the values, e.g. "[0.1 0.2 0.3]" or "1 2; 3 4".
        The parameter name shadows the builtin ``str``; it is kept unchanged
        so that existing callers (including keyword callers) keep working.

    Returns
    -------
    numpy.ndarray
        1-D float64 array holding the parsed values in row-major order.
    """
    # np.mat was removed from the main namespace in NumPy 2.0; np.asmatrix is
    # the same string parser and is available in both NumPy 1.x and 2.x.
    parsed = np.asmatrix(str)
    # np.array(...) converts the matrix to a plain ndarray so flatten() is 1-D.
    return np.array(parsed, dtype=np.float64).flatten()

In [6]:
# Report how many settings (rows) each model's result table holds.
print(f"GT model-1 包括了 *{P_1.shape[0]}* 组setting")
print(f"GT model-2 包括了 *{P_2.shape[0]}* 组setting")

# The loop below pairs row i of P_1 with row i of P_2 and only iterates over
# P_1's rows, so a differing row count would either raise a KeyError or leave
# rows unchecked. Fail early and explicitly instead.
assert P_1.shape[0] == P_2.shape[0], "P_1 and P_2 hold a different number of settings!"

# NOTE(review): rows are paired by position; this presumes both tables list
# the settings in the same order - confirm.
for i in range(0,P_1.shape[0]):
    # Parse the stringified 'P' values into float arrays.
    p_1_i = transform(P_1.loc[i,'P'])
    p_2_i = transform(P_2.loc[i,'P'])
    assert len(p_1_i) == len(p_2_i), "P不等长!"
print("两个GTmodel的'P'等长")

GT model-1 包括了 *612* 组setting
GT model-2 包括了 *612* 组setting
两个GTmodel的'P'等长


## 2.2 compare ‘P’ generated by different GT models [2 lines]
1. 比较两个P的图像，画在visdom中

In [7]:
# Open question: after adding use_incoming_socket=False, 'append' updates no longer seem to work?
env_str = "compare_P"
# Pass env_str instead of repeating the literal, so the environment name used
# by the plotting cells and the one the client is bound to cannot drift apart.
viz = Visdom(env = env_str)

Setting up a new session...


Exception in user code:
------------------------------------------------------------


Traceback (most recent call last):
  File "D:\Anaconda\envs\pythorch\lib\site-packages\urllib3\connection.py", line 175, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw
  File "D:\Anaconda\envs\pythorch\lib\site-packages\urllib3\util\connection.py", line 95, in create_connection
    raise err
  File "D:\Anaconda\envs\pythorch\lib\site-packages\urllib3\util\connection.py", line 85, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [WinError 10061] 由于目标计算机积极拒绝，无法连接。

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Anaconda\envs\pythorch\lib\site-packages\urllib3\connectionpool.py", line 710, in urlopen
    chunked=chunked,
  File "D:\Anaconda\envs\pythorch\lib\site-packages\urllib3\connectionpool.py", line 398, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "D:\Anaconda\envs\pythorch\lib\site-packages\urllib3\connection.py", line 239, in request
    super

In [None]:
# For every setting, draw the 'P' curves of model-1 and model-2 into one
# shared visdom window named after the row number.
for i in range(P_1.shape[0]):
    # Parse the stringified 'P' values into float arrays.
    p_1_i = transform(P_1.loc[i,'P'])
    p_2_i = transform(P_2.loc[i,'P'])

    # Auction setting of this row (taken from P_1), shown in the window title.
    v, b, d = (P_1.loc[i, column] for column in ('retail', 'bidfee', 'bidincrement'))
    window = "P_" + str(i)

    # The first call also carries the title / legend options of the window.
    viz.line(
        X=np.arange(0, len(p_1_i)),
        Y=p_1_i,
        win=window,
        env=env_str,
        update='append',
        name="model-1",
        opts=dict(title=f'P_{i}_v={v}_b={b}_d={d}', showlegend=True),
    )
    viz.line(
        X=np.arange(0, len(p_2_i)),
        Y=p_2_i,
        win=window,
        env=env_str,
        update='append',
        name="model-2",
    )

print("Done")

In [None]:
# Save the whole 'compare_P' environment.
viz.save(envs=['compare_P'])

## 2.3 compare ‘P’ under different params [3 lines]
1. 这次画3条曲线，2条和上面一样，多一条表示的是对每个settings做一次infer得到params，这个params下generate出来的P曲线
2. 因此有1303而不是612组“settings”

In [None]:
# Preview the first rows of the GT model-1 result table.
P_1.head()

In [None]:
def _P_for_setting(table, v, b, d, label):
    """Return the parsed 'P' array of the single row of `table` matching (v, b, d)."""
    hits = table.loc[
        (table['retail'] == v) & (table['bidfee'] == b) & (table['bidincrement'] == d),
        'P',
    ]
    if len(hits) != 1:
        # Series.item() would raise a generic ValueError here; keep the
        # exception type but say which table and which setting is at fault.
        raise ValueError(
            f"{label}: expected exactly 1 row for retail={v}, bidfee={b}, "
            f"bidincrement={d}, found {len(hits)}"
        )
    return transform(hits.iloc[0])


def get_P_from_i(i):
    """Look up the model-1 and model-2 'P' arrays for the setting of row `i` of P_0.

    Parameters
    ----------
    i : index label
        Row label in the global table P_0 whose retail / bidfee / bidincrement
        values identify the setting.

    Returns
    -------
    tuple
        (p_1_arr, p_2_arr): the parsed 'P' arrays taken from the global tables
        P_1 and P_2 for that setting.

    Raises
    ------
    ValueError
        If P_1 or P_2 does not contain exactly one row for the setting.
    """
    v = P_0.loc[i,'retail']
    b = P_0.loc[i,'bidfee']
    d = P_0.loc[i,'bidincrement']

    p_1_arr = _P_for_setting(P_1, v, b, d, "P_1")
    p_2_arr = _P_for_setting(P_2, v, b, d, "P_2")

    return (p_1_arr,p_2_arr)

In [None]:
# Bind the visdom client to a separate environment for the three-curve comparison.
env_str = "compare_P_2"
viz = Visdom(env=env_str)

In [None]:
# For every row of P_0, draw three 'P' curves into one visdom window:
# model-0 from the row itself, model-1 / model-2 looked up by the row's setting.
for i in tqdm(range(P_0.shape[0])):
    p_0_i = transform(P_0.loc[i,'P'])
    # Parsed 'P' arrays of the other two models for the same (v, b, d).
    p_1_i, p_2_i = get_P_from_i(i)

    # Auction setting of this row, shown in the window title.
    v, b, d = (P_0.loc[i, column] for column in ('retail', 'bidfee', 'bidincrement'))
    window = "P_" + str(i)

    # Curve lengths are not compared here; each curve gets its own X range.
    # The first call also carries the title / legend options of the window.
    viz.line(
        X=np.arange(0, len(p_0_i)),
        Y=p_0_i,
        win=window,
        env=env_str,
        update='append',
        name="model-0",
        opts=dict(title=f'P_{i}_v={v}_b={b}_d={d}', showlegend=True),
    )
    for label, curve in (("model-1", p_1_i), ("model-2", p_2_i)):
        viz.line(
            X=np.arange(0, len(curve)),
            Y=curve,
            win=window,
            env=env_str,
            update='append',
            name=label,
        )

print("Done")

In [None]:
# Save the whole environment.
# Visdom.save takes a list of environment names; passing the bare string is
# rejected, so wrap the single name in a list.
viz.save(envs=[env_str])

## 2.4 ‘P’ in the target data
1. 如果setting=[v,b,d]的话，画‘P’没什么意义，因为对于实际数据，不止要考虑这3个设置，还要考虑product_id等

# 3. Info about dataset
## 3.1 How many ‘settings’ in the dataset
1. dataset指的是samples数量在16之上的所有的ascending-price auctions,GT的两个model用这些data做了generate的过程
2. 根据settings含义的不同，需要有不同的统计

In [8]:
# Load the auction dataset.
data_path = "../data/info_asymm/datawithnp_asc_symmetry_2_selected.csv"
data = pd.read_csv(data_path, encoding="utf-8")

# Two definitions of a "setting": without and with product_id.
setting_1 = ['retail','bidfee','bidincrement']
setting_2 = ['product_id'] + setting_1

# Group the rows under each definition.
data_1, data_2 = (data.groupby(keys) for keys in (setting_1, setting_2))

# The number of groups is the number of distinct settings.
for keys, grouped in ((setting_1, data_1), (setting_2, data_2)):
    print(f"按照setting={keys}, 一共有 *{len(grouped)}*")

按照setting=['retail', 'bidfee', 'bidincrement'], 一共有 *612*
按照setting=['product_id', 'retail', 'bidfee', 'bidincrement'], 一共有 *1303*


## 3.2 Do the datasets contain the same number of settings?
1. 2个GT models用的dataset有微小不同，他们的settings数目是一样的吗: YES

In [152]:
# Load the PT dataset and group it under both setting definitions
# (setting_1 / setting_2 come from the previous cell).
data_pt_path = "../data/SA_PT/datawithnp_PT_selected.csv"
data_pt = pd.read_csv(data_pt_path, encoding="utf-8")

data_pt_1 = data_pt.groupby(setting_1)
data_pt_2 = data_pt.groupby(setting_2)

# Number of groups = number of distinct settings under each definition.
n_settings_vbd = len(data_pt_1)
n_settings_with_product = len(data_pt_2)

print(f"按照setting={setting_1}, 一共有 *{n_settings_vbd}*")
print(f"按照setting={setting_2}, 一共有 *{n_settings_with_product}*")

按照setting=['retail', 'bidfee', 'bidincrement'], 一共有 *612*
按照setting=['product_id', 'retail', 'bidfee', 'bidincrement'], 一共有 *1303*


## 3.3 What if I drop settings with short 'T'?
1. 尝试drop掉`T < drop_size` 的settings，注意T = (v-b)/d
2. 这里settings指的是setting_1 = ['retail','bidfee','bidincrement']

In [14]:
# Load the settings key table; the cells below read its retail, bidfee and
# bidincrement columns.
data_key_path = "../data/SA_PT/data_key_PT_vbd.csv"
data_key_vbd = pd.read_csv(data_key_path,encoding="utf-8")

In [37]:
# T = (retail - bidfee) / bidincrement; the integer cast truncates the quotient.
t_quotient = (data_key_vbd['retail'] - data_key_vbd['bidfee']) / data_key_vbd['bidincrement']
data_key_vbd['T'] = np.array(t_quotient, dtype=int)

# Order the settings by T (ascending) in place and renumber the index.
data_key_vbd.sort_values(by='T', ascending=True, ignore_index=True, inplace=True)

t_column = data_key_vbd['T']
print(f"mean of T: *{np.mean(t_column)}*")
print(f"median of T: *{np.median(t_column)}*")

mean of T: *17756.995098039217*
median of T: *1398.0*


In [33]:
# Count how many settings survive when those with T below the threshold are dropped.
drop_size = 100
n_total = data_key_vbd.shape[0]
print(f"最初settings数量为 *{n_total}*")

# Keep only the settings whose T reaches the threshold.
data_key_ls100 = data_key_vbd[data_key_vbd['T'] >= drop_size]
n_kept = data_key_ls100.shape[0]
n_dropped = n_total - n_kept

print(f"drop_size = *{drop_size}*")
print(f"剩余settings数量为 *{n_kept}*")
print(f"drop掉了 *{n_dropped}* 个setting，占比：*{n_dropped/n_total}*")

最初settings数量为 *612*
drop_size = *100*
剩余settings数量为 *591*
drop掉了 *21* 个setting，占比：*0.03431372549019608*


In [34]:
# Count how many settings survive when those with T below the threshold are dropped.
drop_size = 150
n_total = data_key_vbd.shape[0]
print(f"最初settings数量为 *{n_total}*")

# Keep only the settings whose T reaches the threshold.
# The result keeps the name data_key_ls100 although the threshold here is 150.
data_key_ls100 = data_key_vbd[data_key_vbd['T'] >= drop_size]
n_kept = data_key_ls100.shape[0]
n_dropped = n_total - n_kept

print(f"drop_size = *{drop_size}*")
print(f"剩余settings数量为 *{n_kept}*")
print(f"drop掉了 *{n_dropped}* 个setting，占比：*{n_dropped/n_total}*")

最初settings数量为 *612*
drop_size = *150*
剩余settings数量为 *573*
drop掉了 *39* 个setting，占比：*0.06372549019607843*


In [32]:
# Count how many settings survive when those with T below the threshold are dropped.
drop_size = 200
n_total = data_key_vbd.shape[0]
print(f"最初settings数量为 *{n_total}*")

# Keep only the settings whose T reaches the threshold.
# The result keeps the name data_key_ls100 although the threshold here is 200.
data_key_ls100 = data_key_vbd[data_key_vbd['T'] >= drop_size]
n_kept = data_key_ls100.shape[0]
n_dropped = n_total - n_kept

print(f"drop_size = *{drop_size}*")
print(f"剩余settings数量为 *{n_kept}*")
print(f"drop掉了 *{n_dropped}* 个setting，占比：*{n_dropped/n_total}*")

最初settings数量为 *612*
drop_size = *200*
剩余settings数量为 *546*
drop掉了 *66* 个setting，占比：*0.10784313725490197*


In [38]:
# Count how many settings survive when those with T below the threshold are dropped.
drop_size = 300
n_total = data_key_vbd.shape[0]
print(f"最初settings数量为 *{n_total}*")

# Keep only the settings whose T reaches the threshold.
# The result keeps the name data_key_ls100 although the threshold here is 300.
data_key_ls100 = data_key_vbd[data_key_vbd['T'] >= drop_size]
n_kept = data_key_ls100.shape[0]
n_dropped = n_total - n_kept

print(f"drop_size = *{drop_size}*")
print(f"剩余settings数量为 *{n_kept}*")
print(f"drop掉了 *{n_dropped}* 个setting，占比：*{n_dropped/n_total}*")

最初settings数量为 *612*
drop_size = *300*
剩余settings数量为 *490*
drop掉了 *122* 个setting，占比：*0.19934640522875818*


## 3.4 when "v-C(t-1)-dt-b"=0，t=？
1. v-C(t-1)-dt-b=0时，t为多少？对每个settings都进行求解，得出最大值
2. 求这个值是为了设定threshold K

In [12]:
# GT model-2 (PT)
filename_P_2 = "../data/SA_PT/results/PT_all1303_oneforall_P.csv"
P_2 = pd.read_csv(filename_P_2, encoding="utf-8")

# Solve for t for all settings at once instead of a row-by-row .loc loop.
# NOTE(review): solving v - C(t-1) - d*t - b = 0 gives t = (v - b + C) / (C + d).
# The numerator below (v - 0.8*b) corresponds to C = 0.2*b, while the
# denominator (0.2 + d) corresponds to C = 0.2; the two agree only when b == 1.
# Formula kept exactly as written - confirm the intended C.
t_exact = (P_2['retail'] - 0.8*P_2['bidfee']) / (0.2 + P_2['bidincrement'])

# The values were previously written one by one into an integer array, which
# silently truncated them toward zero; keep that result but make the cast explicit.
t_K = t_exact.to_numpy().astype(int)

print(f"v-C(t-1)-dt-b=0时")
print(f"t平均有: {np.mean(t_K)}")
print(f"t中位数有: {np.median(t_K)}")

v-C(t-1)-dt-b=0时
t平均有: 1515.4787581699347
t中位数有: 490.5
