In [1]:
import os
import pandas as pd
from glob import glob
from scipy.io import loadmat

In [2]:
# Load the 2019 MATLAB data file into a dict-like object and list the
# variable names it contains.
mat2019 = loadmat('paintings2019.mat')
print(mat2019.keys())

dict_keys(['__header__', '__version__', '__globals__', 'Decmaps', 'Decpics', 'Dectimes', 'Explactions', 'Explpics', 'Expltimes', 'Groups', 'Moneys', 'Payoffs'])


In [3]:
# Report the container type and the shape of the first row of every data
# field; keys wrapped in double underscores are loadmat metadata and skipped.
for key in mat2019:
    if key.startswith('__'):
        continue
    print(f"{key}: {type(mat2019[key])}, shape: {mat2019[key][0].shape}")
    

Decmaps: <class 'numpy.ndarray'>, shape: (15,)
Decpics: <class 'numpy.ndarray'>, shape: (15,)
Dectimes: <class 'numpy.ndarray'>, shape: (15,)
Explactions: <class 'numpy.ndarray'>, shape: (15,)
Explpics: <class 'numpy.ndarray'>, shape: (15,)
Expltimes: <class 'numpy.ndarray'>, shape: (15,)
Groups: <class 'numpy.ndarray'>, shape: (15,)
Moneys: <class 'numpy.ndarray'>, shape: (15,)
Payoffs: <class 'numpy.ndarray'>, shape: (15,)


In [4]:
def compute_interval_time(arr):
    """Return the four successive differences between the first five entries.

    Parameters
    ----------
    arr : sequence of numbers
        Indexable sequence with at least five entries.

    Returns
    -------
    list
        ``[arr[1]-arr[0], arr[2]-arr[1], arr[3]-arr[2], arr[4]-arr[3]]``, or
        ``[0, 0, 0, 0]`` when any entry of ``arr`` equals 0.
    """
    # A 0 anywhere disables the computation for the whole sequence
    # (presumably 0 marks a missing timestamp -- TODO confirm).
    if 0 not in arr:
        return [arr[j] - arr[j - 1] for j in range(1, 5)]
    return [0] * 4

def compute_expl_interval_time(arr):
    """Return the differences between consecutive entries of ``arr``.

    Parameters
    ----------
    arr : sequence of numbers
        Indexable sequence of any length.

    Returns
    -------
    list
        ``len(arr) - 1`` differences ``arr[j] - arr[j-1]``; all zeros when any
        entry of ``arr`` equals 0. Empty for sequences shorter than two.
    """
    count = len(arr)
    # A 0 anywhere zeroes every interval (presumably a missing-timestamp
    # marker -- TODO confirm).
    if 0 not in arr:
        return [arr[j] - arr[j - 1] for j in range(1, count)]
    return [0] * (count - 1)
    
def compute_expl_time_per_pic(expltimes, explpics):
    """Sum, per picture id, the interval that starts at each of its entries.

    The interval at position ``k`` (``expltimes[k+1] - expltimes[k]``, as
    produced by ``compute_expl_interval_time``) is credited to
    ``explpics[k]``. The last entry of ``explpics`` has no following
    timestamp and is therefore not credited.

    Parameters
    ----------
    expltimes : sequence of numbers
        Timestamps passed to ``compute_expl_interval_time``.
    explpics : sequence
        Picture ids, indexed in step with ``expltimes``.

    Returns
    -------
    dict
        Mapping ``picture id -> accumulated interval``.
    """
    durations = compute_expl_interval_time(expltimes)
    totals = {}
    for position, pic in enumerate(explpics[:-1]):
        elapsed = durations[position]
        if pic in totals:
            totals[pic] += elapsed
        else:
            totals[pic] = elapsed
    return totals

def compute_expl_time(expltimes, explpics, decmaps):
    """Look up the accumulated time of every picture in ``decmaps``.

    Parameters
    ----------
    expltimes, explpics : sequences
        Forwarded to ``compute_expl_time_per_pic`` to build the per-picture
        totals.
    decmaps : sequence of sequences
        One inner sequence of picture ids per row.

    Returns
    -------
    list of list
        Same nesting as ``decmaps``; each picture id is replaced by its total,
        or by 0 when the picture has no total.
    """
    time_by_pic = compute_expl_time_per_pic(expltimes, explpics)
    return [
        [time_by_pic.get(pic, 0) for pic in row_pics]
        for row_pics in decmaps
    ]


In [5]:
# Build one per-round DataFrame for each processed index and collect them in
# `dfs` for concatenation.
# NOTE(review): every field printed by the inspection cell has 15 entries, but
# only the first 11 are processed here -- confirm the rest are excluded on
# purpose.
dfs = []
for i in range(11):
    # IDs are "2019" followed by the 1-based index, zero-padded to two digits.
    ID = "2019" + str(i+1).zfill(2)
    Group = mat2019['Groups'][0][i][0]

    # NOTE(review): list(row) presumably keeps NumPy scalar elements; their
    # text form in the CSV depends on the NumPy version -- confirm the
    # CSV -> ast.literal_eval round trip still works after upgrades.
    decmaps = [list(row) for row in mat2019['Decmaps'][0][i]]
    decpics = [list(row) for row in mat2019['Decpics'][0][i]]
    dectimes = [list(row) for row in mat2019['Dectimes'][0][i]]
    expltimes = list(mat2019['Expltimes'][0][i][0])
    explpics = list(mat2019['Explpics'][0][i][0])
    payoffs = [list(row) for row in mat2019['Payoffs'][0][i]]

    # One row per entry of Decmaps, numbered from 1. Deliberately not named
    # `round`: that would replace the builtin for every later cell.
    rounds = list(range(1, len(decmaps) + 1))

    dectime_interval = [compute_interval_time(arr) for arr in dectimes]

    df = pd.DataFrame({
        'ID': [ID] * len(rounds),
        'Group': [Group] * len(rounds),
        'round': rounds,
        'Decmaps': decmaps,
        'Decpics': decpics,
        # The same Expltimes/Explpics record is used for every row of this ID.
        'Expltime': compute_expl_time(expltimes, explpics, decmaps),
        'Dectime_interval': dectime_interval,
        'Dectimes': dectimes,
        'Payoffs': payoffs,
    })
    dfs.append(df)

In [6]:
# Stack the per-ID frames into one table ordered by ID and round, with a
# fresh 0..N-1 index, then display it.
full_df = (
    pd.concat(dfs, ignore_index=True)
    .sort_values(by=['ID', 'round'])
    .reset_index(drop=True)
)
full_df


Unnamed: 0,ID,Group,round,Decmaps,Decpics,Expltime,Dectime_interval,Dectimes,Payoffs
0,201901,Lc,1,"[262, 327, 310, 257, 145, 359, 303, 312, 206, ...","[269, 257, 310, 145]","[0, 14.32499999999709, 0, 0, 1.523000000001047...","[60.13799999999901, 20.718000000000757, 4.7610...","[56215.577, 56275.715, 56296.433, 56301.194, 5...","[2, 2, 4, 5]"
1,201901,Lc,2,"[74, 248, 240, 36, 59, 217, 52, 132, 75, 2, 46...","[52, 46, 217, 59]","[0, 0, 0, 0, 0, 12.415000000000873, 0, 0, 17.6...","[13.49500000000262, 0.8909999999959837, 3.4740...","[56331.835, 56345.33, 56346.221, 56349.695, 56...","[4, 4, 6, 4]"
2,201901,Lc,3,"[82, 60, 32, 131, 50, 241, 8, 87, 240, 140, 22...","[131, 140, 129, 119]","[75.79699999999139, 0, 8.709999999999127, 2.61...","[10.364999999997963, 1.2640000000028522, 3.502...","[56549.057, 56559.422, 56560.686, 56564.189, 5...","[2, 2, 2, 2]"
3,201901,Lc,4,"[190, 178, 283, 325, 214, 314, 189, 216, 277, ...","[151, 168, 178, 153]","[0, 0, 0, 0, 15.001999999993131, 0, 0, 0, 0.85...","[73.02000000000407, 1.532999999995809, 1.97699...","[56572.971, 56645.991, 56647.524, 56649.501, 5...","[5, 5, 5, 5]"
4,201901,Lc,5,"[183, 78, 328, 350, 63, 286, 39, 108, 356, 216...","[181, 192, 216, 183]","[0, 0, 0, 0, 13.172000000013213, 0, 0, 0, 0, 0...","[22.51299999999901, 1.5459999999948195, 1.9120...","[56859.999, 56882.512, 56884.058, 56885.97, 56...","[6, 6, 6, 6]"
...,...,...,...,...,...,...,...,...,...
259,201911,H,22,"[383, 446, 388, 407, 417, 1, 530, 437, 34, 451...","[451, 443, 437, 446]","[0, 0, 0, 7.8400000000037835, 0, 0, 0, 1.83299...","[30.20300000000134, 0.41199999999662396, 1.841...","[59614.432, 59644.635, 59645.047, 59646.888, 5...","[2, 2, 2, 2]"
260,201911,H,23,"[457, 439, 57, 506, 454, 46, 474, 470, 56, 438...","[416, 506, 410, 426]","[5.364999999997963, 0, 0, 0, 0, 30.13900000001...","[23.30000000000291, 9.338999999999942, 11.8810...","[59863.314, 59886.614, 59895.953, 59907.834, 5...","[5, 6, 5, 5]"
261,201911,H,24,"[272, 94, 32, 104, 277, 377, 10, 223, 220, 378...","[240, 220, 223, 236]","[2.673000000002503, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[16.526000000005297, 8.396999999997206, 3.0230...","[59920.791, 59937.317, 59945.714, 59948.737, 5...","[6, 6, 6, 6]"
262,201911,H,25,"[380, 269, 394, 259, 88, 484, 487, 516, 383, 1...","[383, 380, 394, 381]","[0, 0, 0, 29.403999999994994, 0, 0, 0, 0, 0, 0...","[37.58499999999913, 4.195000000006985, 1.74499...","[60172.791, 60210.376, 60214.571, 60216.316, 6...","[4, 4, 4, 4]"


In [7]:
# Write the combined per-round table to CSV without the row index.
full_df.to_csv('2019_data.csv', index=False)

In [8]:
import ast
def str_to_list(x):
    """Parse a string as a Python literal; pass any other value through.

    Parameters
    ----------
    x : object
        A string holding a Python literal (e.g. ``"[1, 2, 3]"``) or any
        non-string value.

    Returns
    -------
    object
        ``ast.literal_eval(x)`` for strings, otherwise ``x`` unchanged.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the string is not a valid
        literal.
    """
    return ast.literal_eval(x) if isinstance(x, str) else x
# Reload the saved table, turning every list-valued column back from its text
# form into a Python list. Converter keys must match the CSV header exactly;
# the interval column is 'Dectime_interval' (no 's' after 'Dectime'), and a
# key that matches no column leaves that column as raw strings.
list_columns = [
    'Decmaps',
    'Decpics',
    'Expltime',
    'Dectime_interval',
    'Dectimes',
    'Payoffs',
]
full_df = pd.read_csv(
    '2019_data.csv',
    converters={col: str_to_list for col in list_columns},
)
full_df

Unnamed: 0,ID,Group,round,Decmaps,Decpics,Expltime,Dectime_interval,Dectimes,Payoffs
0,201901,Lc,1,"[262, 327, 310, 257, 145, 359, 303, 312, 206, ...","[269, 257, 310, 145]","[0, 14.32499999999709, 0, 0, 1.523000000001047...","[60.13799999999901, 20.718000000000757, 4.7610...","[56215.577, 56275.715, 56296.433, 56301.194, 5...","[2, 2, 4, 5]"
1,201901,Lc,2,"[74, 248, 240, 36, 59, 217, 52, 132, 75, 2, 46...","[52, 46, 217, 59]","[0, 0, 0, 0, 0, 12.415000000000873, 0, 0, 17.6...","[13.49500000000262, 0.8909999999959837, 3.4740...","[56331.835, 56345.33, 56346.221, 56349.695, 56...","[4, 4, 6, 4]"
2,201901,Lc,3,"[82, 60, 32, 131, 50, 241, 8, 87, 240, 140, 22...","[131, 140, 129, 119]","[75.79699999999139, 0, 8.709999999999127, 2.61...","[10.364999999997963, 1.2640000000028522, 3.502...","[56549.057, 56559.422, 56560.686, 56564.189, 5...","[2, 2, 2, 2]"
3,201901,Lc,4,"[190, 178, 283, 325, 214, 314, 189, 216, 277, ...","[151, 168, 178, 153]","[0, 0, 0, 0, 15.001999999993131, 0, 0, 0, 0.85...","[73.02000000000407, 1.532999999995809, 1.97699...","[56572.971, 56645.991, 56647.524, 56649.501, 5...","[5, 5, 5, 5]"
4,201901,Lc,5,"[183, 78, 328, 350, 63, 286, 39, 108, 356, 216...","[181, 192, 216, 183]","[0, 0, 0, 0, 13.172000000013213, 0, 0, 0, 0, 0...","[22.51299999999901, 1.5459999999948195, 1.9120...","[56859.999, 56882.512, 56884.058, 56885.97, 56...","[6, 6, 6, 6]"
...,...,...,...,...,...,...,...,...,...
259,201911,H,22,"[383, 446, 388, 407, 417, 1, 530, 437, 34, 451...","[451, 443, 437, 446]","[0, 0, 0, 7.8400000000037835, 0, 0, 0, 1.83299...","[30.20300000000134, 0.41199999999662396, 1.841...","[59614.432, 59644.635, 59645.047, 59646.888, 5...","[2, 2, 2, 2]"
260,201911,H,23,"[457, 439, 57, 506, 454, 46, 474, 470, 56, 438...","[416, 506, 410, 426]","[5.364999999997963, 0, 0, 0, 0, 30.13900000001...","[23.30000000000291, 9.338999999999942, 11.8810...","[59863.314, 59886.614, 59895.953, 59907.834, 5...","[5, 6, 5, 5]"
261,201911,H,24,"[272, 94, 32, 104, 277, 377, 10, 223, 220, 378...","[240, 220, 223, 236]","[2.673000000002503, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[16.526000000005297, 8.396999999997206, 3.0230...","[59920.791, 59937.317, 59945.714, 59948.737, 5...","[6, 6, 6, 6]"
262,201911,H,25,"[380, 269, 394, 259, 88, 484, 487, 516, 383, 1...","[383, 380, 394, 381]","[0, 0, 0, 29.403999999994994, 0, 0, 0, 0, 0, 0...","[37.58499999999913, 4.195000000006985, 1.74499...","[60172.791, 60210.376, 60214.571, 60216.316, 6...","[4, 4, 4, 4]"


In [9]:
def compute_selected_times_avgpayoff(df, n=5):
    """Add per-picture selection history from the preceding rounds of each ID.

    For every row, only the up-to-``n`` rows that immediately precede it
    within the same ``ID`` (after sorting by ``ID`` then ``round``) are
    considered. The current row itself is never included.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ID``, ``round``, ``Decmaps``, ``Decpics`` and
        ``Payoffs``. The last three hold one iterable per row; ``Decpics`` and
        ``Payoffs`` are paired element by element.
    n : int, default 5
        Size of the look-back window, in rows.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df`` (the input is not modified) with two extra
        columns, each holding one list per row aligned with that row's
        ``Decmaps``:

        * ``prev_n_selected_times`` -- how many times each picture appears in
          ``Decpics`` inside the window.
        * ``prev_n_avg_payoff`` -- the mean of the payoffs paired with that
          picture inside the window, or 0.0 when it never appears.
    """
    df = df.sort_values(["ID", "round"]).copy()

    selected_times = []
    avg_payoffs = []

    # sort=False keeps the groups in the order of the sorted frame, so the
    # collected lists line up with its rows.
    for _, rows_for_id in df.groupby("ID", sort=False):
        shown_per_round = rows_for_id["Decmaps"].tolist()
        picked_per_round = rows_for_id["Decpics"].tolist()
        paid_per_round = rows_for_id["Payoffs"].tolist()

        for position in range(len(rows_for_id)):
            times_chosen = {}
            payoff_total = {}

            # Tally picks and payoffs over the window [position - n, position).
            for past in range(max(0, position - n), position):
                past_picks = list(picked_per_round[past])
                past_pays = list(paid_per_round[past])
                for pic, pay in zip(past_picks, past_pays):
                    times_chosen[pic] = times_chosen.get(pic, 0) + 1
                    payoff_total[pic] = payoff_total.get(pic, 0.0) + float(pay)

            shown_now = list(shown_per_round[position])
            selected_times.append(
                [times_chosen.get(pic, 0) for pic in shown_now]
            )
            avg_payoffs.append([
                payoff_total[pic] / times_chosen[pic] if pic in times_chosen else 0.0
                for pic in shown_now
            ])

    df["prev_n_selected_times"] = selected_times
    df["prev_n_avg_payoff"] = avg_payoffs

    return df


# Add the selection-history columns using a look-back window of the 10
# preceding rounds per ID, then preview the first rows.
df_new = compute_selected_times_avgpayoff(full_df, n=10)
df_new.head()

Unnamed: 0,ID,Group,round,Decmaps,Decpics,Expltime,Dectime_interval,Dectimes,Payoffs,prev_n_selected_times,prev_n_avg_payoff
0,201901,Lc,1,"[262, 327, 310, 257, 145, 359, 303, 312, 206, ...","[269, 257, 310, 145]","[0, 14.32499999999709, 0, 0, 1.523000000001047...","[60.13799999999901, 20.718000000000757, 4.7610...","[56215.577, 56275.715, 56296.433, 56301.194, 5...","[2, 2, 4, 5]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,201901,Lc,2,"[74, 248, 240, 36, 59, 217, 52, 132, 75, 2, 46...","[52, 46, 217, 59]","[0, 0, 0, 0, 0, 12.415000000000873, 0, 0, 17.6...","[13.49500000000262, 0.8909999999959837, 3.4740...","[56331.835, 56345.33, 56346.221, 56349.695, 56...","[4, 4, 6, 4]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,201901,Lc,3,"[82, 60, 32, 131, 50, 241, 8, 87, 240, 140, 22...","[131, 140, 129, 119]","[75.79699999999139, 0, 8.709999999999127, 2.61...","[10.364999999997963, 1.2640000000028522, 3.502...","[56549.057, 56559.422, 56560.686, 56564.189, 5...","[2, 2, 2, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,201901,Lc,4,"[190, 178, 283, 325, 214, 314, 189, 216, 277, ...","[151, 168, 178, 153]","[0, 0, 0, 0, 15.001999999993131, 0, 0, 0, 0.85...","[73.02000000000407, 1.532999999995809, 1.97699...","[56572.971, 56645.991, 56647.524, 56649.501, 5...","[5, 5, 5, 5]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,201901,Lc,5,"[183, 78, 328, 350, 63, 286, 39, 108, 356, 216...","[181, 192, 216, 183]","[0, 0, 0, 0, 13.172000000013213, 0, 0, 0, 0, 0...","[22.51299999999901, 1.5459999999948195, 1.9120...","[56859.999, 56882.512, 56884.058, 56885.97, 56...","[6, 6, 6, 6]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [10]:
# Save the feature table in two formats: CSV (without the row index) and
# pickle.
df_new.to_csv('2019_data_with_features.csv', index=False)
df_new.to_pickle('2019_data_with_features.pkl')