In [1]:
import pandas as pd
import copy
import numpy as np
import os
import sys

In [2]:
# Columns that may be written to the generated sequence file; generate_sequences
# walks this list in order when it assembles the final frame.
ALL_KEYS = [
    "fold",
    "uid",
    "questions",
    "concepts",
    "responses",
    "timestamps",
    "usetimes",
    "selectmasks",
    "is_repeat",
    "qidxs",
    "rest",
    "orirow",
    "cidxs",
]
# Columns treated as one value per row: they are copied as-is instead of being
# split on commas, sliced and padded.
ONE_KEYS = [
    "fold",
    "uid",
    "uni_concepts_num",
    "interaction_num",
]


In [38]:
# Load the train_valid.csv file of the nips_task34 dataset.
df = pd.read_csv("../data/nips_task34/train_valid.csv")

In [39]:
# Show which columns the loaded frame contains.
df.columns

Index(['fold', 'uid', 'questions', 'concepts', 'responses', 'timestamps',
       'is_repeat'],
      dtype='object')

In [40]:
def get_int_num(row):
    """Record how many comma-separated responses ``row`` holds.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) whose "responses"
            entry is a comma-separated string.

    Returns:
        The same ``row`` object, with an added "interaction_num" entry equal
        to the number of comma-separated items in "responses".
    """
    response_items = row["responses"].split(",")
    row["interaction_num"] = len(response_items)
    return row

In [41]:
# Apply get_int_num to every row, adding an "interaction_num" column with the
# number of responses in that row.
df2 = df.apply(get_int_num, axis=1)
    

In [42]:
# (rows, columns) of the frame after adding interaction_num.
df2.shape

(3935, 8)

In [43]:
# Inspect the distribution of interaction counts, with 50 as the dividing line.
df2["interaction_num"].describe()

count    3935.000000
mean      285.484879
std       186.534781
min        50.000000
25%       126.000000
50%       242.000000
75%       418.000000
max       839.000000
Name: interaction_num, dtype: float64

In [44]:
# Split the rows at 242 interactions, the median (50%) reported by describe():
# df3 keeps the rows with more than 242 interactions, df4 the remaining ones.
df3 = df2[df2["interaction_num"]>242]
df4= df2[df2["interaction_num"]<=242]

In [45]:
# (rows, columns) of the subset with more than 242 interactions.
df3.shape

(1966, 8)

In [46]:
# Randomly take half of the rows from df3 and half from df4 and combine them
# into df5. A fixed random_state makes the draw reproducible, so a
# "Restart Kernel -> Run All" yields the same subsample every time.
df5 = pd.concat([
    df3.sample(frac=0.5, random_state=1024),
    df4.sample(frac=0.5, random_state=1024),
])

In [47]:
# (rows, columns) of the combined half-sample.
df5.shape

(1967, 8)

In [48]:
def calStatistics(df, stares, key):
    """Summarise interaction, question and concept counts of ``df``.

    Args:
        df: DataFrame with a comma-separated "responses" column and,
            optionally, "selectmasks", "concepts" and "questions" columns in
            the same comma-separated format. Entries inside "concepts" may
            combine several ids with "_".
        stares: List that receives one summary line of the form
            "<key>,<interactions>,<rows>,<selected>".
        key: Label placed at the start of the summary line.

    Returns:
        Tuple ``(interactions, selected, n_questions, n_concepts, n_rows)``
        where the value "-1" is never counted as a response, question or
        concept, and ``selected`` counts the "1" entries of "selectmasks".
    """
    total_interactions = 0
    total_selected = 0
    question_ids = set()
    concept_ids = set()

    for _, record in df.iterrows():
        responses = record["responses"].split(",")
        total_interactions += sum(1 for resp in responses if resp != "-1")

        if "selectmasks" in record:
            masks = record["selectmasks"].split(",")
            total_selected += sum(1 for mask in masks if mask == "1")

        if "concepts" in record:
            # One interaction may carry several concepts joined by "_".
            for concept_field in record["concepts"].split(","):
                concept_ids.update(concept_field.split("_"))

        if "questions" in record:
            question_ids.update(record["questions"].split(","))

    # "-1" is excluded from the distinct-id counts.
    concept_ids.discard("-1")
    question_ids.discard("-1")

    row_count = df.shape[0]
    summary_fields = [key, total_interactions, row_count, total_selected]
    stares.append(",".join(map(str, summary_fields)))
    return total_interactions, total_selected, len(question_ids), len(concept_ids), row_count


In [49]:
# Collect summary lines produced by calStatistics.
stares = []
# NOTE(review): the statistics are computed on df3 (rows with more than 242
# interactions) although the labels below say "train valid" - confirm that
# this is the intended input.
ins, ss, qs, cs, seqnum = calStatistics(df3, stares, 'origin train valid')
print(
        f"train+valid original interactions num: {ins}, select num: {ss}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")

train+valid original interactions num: 862498, select num: 0, qs: 919, cs: 57, seqnum: 1966


# 切分sequences

In [50]:
def KFold_split(df, k=5):
    """Shuffle ``df`` and label every row with a fold index in ``[0, k)``.

    The rows are shuffled with a fixed seed (1024) and then cut into ``k``
    contiguous folds. Fold sizes differ by at most one row: the first
    ``len(df) % k`` folds receive one extra row.

    Args:
        df: DataFrame to split. It is not modified.
        k: Number of folds.

    Returns:
        A shuffled copy of ``df`` whose "fold" column holds the fold index of
        each row (an existing "fold" column is overwritten).

    Raises:
        ZeroDivisionError: If ``k`` is 0.
    """
    shuffled = df.sample(frac=1.0, random_state=1024)
    datanum = shuffled.shape[0]
    # Use integer arithmetic: int(datanum * (1 / k)) can round an exact
    # quotient down (e.g. 49 * (1 / 49) == 0.9999999999999999), which would
    # leave the fold list shorter than the frame.
    base_size, rest = divmod(datanum, k)

    folds = []
    start = 0
    for i in range(k):
        # The first `rest` folds absorb the remainder, one row each.
        end = start + base_size + (1 if i < rest else 0)
        folds.extend([i] * (end - start))
        print(f"fold: {i+1}, start: {start}, end: {end}, total num: {datanum}")
        start = end

    finaldf = shuffled.copy()
    finaldf["fold"] = folds
    return finaldf

In [59]:
# Shuffle df4 and assign each row to one of 5 folds. This rebinds df4, so the
# frame produced by the earlier split cell is replaced by the shuffled copy.
df4 = KFold_split(df4, 5)
# Start a fresh list of summary lines for the statistics collected below.
stares = []

fold: 1, start: 0, end: 394, total num: 1969
fold: 2, start: 394, end: 788, total num: 1969
fold: 3, start: 788, end: 1182, total num: 1969
fold: 4, start: 1182, end: 1576, total num: 1969
fold: 5, start: 1576, end: 1969, total num: 1969


In [60]:
def generate_sequences(df, effective_keys, min_seq_len=3, maxlen=200, pad_val=-1):
    """Cut every row of ``df`` into fixed-length, comma-joined sequences.

    Each row is converted with ``save_dcur`` and its "responses" length decides
    how the row is cut: every full window of ``maxlen`` items becomes one
    output sequence, and the remaining tail becomes one more sequence padded
    with ``pad_val`` up to ``maxlen``. A tail shorter than ``min_seq_len`` is
    discarded and its length is added to the drop counter that is printed at
    the end. Keys listed in ``ONE_KEYS`` are copied unchanged to every
    sequence generated from the row.

    A "selectmasks" column is added: "1" for real positions and
    ``str(pad_val)`` for padded positions.

    Args:
        df: Input DataFrame; must provide every key in ``effective_keys``.
        effective_keys: Iterable of column names to carry over. Must include
            "responses".
        min_seq_len: Minimum tail length that is kept.
        maxlen: Length of every generated sequence.
        pad_val: Value used to pad the tail sequence.

    Returns:
        DataFrame with one row per generated sequence, holding the columns of
        ``ALL_KEYS`` that are in ``effective_keys`` plus "selectmasks". If no
        sequence is produced (empty input or everything dropped), the frame
        has these columns and no rows.
    """
    effective_keys = list(effective_keys)
    save_keys = effective_keys + ["selectmasks"]
    # Create every output column up front so that an input yielding no
    # sequences still produces a well-formed (empty) frame instead of failing
    # with a KeyError when the final frame is assembled.
    dres = {key: [] for key in save_keys}
    dropnum = 0
    pad_str = str(pad_val)

    for _, row in df.iterrows():
        dcur = save_dcur(row, effective_keys)
        lenrs = len(dcur["responses"])

        # Emit all complete windows of maxlen items.
        j = 0
        while lenrs >= j + maxlen:
            for key in effective_keys:
                if key not in ONE_KEYS:
                    dres[key].append(",".join(dcur[key][j: j + maxlen]))
                else:
                    dres[key].append(dcur[key])
            dres["selectmasks"].append(",".join(["1"] * maxlen))
            j += maxlen

        # Whatever is left after the full windows.
        rest = lenrs - j
        if rest < min_seq_len:  # drop tails shorter than min_seq_len
            dropnum += rest
            continue

        # Pad the tail to maxlen; items are already strings, so the padding is
        # appended as strings as well.
        pad_dim = maxlen - rest
        padding = [pad_str] * pad_dim
        for key in effective_keys:
            if key not in ONE_KEYS:
                dres[key].append(",".join(list(dcur[key][j:]) + padding))
            else:
                dres[key].append(dcur[key])
        dres["selectmasks"].append(",".join(["1"] * rest + padding))

    # Keep only known columns, in ALL_KEYS order.
    dfinal = {key: dres[key] for key in ALL_KEYS if key in save_keys}
    finaldf = pd.DataFrame(dfinal)
    print(f"dropnum: {dropnum}")
    return finaldf

def save_dcur(row, effective_keys):
    """Turn one row into a dict of per-key values.

    Args:
        row: Mapping-like row providing every key in ``effective_keys``.
        effective_keys: Iterable of keys to extract.

    Returns:
        Dict mapping each key to the raw row value when the key is listed in
        ``ONE_KEYS``, otherwise to the list of strings obtained by splitting
        the value on ",".
    """
    return {
        key: row[key] if key in ONE_KEYS else row[key].split(",")
        for key in effective_keys
    }

In [61]:
# Take the column list from df4, the frame that is actually being split, so
# the keys and the data cannot drift apart.
effective_keys = df4.keys()
split_seqs = generate_sequences(
    df4, effective_keys=effective_keys)
# Summarise the generated fixed-length sequences and append the summary line
# to stares.
ins, ss, qs, cs, seqnum = calStatistics(
    split_seqs, stares, "train+valid sequences")
print(
    f"train+valid sequences interactions num: {ins}, select num: {ss}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")

dropnum: 31
train+valid sequences interactions num: 260854, select num: 260854, qs: 948, cs: 57, seqnum: 2297


In [62]:
# Write the generated sequences to CSV without the index column.
split_seqs.to_csv("../data/nips_task34/train_valid_sequences_50_left.csv", index=False)