In [2]:
#import necessary packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import tree

from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import time

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
trlabels = pd.read_csv("train_labels.csv")

sample = pd.read_csv("sample_submission.csv")

In [4]:
trlabels['session'] = trlabels.session_id.apply(lambda s: int(s.split('_')[0]))
trlabels['question'] = trlabels.session_id.apply(lambda s: int(s.split('_')[1][1:]))

In [5]:
#split train into 3 groups based on their levels
df_0_4 = train[train['level_group'] == '0-4']
df_5_12 = train[train['level_group'] == '5-12']
df_13_22 = train[train['level_group'] == '13-22']

In [6]:
def delt_time_def(df):
    df.sort_values(by=['session_id', 'elapsed_time'], inplace=True)
    df['delt_time'] = df['elapsed_time'].diff(1) # 对于同一个玩家的触发两个相邻的事件时的时间差
    df['delt_time'].fillna(0, inplace=True)
    df['delt_time'].clip(0, 103000, inplace=True) # 为什么是103000 s = 1716 min = 28 h
    return df

In [7]:
def feature_engineer(train):
    # 固定限定ID数据
    FIXED_TEXT = ['event_name', 'fqid', 'room_fqid', 'text_fqid', 'page']
    # 间隔时间和悬停时间
    NUMS = ['delt_time', 'hover_duration']
    # 点击事件
    EV_NAME = ['checkpoint','observation_click', 'cutscene_click', 'notification_click', 'person_click',
               'object_click', 'map_click', 'object_hover']    
    # 一个空的DataFeame,最开始只有用户的session_id(去重后的用户会话)作为行索引,因此每一行表示这个用户的一些信息
    new_train = pd.DataFrame(index=train['session_id'].unique(), columns=[])
    
    for c in EV_NAME:
        # 统计每一个用户会话某一个点击事件的开始点击次数
        new_train['l_ev_name_' + c] = train[ train['event_name'] == c ].groupby(['session_id'])['index'].count()
        # 统计每一个用户会话某一个点击事件的相邻点击间隔时间的和
        new_train['t_ev_name_' + c] = train[ train['event_name'] == c ].groupby(['session_id'])['delt_time'].sum()
    
    maska = train['name'] == 'basic' # 事件名称,获得关于base事件的布尔列表
    
    # 统计每一个用户最后一个发生的用户会话的时间（从开始到最后一个事件）
    new_train['finish'] = train[maska].groupby(['session_id'])['elapsed_time'].last(1)
    # 统计每个用户在游戏时点击会话事件的总次数
    new_train['len'] = train[maska].groupby(['session_id'])['index'].count()
    
    # 在限定ID数据上的统计，统计每一个用户在限定ID C上的种类数量
    for c in FIXED_TEXT:
        tmp = train[maska].groupby(['session_id'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique' 
        new_train = new_train.join(tmp)  # 对DataFrame进行拼接新的一列横着拼接新的列，可以一次性拼接多个
    
    # 每一个用户在某一个会话任务的间隔会话任务的平均时间和悬停发生时间的平均时间
    for c in NUMS:
        tmp = train[maska].groupby(['session_id'])[c].agg('mean')
        new_train = new_train.join(tmp)
    # 为什么使用标准差和平均值来统计时间
    # 每一个用户的某一个会话任务的间隔会话任务的标准差和悬停发生时间的标准差
    for c in NUMS:
        tmp = train[maska].groupby(['session_id'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        new_train = new_train.join(tmp)
    
    new_train = new_train.fillna(-1) # 填充缺失值-1
    return new_train

In [10]:
kol_lvl = (df_0_4.groupby(['session_id'])['level'].agg('nunique') < 5) 
# 统计每组的level中的值的种类数量，将种类数小于5的组标记为True，也就是只要level数小于5说明这一阶段的都没有通关
# print(kol_lvl.shape)
# print(kol_lvl)
list_session = kol_lvl[kol_lvl].index  
# 又因为kol_lvl的行索引就是session_id，因此直接将筛选的布尔矩阵的放进kol_lvl就可以，选择正确的id
# print(kol_lvl[kol_lvl])
df_0_4 = df_0_4[~ df_0_4['session_id'].isin(list_session) ] 
# session_id的值不在这个list_session就为False，否则为True，然后取反：获得符合要求的id


df_0_4 = delt_time_def(df_0_4)
train1 = feature_engineer(df_0_4)

print(f"train1 shape : {train1.shape}")

kol_lvl = (df_5_12.groupby(['session_id'])['level'].agg('nunique') < 8) # 小于8个的id没有通关，不要
list_session = kol_lvl[kol_lvl].index  # index返回的是不符合要求的session_id，因为行索引就是session_id

df_5_12 = df_5_12[~df_5_12['session_id'].isin(list_session)] # 如果某一列是不正确的id，则取反变成False不选择这一行

df_5_12 = delt_time_def(df_5_12)
train2 = feature_engineer(df_5_12)

print(f"train2 shape : {train2.shape}")

kol_lvl = (df_13_22.groupby(['session_id'])['level'].agg('nunique') < 10) # 小于10个的id没有通关，不要
list_session = kol_lvl[kol_lvl].index
df_13_22 = df_13_22[~ df_13_22['session_id'].isin(list_session) ]

df_13_22 = delt_time_def(df_13_22)
train3 = feature_engineer(df_13_22)

print(f"train3 shape : {train3.shape}")

train1 shape : (23562, 27)
train2 shape : (23561, 27)
train3 shape : (22986, 27)


In [8]:
train1.describe()

NameError: name 'train1' is not defined

In [24]:
# t2 = train2[['l_ev_name_checkpoint', 'l_ev_name_observation_click']]
# t3 = train3[['l_ev_name_checkpoint', 'l_ev_name_observation_click']]


KeyError: "None of [Index(['event_name', 'name', 'text', 'fqid', 'room_fqid', 'text_fqid'], dtype='object')] are in the [columns]"

In [15]:
FEATURES1 = [c for c in t1.columns if c not in ['level_group']] # 由于这里只有NUM类型的数据因此每个level的
FEATURES2 = [c for c in t2.columns if c not in ['level_group']]
FEATURES3 = [c for c in t3.columns if c not in ['level_group']]
print('We will train with', len(FEATURES1), len(FEATURES2), len(FEATURES3) ,'features')
ALL_USERS = train1.index.unique()
print('We will train with', len(ALL_USERS) ,'users info')

We will train with 2 2 2 features
We will train with 23562 users info


In [16]:
xgb_params = {
    'objective' : 'binary:logistic',
    'eval_metric':'logloss',
    'learning_rate': 0.05,
    'max_depth': 4,
    'n_estimators': 1000,
    'early_stopping_rounds': 50,
    'tree_method':'hist',
    'subsample':0.8,
    'colsample_bytree': 0.4,
    'use_label_encoder' : False}
models = {}

In [21]:
for t in range(1,19):
    # 分别训练不同问题的分类器
    if t <= 3:
        grp = '0-4'
        df = train1
        FEATURES = FEATURES1
        
    elif t <= 13:
        grp = '5-12'
        df = train2
        FEATURES = FEATURES2
    
    elif t <= 22:
        grp = '13-22'
        df = train3
        FEATURES = FEATURES3
    
    # TRAIN DATA
    train_x = df
#     print(train_x.shape, end=',')
    train_users = train_x.index.values # 返回train数据集中存在的session_id的列表
#     print(train_users, end)
    train_y = trlabels.loc[trlabels['question'] == t].set_index('session').loc[train_users] # 选择第t个问题的标签且id在训练数据中的
    train_y = train_y['correct']
#     print(train_y.shape)
    
    X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2)
    clf = LinearRegression()
    clf.fit(X_train[FEATURES].astype('float32'), y_train)
    print(f'quiz{t}, train data shape is {X_train[FEATURES].shape}, test data shape is {y_train.shape}')
    print("train data F1 score is ", f1_score(y_train, clf.predict(X_train)))
    print(f'test data F1 score is ', f1_score(y_test, clf.predict(X_test)))
    # SAVE MODEL, PREDICT VALID OOF
    models[f'{grp}_{t}'] = clf

quiz1, train data shape is (18849, 2), test data shape is (18849,)


Feature names unseen at fit time:
- delt_time
- delt_time_std
- event_name_nunique
- finish
- fqid_nunique
- ...
Feature names must be in the same order as they were in fit.



ValueError: X has 27 features, but LinearRegression is expecting 2 features as input.

In [None]:
print(trlabels['session_id'].values)
print(trlabels['Question'].values)

# print(trlabels['IDQ'].iloc[i].index('_'))
#separate session ids into question groups (1-18)
#merge train w/ train_labels to get correct column

In [None]:
#merge trlabels and train to get Y (correct) and then predict 

In [None]:
group1 = trlabels.groupby('Question').get_group('q1')

IDs = trlabels.groupby('Question').get_group('q1')['session_id']

In [None]:
g1 = train[~train['session_id'].isin(IDs)]  

In [None]:
t = g1.groupby('level_group').get_group('0-4')
# print(len(g1['session_id']))

# print(len(group1))

In [None]:
#create train and test data 
X = g1
Y = group1
print(len(X))
print(len(Y))
# train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.4, random_state = 213)

In [None]:
#construct models
#print(X)
reg = LinearRegression().fit(train_x, (train_y))


In [None]:
#train model on test 
Xtest = test[['session_id', 'music']]

print(np.unique(reg.predict(Xtest)))

In [None]:
#dummy variables 