# 读取文件

In [2]:
import pandas as pd
import numpy as np
import json

train_log = pd.read_csv("./data/train_logs.csv")
train_log.head();

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
8405893,fff05981,3615,2063944,2064440,496,Nonproduction,Leftclick,Leftclick,NoChange,1031,240
8405894,fff05981,3616,2064497,2064497,0,Nonproduction,Shift,Shift,NoChange,1031,240
8405895,fff05981,3617,2064657,2064765,108,Replace,q,q,q => q,1031,240
8405896,fff05981,3618,2069186,2069259,73,Nonproduction,Leftclick,Leftclick,NoChange,1028,240
8405897,fff05981,3619,2070065,2070133,68,Input,.,.,.,1029,240


# 数据处理
## 击键数据衡量指标
### 生产率
* 写作中每分钟的字符产出(包含空格)
* 整篇文章写完后平均每分钟的字符产出(包含空格)
### 暂停
暂停一般定义为：IKI = Time2 - Time1 > 2000 milliseconds
* 暂停总数(总共或每分钟)
* 暂停时间比例((暂停时间/总时间)%)
* 暂停长度(通常是文本制作中所有暂停的平均持续时间)
* 在单词内、单词之间、句子之间、段落之间等暂停长度或频率。
### 修正
* 删除次数（总计或每分钟）
* 插入次数（总计或每分钟）
* 删除的长度（以字符为单位）
* 插入的长度（以字符为单位）
* 删除的比例（占总写作时间的百分比）
* 插入的比例（占总写入时间的百分比）
* 产品过程比率（产品中的字符数除以写入过程中产生的字符数）
* 当前点点（即正在制作的文本的当前末尾）的修订次数/长度
* 修改前面已经写好部分的数量/长度
* 立即修改次数（闪烁光标位置与修改点之间的距离为零）
* 远程修改次数（闪烁光标位置与修改点之间的距离大于零）
### 突发事件
**突发是指文本生产中连续生产文本的时期，没有停顿和/或修改。**

P-burst指的是以停顿终止的书面片段，R-burst描述的是以评估、修订或其他语法不连续性终止的片段。
* P-burst 数量（总计或每分钟）
* R-burst 数量（总计或每分钟）
* P-burst 的比例（占总写入时间的百分比）
* R-burst 的比例（占总写入时间的百分比）
* P-burst 的长度（以字符为单位）
* R-burst 的长度（以字符为单位）
### 写作过程中的差异
过程差异关注写作过程相对于时间的动态，从而代表作者的流畅程度在不同阶段可能存在的差异。

过程差异通常是通过首先将整个书写过程划分为一定数量的相等时间间隔（例如 5 或 10），然后计算间隔内产生的字符总数（通常标准化为每分钟的平均字符数）来测量的。或者为了使其在作家之间更具可比性，将其标准化为每个时间间隔产生的字符比例。 每个时间间隔产生的字符的差异也是根据击键日志数据计算的，作为过程差异的指标。

In [7]:
# 生产率计算
pd.set_option('display.min_rows', 50)  # 在DataFrame中显示50行
condition = 'id == "001519c8"';
res = train_log.query(condition);
see = res.tail(100);
see
# print(train_log['activity']);
# 整篇文章完成后的生产率


Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
2457,001519c8,2458,1515376,1515499,123,Input,q,q,q,1301,253
2458,001519c8,2459,1515480,1515576,96,Input,q,q,q,1302,253
2459,001519c8,2460,1515555,1515741,186,Input,q,q,q,1303,253
2460,001519c8,2461,1515689,1515817,128,Input,q,q,q,1304,253
2461,001519c8,2462,1515812,1515932,120,Input,q,q,q,1305,253
2462,001519c8,2463,1516034,1516183,149,Input,Space,Space,,1306,253
2463,001519c8,2464,1516803,1516939,136,Input,q,q,q,1307,254
2464,001519c8,2465,1516858,1516996,138,Input,q,q,q,1308,254
2465,001519c8,2466,1516978,1517076,98,Input,Space,Space,,1309,254
2466,001519c8,2467,1517165,1517269,104,Input,q,q,q,1310,255


# 模拟写作过程

In [8]:
def simulate(row, content):
	activity = row.loc['activity']
	position = row.loc['cursor_position']
	down_event = row.loc['down_event']
	text_change = row.loc['text_change']
	
	if activity == 'Nonproduction':
		pass
	elif activity == 'Input':
		content = content[:position] + text_change + content[position:]
	elif activity == 'Remove/Cut':
		content = content[:position] + content[position + 1:] 
	elif activity == 'Paste':
		content = content[:position - len(text_change)] + text_change + content[position - len(text_change):]
	elif activity == 'Replace':
		replaces = text_change.split(' => ')
		position -= len(replaces[1])
		content = content[:position] + replaces[1] + content[position + len(replaces[0]):]
	elif 'Move From' in activity:
		activity = activity.replace('Move From ','')
		activity = activity.split(' To ')
		origin = json.loads(activity[0])
		target = json.loads(activity[1])
		content = content[:origin[0]] + content[origin[1]:]
		content = content[:target[0]] + text_change + content[target[0]:]

	return content

content = ''
first = train_log.loc[train_log['id'] == '001519c8']
for index, a in first.iterrows():
	content = simulate(row=a, content=content)

print(content)

'''
simulate(row=train_log.loc[2359], content="")
train_log.loc[train_log['activity'] == 'Paste'].head()
i = 2359
print(len(train_log.loc[i, 'text_change']))
print(train_log[i - 5 : i + 5])
print('hello' + train_log.loc[i,'text_change'] + 'world')
condition = ['Move From' in a for a in train_log['activity']]
train_log.loc[condition].head()
'''


qqqqqqqqqq qq qqqqq qq qqqq qqqq.  qqqqq qqq qqqq qqqqqq qq qq qqqqq qq qqqq qqqqq qq qqqqqqqqq qqqqq qqqq qqqqq qqqq qqqqqqqq qqqqqqqqq qqqq.  qqqqqq qqq qqqqq qqqq qqqqqqqqqq q,q qqq qqqqqqqqqq qqqqq qqq qqqqq qqqqqq qq qq qqqqq qqqqqqqqqq qqqqqqq qq qq qqqqqqqqqqq.  qqqqqqqq qq qqqqqqqqqqq qqqq qqq qqqqqqqqq qqq qqqqqqq qq qqqqqq qqqqq qqq qqq q qqqqqqqqq qq qq qqq qqqqq qqqqq
 qq qqq.
qq qq qqqq qqqq qqq qqqqqqqqq qqq qqqqqqq qq qqq qqqqq qqqqq, qq qq  qqqqqqqqq qqq qqqqqqqq qqqqq qq qqq qqqqqqqqqqq qq qqqqqqqqq.  qqqqqqqqq qq qqq qqqqqqqq qqq qqq qqqq qq qqqqqqq qqqqq qqqqq, qqq qqqqqq qqqqq qqqqq qqq qqq qq qqq qqqqqqq qqqqqqq qqqq.  qqqq qqqqq qqqqq qqqq qqqq 'qqqqqqq qqqqqqqqq qqqqq qqqqqqq qqqqqqq qqqqqqqqqq, qqqq qq qqqqqqqqqq qqqqqqq qqq qqqqqqq; qqqqqqq, qqqqq qqqqqqqq qqqqqq qqqqqqq qqqqqqq qqq qqqqq qqq qqq qqq qqqqqqq.  qqqq qqqqqqqqq qqqq qqq qqqq qqqq qq qqqqqqqqqqqqq qqqq qqqqq qqqqq.  qqq qqqqqqqqqq qq qqqqqqqq q qqqqqq, qqqqqqqq qqqq qqqq qqqqqqqqqq, qqq. qq qqqqq q

'\nsimulate(row=train_log.loc[2359], content="")\ntrain_log.loc[train_log[\'activity\'] == \'Paste\'].head()\ni = 2359\nprint(len(train_log.loc[i, \'text_change\']))\nprint(train_log[i - 5 : i + 5])\nprint(\'hello\' + train_log.loc[i,\'text_change\'] + \'world\')\ncondition = [\'Move From\' in a for a in train_log[\'activity\']]\ntrain_log.loc[condition].head()\n'