In [1]:
import pandas as pd

df = pd.read_csv('selected-ann.csv')

import sklearn
from sklearn.model_selection import StratifiedKFold

# Columns: user_id is the unique identifier, content is the text, score is the rating.
# Build a 5-fold cross-validation split stratified by score.
# A fixed random_state (42) keeps the folds identical on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_dfs = []
test_dfs = []
for train_index, test_index in skf.split(df['content'], df['score']):
    # split() yields positional row indices, so select rows with .iloc
    # rather than the label-based .loc.
    train_df = df.iloc[train_index]
    test_df = df.iloc[test_index]
    train_dfs.append(train_df)
    test_dfs.append(test_df)

In [2]:
# 提取文本浅层特征，包括中文的词频、词性、句法依存关系，构建线性回归模型，计算5折交叉验证的平均QWK
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
import numpy as np
import jieba
import pyltp
import jieba.posseg
import jieba.analyse
import spacy

# spaCy pipeline 'zh_core_web_sm'; get_tree_depth below calls it to parse each text.
model = spacy.load('zh_core_web_sm')

def get_tree_depth(text):
    """Return the mean dependency-tree depth over the sentences of ``text``.

    The text is parsed with the module-level ``model``. For each sentence the
    depth is the largest number of head links between any non-root token and
    the sentence root. Returns 0.0 when the parse yields no sentences.
    """
    doc = model(text)

    depths = []
    for sent in doc.sents:
        root = sent.root

        depth = 0
        for token in sent:
            if token.dep_ == 'ROOT':
                continue
            # Walk up the head chain, counting links until the sentence root.
            path_len = 1
            current_token = token
            while current_token.head != root:
                parent = current_token.head
                if parent == current_token:
                    # Reached a self-headed token that is not sent.root;
                    # stop here instead of looping forever.
                    break
                path_len += 1
                current_token = parent
            depth = max(depth, path_len)

        depths.append(depth)

    # An empty text produces no sentences; avoid dividing by zero.
    if not depths:
        return 0.0
    return sum(depths) / len(depths)

def getSimpleAttrs(text):
    """Return a list of ten shallow features for ``text``.

    Order: character count, sentence count, token count, average characters
    per sentence, vocabulary size, distinct adjective / verb / noun /
    conjunction counts (jieba POS tags starting with 'a' / 'v' / 'n' / 'c'),
    and the mean dependency-tree depth from get_tree_depth.
    """
    # Segment and POS-tag once; several features below reuse these results.
    words = jieba.lcut(text)
    tagged = [(w, t) for w, t in jieba.posseg.cut(text)]
    sentences = pyltp.SentenceSplitter.split(text)

    def distinct_with_tag(prefix):
        # Number of distinct words whose POS tag starts with ``prefix``.
        return len({w for w, t in tagged if t.startswith(prefix)})

    char_count = len(text)
    sentence_count = len(sentences)
    # Guard against texts for which the splitter finds no sentence.
    chars_per_sentence = char_count / sentence_count if sentence_count else 0.0

    attr = [
        char_count,
        sentence_count,
        len(words),
        chars_per_sentence,
        len(set(words)),
        distinct_with_tag('a'),
        distinct_with_tag('v'),
        distinct_with_tag('n'),
        distinct_with_tag('c'),
        get_tree_depth(text),
    ]
    return attr

# Per-fold feature matrices, keyed by fold number.
train_attrs = {}
test_attrs = {}

# The same text shows up in more than one fold's train/test frame, so cache
# each feature vector by text instead of re-running the extraction every time.
_attr_cache = {}


def _cached_attrs(text):
    """Return getSimpleAttrs(text), computing it only once per distinct text."""
    if text not in _attr_cache:
        _attr_cache[text] = getSimpleAttrs(text)
    return _attr_cache[text]


for i in range(5):
    train_df = train_dfs[i]
    test_df = test_dfs[i]
    # Print the fold number as a progress indicator.
    print(i)
    train_attrs[i] = np.array([_cached_attrs(text) for text in train_df['content']])
    test_attrs[i] = np.array([_cached_attrs(text) for text in test_df['content']])

# Fit one linear regression per fold, then report a single quadratic-weighted
# kappa (QWK) computed over the pooled out-of-fold predictions.

preds = []
trues = []
for i in range(5):
    train_df = train_dfs[i]
    test_df = test_dfs[i]
    # Fit the linear regression on this fold's training features.
    lr = LinearRegression()
    lr.fit(train_attrs[i], train_df['score'])
    # Report the fitted parameters and the training R2.
    print('Model', i, ":")
    print('Coefficients: \n', lr.coef_)
    print('Intercept: \n', lr.intercept_)
    print('R2: \n', lr.score(train_attrs[i], train_df['score']))
    # Round predictions to the nearest integer score in one vectorised call,
    # so the fold index ``i`` is not clobbered by an inner loop variable.
    pred = np.rint(lr.predict(test_attrs[i]))
    preds.extend(pred)
    trues.extend(test_df['score'])

print('QWK: \n', cohen_kappa_score(trues, preds, weights='quadratic'))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


0


Loading model cost 0.493 seconds.
Prefix dict has been built successfully.


1
2
3
4
Model 0 :
Coefficients: 
 [-0.00418627  0.03662068  0.00822377 -0.0047862  -0.00241734  0.06641793
  0.00490553  0.00240476  0.01802271  0.47960648]
Intercept: 
 -0.17029244261762155
R2: 
 0.22021820512983203
Model 1 :
Coefficients: 
 [-0.00568805  0.07677755  0.00728801 -0.00208812 -0.00767011  0.10928824
  0.02314468  0.00567956  0.04501041  0.45717305]
Intercept: 
 -0.22246298982946744
R2: 
 0.2475516293496881
Model 2 :
Coefficients: 
 [-0.00389477  0.03293367  0.00560601 -0.00387343 -0.00951521  0.07461051
  0.02479802  0.0274025   0.04138507  0.42715234]
Intercept: 
 -0.03083086550459635
R2: 
 0.2200876693838547
Model 3 :
Coefficients: 
 [-0.00640468  0.03298682  0.01077683 -0.0116658  -0.0071925   0.08610526
  0.01767583  0.01092979  0.00279208  0.55099001]
Intercept: 
 -0.0950970195552232
R2: 
 0.21530956286798952
Model 4 :
Coefficients: 
 [-3.95608832e-03  3.31872115e-02  6.61424390e-03 -4.24218216e-03
 -3.36515267e-04  6.48108493e-02  9.43391400e-03 -2.42474252e-04
  5

In [7]:
# Fit one linear-kernel SVR per fold, then report a single quadratic-weighted
# kappa (QWK) computed over the pooled out-of-fold predictions.
from sklearn.svm import SVR
preds = []
trues = []
for i in range(5):
    train_df = train_dfs[i]
    test_df = test_dfs[i]
    # Fit the SVR on this fold's training features.
    svr = SVR(kernel='linear')
    svr.fit(train_attrs[i], train_df['score'])
    # Report the fitted parameters and the training R2.
    print('Model', i, ":")
    print('Coefficients: \n', svr.coef_)
    print('Intercept: \n', svr.intercept_)
    print('R2: \n', svr.score(train_attrs[i], train_df['score']))
    # Round predictions to the nearest integer score in one vectorised call,
    # so the fold index ``i`` is not clobbered by an inner loop variable.
    pred = np.rint(svr.predict(test_attrs[i]))
    preds.extend(pred)
    trues.extend(test_df['score'])

print('QWK: \n', cohen_kappa_score(trues, preds, weights='quadratic'))

Model 0 :
Coefficients: 
 [[-0.00187691  0.04672855  0.00485837 -0.00306826 -0.00343717  0.13032148
   0.00509994 -0.00054437  0.03965667  0.52519932]]
Intercept: 
 [-0.70913894]
R2: 
 0.1911853691011408
Model 1 :
Coefficients: 
 [[-2.27272290e-03  8.27597996e-02 -3.69417522e-05 -1.29280294e-03
  -6.04725739e-03  1.51694626e-01  3.31831571e-02  1.24801756e-02
   7.70094979e-02  4.56168505e-01]]
Intercept: 
 [-0.72689785]
R2: 
 0.20699810738568758
Model 2 :
Coefficients: 
 [[-0.00087556  0.05525834  0.00522616 -0.00305818 -0.01890633  0.13880422
   0.02875635  0.02897431  0.05323417  0.45137663]]
Intercept: 
 [-0.5976344]
R2: 
 0.1675527098515306
Model 3 :
Coefficients: 
 [[ 2.65353869e-03  3.74046346e-02  1.98910191e-04 -1.71383572e-02
  -7.84118490e-03  1.27059049e-01  7.24438884e-03  5.33375112e-03
   2.60015197e-02  6.71344186e-01]]
Intercept: 
 [-0.6741221]
R2: 
 0.17322781756978034
Model 4 :
Coefficients: 
 [[-0.00139066  0.03650044  0.00098052 -0.00312666  0.00074631  0.13237737


In [6]:
import pandas as pd

import os
import json

# Gather one [score, content] record per matching file, then build the frame
# in a single step instead of growing it row by row.
records = []
for filename in os.listdir('result/enhanced'):
    # The file stem has the form "<index>-<time>".
    stem_parts = filename.split(".")[0].split("-")
    index = int(stem_parts[0])
    time = int(stem_parts[1])
    # Only files whose time field is 1 are kept; skip the rest before parsing.
    if time != 1:
        continue
    filePath = "./result/enhanced/" + filename
    # Use a context manager so the file handle is closed promptly.
    with open(filePath, encoding='utf8') as fh:
        fileObj = json.load(fh)
    # The score is the index with its last two digits dropped.
    records.append([index // 100, fileObj["response"].replace("\n", "")])

df = pd.DataFrame(records, columns=["score", "content"])


# 提取文本浅层特征，包括中文的词频、词性、句法依存关系，构建线性回归模型，计算5折交叉验证的平均QWK
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
import numpy as np
import jieba
import pyltp
import jieba.posseg
import jieba.analyse
import spacy

# spaCy pipeline 'zh_core_web_sm'; get_tree_depth below calls it to parse each text.
model = spacy.load('zh_core_web_sm')

def get_tree_depth(text):
    """Return the mean dependency-tree depth over the sentences of ``text``.

    The text is parsed with the module-level ``model``. For each sentence the
    depth is the largest number of head links between any non-root token and
    the sentence root. Returns 0.0 when the parse yields no sentences.
    """
    doc = model(text)

    depths = []
    for sent in doc.sents:
        root = sent.root

        depth = 0
        for token in sent:
            if token.dep_ == 'ROOT':
                continue
            # Walk up the head chain, counting links until the sentence root.
            path_len = 1
            current_token = token
            while current_token.head != root:
                parent = current_token.head
                if parent == current_token:
                    # Reached a self-headed token that is not sent.root;
                    # stop here instead of looping forever.
                    break
                path_len += 1
                current_token = parent
            depth = max(depth, path_len)

        depths.append(depth)

    # An empty text produces no sentences; avoid dividing by zero.
    if not depths:
        return 0.0
    return sum(depths) / len(depths)

def _print_group_stats(values, scores, levels=range(6)):
    """Print summary statistics of ``values`` grouped by score level.

    Output order: the mean for each level, the overall mean, the standard
    deviation for each level, then the overall standard deviation.
    ``values`` and ``scores`` are Series sharing the same index.
    """
    for level in levels:
        print(np.mean(values[scores == level]))
    print(np.mean(values))
    for level in levels:
        print(np.std(values[scores == level]))
    print(np.std(values))


# Each metric series is computed once and reused for all of its statistics.

# Text length (characters).
_print_group_stats(df['content'].apply(len), df['score'])

print('------------------')

# Sentence count.
_print_group_stats(
    df['content'].apply(lambda x: len(pyltp.SentenceSplitter.split(x))),
    df['score'],
)

print('------------------')

# Mean dependency-tree depth.
_print_group_stats(df['content'].apply(get_tree_depth), df['score'])

498.36
376.8
382.42
375.54
389.53
397.81
403.41
1040.0100337977512
125.13536670342243
107.54005579317878
115.30563039158147
97.94564359888601
109.64804558221728
438.73320507266527
------------------
14.35
14.65
14.69
13.93
14.24
14.42
14.38
4.8401962770119145
4.52852072977479
3.7032283213434196
4.499455522616042
3.4673332692430936
4.190894892502078
4.240157229789165
------------------
10.728784933735882
3.668286685029796
3.6756866172519693
3.6826669607482296
3.727790155412632
3.826507428313001
4.884953796748585
68.67273556840846
0.40768955181966826
0.3622769229858414
0.41793924582055964
0.3391014791990142
0.30606558278956214
28.159140621738246
