In [27]:
import pandas as pd
import numpy as np
import string
from math import log

# 从更新后的Excel文件中读取数据
df = pd.read_excel('remove_student_teacher_processed.xlsx')


# 定义文本预处理函数
def preprocess_text(text):
    # 去除标点符号
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    # 转换文本为小写
    text = text.lower()
    # 分割单词
    words = text.split()
    # 去除停用词 "teacher" 和 "student"
    words = [word for word in words if word not in ['teacher', 'student']]
    return " ".join(words)

# 对 'text' 列进行预处理，将每个单词转换为小写并去除停用词
df['processed_text'] = df['text'].apply(preprocess_text)

# 根据 'd' 列对数据进行分组，以便为每个组单独进行计算
grouped = df.groupby('d')

# 定义一个函数来计算每个单词的IDF值
def calculate_idf(word, group):
    # 计算包含该单词的文档数
    documents_containing_word = group['processed_text'].apply(lambda x: word in x).sum()
    # 使用给定的公式计算IDF值
    idf = log(group['response'].values[0] / documents_containing_word)
    return idf

# 创建一个3x3x3的三维数组（初始值为0）
NIDF = []

# 遍历每个组，并计算 'processed_text' 列中每个单词的IDF值和NIDF值
for group in grouped:
    # 获取该组中所有文本的单词集合
    words = set(" ".join(group[1]['processed_text']).split())
    
    # 计算该组中每个单词的IDF值
    idf_values = []
    
    # 创建一个新的DataFrame来存储每个单词的IDF和NIDF值
    idf_df = pd.DataFrame(columns=['Word', 'IDF', 'NIDF'])
    
    for word in words:
        idf = calculate_idf(word, group[1])
        idf_values.append(idf)
        
        # 将单词和IDF值添加到新的DataFrame中
        idf_df = idf_df.append({'Word': word, 'IDF': idf}, ignore_index=True)
    
    # 计算该组中单词的最大最小IDF值，并计算NIDF值
    max_idf = max(idf_values)
    min_idf = min(idf_values)
    idf_df.loc[idf_df['Word'].isin(words), 'NIDF'] = (idf_df['IDF'] - min_idf) / (max_idf - min_idf)

    # 向三维数组的第0个索引位置添加二维数据
    NIDF.append(idf_df)

    
# # 将NIDF数据保存到三维数据结构中（字典）
# nidf_dict = {}
# for d, group in grouped:
#     nidf_dict[d] = idf_df.loc[idf_df['Word'].isin(group['processed_text'].sum().split())][['Word', 'NIDF']]

# # 显示NIDF数据
# for d, nidf_data in nidf_dict.items():
#     print(f"Data with d='{d}':")
#     print(nidf_data)
#     print("--------------------------")

print(NIDF)

[        Word       IDF      NIDF
0     saying  3.688879  1.000000
1       verb  2.302585  0.624196
2      lived  3.688879  1.000000
3       word  2.302585  0.624196
4        day  2.079442  0.563705
..       ...       ...       ...
490  british  3.688879  1.000000
491  runnign  3.688879  1.000000
492   littel  3.688879  1.000000
493   smoked  3.688879  1.000000
494     part  2.995732  0.812098

[495 rows x 3 columns],             Word       IDF      NIDF
0           hows  3.496508  1.000000
1      returning  3.496508  1.000000
2         saying  3.496508  1.000000
3            two  2.803360  0.801760
4           word  2.397895  0.685797
..           ...       ...       ...
397  edinborough  3.496508  1.000000
398         more  1.704748  0.487557
399     normally  3.496508  1.000000
400   examplethe  3.496508  1.000000
401        heard  3.496508  1.000000

[402 rows x 3 columns],       Word       IDF      NIDF
0      the  0.722135  0.174949
1       of  1.358123  0.360154
2    large  3.55

In [64]:
# 获取所有不重复的'd'值
unique_d_values = df['d'].unique()

# 创建一个字典，将不重复的'd'值作为key，key从0开始增加1，value为相应的索引
d_mapping = {d: i for i, d in enumerate(unique_d_values)}

print(d_mapping)  # 打印字典
print(len(d_mapping))  # 打印字典长度，即不重复的'd'值的数量
print(d_mapping[7])  # 打印索引为1的'd'值


{7: 0, 9: 1, 12: 2, 27: 3, 28: 4, 30: 5, 36: 6, 37: 7, 43: 8, 52: 9, 54: 10, 62: 11, 64: 12, 70: 13, 74: 14, 77: 15, 80: 16, 83: 17, 84: 18, 88: 19, 92: 20, 94: 21, 103: 22, 110: 23, 118: 24, 119: 25, 125: 26, 127: 27, 131: 28, 132: 29, 134: 30, 142: 31, 143: 32, 152: 33, 153: 34, 154: 35, 159: 36, 163: 37, 166: 38, 167: 39, 168: 40, 172: 41, 177: 42, 188: 43, 190: 44, 195: 45, 199: 46, 205: 47, 211: 48, 212: 49, 214: 50, 219: 51, 221: 52, 231: 53, 249: 54, 253: 55, 254: 56, 257: 57, 258: 58}
59
0


In [128]:
print(next(iter(grouped))[1]['processed_text'])

0                        hi there all ok hi how are you
1     yeah im good thanks just been for a run thats ...
2     yeah what did you do ill come back on the have...
3     ok i see too bad about the runningbut yoga is ...
4     yes i realise its not easy really i can see it...
5     not similar to so not dissimilar meanssimilar ...
6     its bit of a complicated way of putting it i g...
7     yes im sure youre righti have thought about do...
8     yeah well my daughter started doing it and has...
9     ok sure she hit me with her elbow the other da...
10    no we were play fighting but she takes it quit...
11    no i think youre right i guess kids with sibli...
12    you can say as part of a team or in a team or ...
13     but not as a team just for yourself i understand
14    i sed to play a lot of football but then retir...
15    yeah some kids just dont for different reasons...
16    yeah in the uk too id saythere are lots of par...
17    yes you know one thing that i think about 

In [122]:
# 定义一个步长
step = 0.05

# 创建一个空的矩阵用于存储结果
rare_words_matrix = []

# 遍历每个组，并计算每个数据中有多少个词为rare word
for group in grouped:
    d_rare_words_count = []  # 存储当前d中每段话中rare words的数量
    d_nidf_values = NIDF[d_mapping[group[0]]]  # 获取当前d中每个数据的NIDF值
    d_nidf_values = d_nidf_values['NIDF']  # 将NIDF值转换为数值型
    # words = d_nidf_values[d_nidf_values > 0.95].index  # 获取NIDF大于0.95的词作为rare words
    
    # words = NIDF[d_mapping[group[0]]].loc[d_nidf_values[d_nidf_values > 0.95].index, 'Word'].values
    
    # 统计当前d中每段话中rare words的数量
    for threshold in np.arange(0.95, 0.05, -step):
        words = NIDF[d_mapping[group[0]]].loc[d_nidf_values[d_nidf_values > threshold].index, 'Word'].values
        rare_words_count = np.sum(group[1]['processed_text'].apply(lambda text: sum(word in words for word in text.split())))
        d_rare_words_count.append(rare_words_count)
    
    rare_words_matrix.append(d_rare_words_count)  # 将当前d的结果添加到矩阵中

# 将矩阵转换为DataFrame，方便查看结果
rare_words_matrix_df = pd.DataFrame(rare_words_matrix, columns=np.arange(0.95, 0.05, -step))
print(rare_words_matrix_df)


    0.95  0.90  0.85  0.80  0.75  0.70  0.65  0.60  0.55  0.50  0.45  0.40  \
0    263   263   263   425   425   505   505   586   667   706   769   926   
1    202   202   202   327   327   327   417   467   467   530   585   684   
2     82    82    82    82   152   152   178   178   200   228   243   278   
3    184   184   184   287   287   360   360   442   481   520   626   652   
4    344   344   344   577   577   577   686   775   775   815   918   949   
5    154   154   154   220   220   254   254   288   318   355   362   400   
6    108   108   108   175   175   175   218   237   264   278   323   341   
7    132   132   132   204   204   204   260   310   310   354   389   458   
8     48    48    48    48    99    99    99   129   137   137   142   155   
9    198   198   198   290   290   357   357   453   510   621   649   687   
10   161   161   161   280   280   280   369   435   435   501   565   591   
11   168   168   168   168   265   265   360   360   422   491  

In [124]:
# 定义一个步长
step = 0.05

# 创建一个空的矩阵用于存储结果
rare_words_matrix = []

# 遍历每个组，并计算每个数据中有多少个词为rare word
for group in grouped:
    d_rare_words_count = []  # 存储当前d中每段话中rare words的数量
    d_nidf_values = NIDF[d_mapping[group[0]]]  # 获取当前d中每个数据的NIDF值
    d_nidf_values = d_nidf_values['NIDF']  # 将NIDF值转换为数值型
    
    # 统计当前d中每段话中rare words的数量
    for threshold in np.arange(0.95, 0.05, -step):
        words = NIDF[d_mapping[group[0]]].loc[d_nidf_values > threshold, 'Word'].values
        rare_words_count = group[1]['processed_text'].apply(lambda text: sum(word in words for word in text.split()))
        d_rare_words_count.append(rare_words_count)
    
    d_rare_words_count = pd.concat(d_rare_words_count, axis=1)  # 将每个阈值下的 rare words 数量拼接在一起
    d_rare_words_count.columns = np.arange(0.95, 0.05, -step)  # 设置列名为对应的阈值
    rare_words_matrix.append(d_rare_words_count)  # 将当前d的结果添加到矩阵中

# 将矩阵转换为DataFrame，方便查看结果
rare_words_matrix_df = pd.concat(rare_words_matrix)  # 将每个d的结果拼接在一起
print(rare_words_matrix_df)

# 存储为Excel文件
rare_words_matrix_df.to_excel('rare_words.xlsx', index=False)


      0.95  0.90  0.85  0.80  0.75  0.70  0.65  0.60  0.55  0.50  0.45  0.40  \
0        0     0     0     0     0     0     0     0     0     1     1     3   
1        1     1     1     2     2     4     4     5     8     9    12    16   
2       20    20    20    29    29    34    34    37    50    50    54    68   
3       15    15    15    27    27    36    36    39    42    42    48    61   
4        5     5     5    10    10    16    16    16    19    20    21    26   
...    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   
2970     4     4     4     4     5     5     6     6     6     9    10    11   
2971     1     1     1     1     1     1     1     1     2     4     4     4   
2972     0     0     0     0     0     0     1     1     1     1     1     1   
2973    13    13    13    13    19    19    25    25    25    29    32    34   
2974     8     8     8     8     9     9    11    11    12    14    16    17   

      0.35  0.30  0.25  0.20  0.15  0.1

In [36]:
# 创建一个新的DataFrame来存储每个d中每段话的rare words数量
rare_words_count_df = pd.DataFrame(columns=['d', 'id', 'Threshold', 'RareWordsCount'])

# 遍历每个组，并计算每个d中每段话的rare words数量
for d, group in grouped:
    # 获取该组的文本id和NIDF值
    ids = group['id'].values
    nidf_values = idf_df.loc[idf_df['Word'].isin(group['processed_text'].sum().split())]['NIDF'].values
    
    # 以0.05为步长，计算rare words数量
    for threshold in np.arange(0.95, 0.05, -0.05):
        rare_words_count = np.sum(nidf_values > threshold)
        
        # 将结果添加到DataFrame中
        for i in range(len(ids)):
            rare_words_count_df = rare_words_count_df.append({'d': d, 'id': ids[i], 'Threshold': threshold, 'RareWordsCount': rare_words_count}, ignore_index=True)

# 显示每个d中每段话的rare words数量矩阵
print(rare_words_count_df)


         d         id  Threshold RareWordsCount
0        7  45939.txt       0.95           3277
1        7  45940.txt       0.95           3277
2        7  45941.txt       0.95           3277
3        7  45942.txt       0.95           3277
4        7  45943.txt       0.95           3277
...    ...        ...        ...            ...
53545  258  38342.txt       0.10           8090
53546  258  38343.txt       0.10           8090
53547  258  38344.txt       0.10           8090
53548  258  38345.txt       0.10           8090
53549  258  38346.txt       0.10           8090

[53550 rows x 4 columns]


In [27]:
import pandas as pd
import numpy as np
import string
from math import log

# 从更新后的Excel文件中读取数据
df = pd.read_excel('remove_student_teacher_processed.xlsx')

# 创建一个新的DataFrame来存储每个d中每段话中的rare words数量以及对应的文本id
rare_words_matrix = pd.DataFrame(columns=['d', 'id', 'Threshold', 'RareWordsCount'])

# 定义文本预处理函数
def preprocess_text(text):
    # 去除标点符号
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    # 转换文本为小写
    text = text.lower()
    # 分割单词
    words = text.split()
    # 去除停用词 "teacher" 和 "student"
    words = [word for word in words if word not in ['teacher', 'student']]
    return " ".join(words)

# 对 'text' 列进行预处理，将每个单词转换为小写并去除停用词
df['processed_text'] = df['text'].apply(preprocess_text)

# 根据 'd' 列对数据进行分组，以便为每个组单独进行计算
grouped = df.groupby('d')

# 定义一个函数来计算每个单词的IDF值
def calculate_idf(word, group):
    # 计算包含该单词的文档数
    documents_containing_word = group['processed_text'].apply(lambda x: word in x).sum()
    # 使用给定的公式计算IDF值
    idf = log(group['response'].values[0] / documents_containing_word)
    return idf

# 遍历每个组，并计算 'processed_text' 列中每个单词的IDF值和NIDF值
for d, group in grouped:
    # 获取该组中所有文本的单词集合
    words = set(" ".join(group['processed_text']).split())
    
    # 计算该组中每个单词的IDF值
    idf_values = []
    for word in words:
        idf = calculate_idf(word, group)
        idf_values.append(idf)
        
    # 遍历不同的阈值，计算rare words数量
    for threshold in np.arange(0.95, 0.05, -0.05):
        rare_words_count = len([idf for idf in idf_df['NIDF'] if idf > threshold])
        
        # 获取该组的文本id
        ids = group['id'].values[0]
        
        # 将结果添加到DataFrame中
        rare_words_matrix = rare_words_matrix.append({'d': d, 'id': ids, 'Threshold': threshold, 'RareWordsCount': rare_words_count}, ignore_index=True)

# 显示rare words数量矩阵
print(rare_words_matrix)


        d         id  Threshold RareWordsCount
0       7  45939.txt       0.95          11639
1       7  45939.txt       0.90          13062
2       7  45939.txt       0.85          14187
3       7  45939.txt       0.80          15878
4       7  45939.txt       0.75          16814
...   ...        ...        ...            ...
1057  258  38320.txt       0.30          21464
1058  258  38320.txt       0.25          21667
1059  258  38320.txt       0.20          21831
1060  258  38320.txt       0.15          21986
1061  258  38320.txt       0.10          22109

[1062 rows x 4 columns]
