# 1. Load Packages

In [1]:
import re
# Regular Expression (RegEx），to search, match and encode Chinese character

from collections import Counter

import pandas as pd

import jieba
import jieba.posseg as pseg

import ltp
from ltp import LTP

import torch

In [3]:
from multiprocessing import Pool, cpu_count

In [5]:
print(ltp.__version__)

4.2.13


In [7]:
ltp = LTP()  # load the default model

  state_dict = torch.load(model_file, map_location=map_location)


## 1.1. Test

In [10]:
result = ltp.pipeline(["清华大学是一所著名的高等学府。"], tasks=["cws"])
print("Segmentation Result:", result.cws)

Segmentation Result: [['清华大学', '是', '一', '所', '著名', '的', '高等', '学府', '。']]


In [12]:
type(result.cws)

list

In [14]:
# input the sentence
sentence = ["清华大学是一所著名的高等学府。"]

# Segmentation and lexical annotation by pipeline
result = ltp.pipeline(sentence, tasks=["cws", "pos"])

# Segmentation results
seg = result.cws[0]
print("Segmentation:", seg)

# Lexical annotation results
pos = result.pos[0]
print("Lexical Annotation:", pos)

# Extracting verbs (LTP uses 'v' for verbs)
verbs = [word for word, tag in zip(seg, pos) if tag == "v"]
print("Verb list:", verbs)

Segmentation: ['清华大学', '是', '一', '所', '著名', '的', '高等', '学府', '。']
Lexical Annotation: ['ni', 'v', 'm', 'q', 'a', 'u', 'b', 'n', 'wp']
Verb list: ['是']


# 2. Load Data

## 2.1. Former Couplet

In [18]:
with open("in上联.txt", "r", encoding="utf-8") as file:
    former = [line.strip() for line in file if line.strip()] 

In [20]:
len(former)

741096

## 2.2. Later Couplet

In [23]:
with open("out下联.txt", "r", encoding="utf-8") as file:
    later = [line.strip() for line in file if line.strip()] 

In [25]:
len(later)

741096

## 2.3. Couplet Merging 

In [28]:
couplet = pd.DataFrame({'Column1': former, 'Column2': later})

In [30]:
couplet

Unnamed: 0,Column1,Column2
0,晚 风 摇 树 树 还 挺,晨 露 润 花 花 更 红
1,愿 景 天 成 无 墨 迹,万 方 乐 奏 有 于 阗
2,丹 枫 江 冷 人 初 去,绿 柳 堤 新 燕 复 来
3,忽 忽 几 晨 昏 ， 离 别 间 之 ， 疾 病 间 之 ， 不 及 终 年 同 静 好,茕 茕 小 儿 女 ， 孱 羸 若 此 ， 娇 憨 若 此 ， 更 烦 二 老 费 精 神
4,闲 来 野 钓 人 稀 处,兴 起 高 歌 酒 醉 中
...,...,...
741091,半 榻 诗 书 盈 陋 室,一 墙 字 画 靓 寒 庐
741092,借 角 青 山 埋 姓 字,掬 壶 明 月 洗 尘 心
741093,苑 内 尽 天 姿 ， 锦 窠 仙 髻 无 双 艳,亭 前 多 国 色 ， 金 粉 紫 檀 第 一 香
741094,浩 淼 洞 庭 ， 极 目 天 为 界,安 闲 钓 叟 ， 静 心 孰 羡 鱼


In [32]:
type(couplet)

pandas.core.frame.DataFrame

# 3. Data Clean

## 3.1. Remove non-Chinese characters from each element

In [36]:
def keep_only_chinese(s):
    # Extract all Chinese characters using regular expressions
    chinese_characters = re.findall(r'[\u4e00-\u9fff]', s)
    # Re-splicing the extracted Chinese characters into a string
    return ''.join(chinese_characters)

In [38]:
cleaned_former_1 = [keep_only_chinese(line) for line in former]
cleaned_later_1 = [keep_only_chinese(line) for line in later]

# Apply the function to each column of the DataFrame
cleaned_couplet_1 = couplet
cleaned_couplet_1['Column1'] = cleaned_couplet_1['Column1'].apply(keep_only_chinese)
cleaned_couplet_1['Column2'] = cleaned_couplet_1['Column2'].apply(keep_only_chinese)

In [39]:
print('The length of cleaned_former_1 is',len(cleaned_former_1))
print('The length of cleaned_later_1 is',len(cleaned_later_1))
print('cleaned_couplet_1(df）has',cleaned_couplet_1.shape[0],'rows')


The length of cleaned_former_1 is 741096
The length of cleaned_later_1 is 741096
cleaned_couplet_1(df）has 741096 rows


In [40]:
cleaned_couplet_1

Unnamed: 0,Column1,Column2
0,晚风摇树树还挺,晨露润花花更红
1,愿景天成无墨迹,万方乐奏有于阗
2,丹枫江冷人初去,绿柳堤新燕复来
3,忽忽几晨昏离别间之疾病间之不及终年同静好,茕茕小儿女孱羸若此娇憨若此更烦二老费精神
4,闲来野钓人稀处,兴起高歌酒醉中
...,...,...
741091,半榻诗书盈陋室,一墙字画靓寒庐
741092,借角青山埋姓字,掬壶明月洗尘心
741093,苑内尽天姿锦窠仙髻无双艳,亭前多国色金粉紫檀第一香
741094,浩淼洞庭极目天为界,安闲钓叟静心孰羡鱼


## 3.2. Retain only the rows where the number of Chinese characters is 7

In [42]:
pattern = re.compile(r'^[\u4e00-\u9fff]{7}$')

In [47]:
cleaned_former_2 = [sentence for sentence in cleaned_former_1 if pattern.match(sentence)]
cleaned_later_2 = [sentence for sentence in cleaned_later_1 if pattern.match(sentence)]

# 同时检查 Column1 和 Column2
cleaned_couplet_2 = cleaned_couplet_1[
    cleaned_couplet_1['Column1'].apply(lambda x: bool(pattern.match(x))) &
    cleaned_couplet_1['Column2'].apply(lambda x: bool(pattern.match(x)))
]

In [49]:
print('The length of cleaned_former_2 is',len(cleaned_former_2))
print('The length of cleaned_later_2 is',len(cleaned_later_2))
print('cleaned_couplet_2(df）has',cleaned_couplet_2.shape[0],'rows')

The length of cleaned_former_2 is 346104
The length of cleaned_later_2 is 346104
cleaned_couplet_2(df）has 346097 rows


In [51]:
# Check the number of eligible rows for Column1
column1_valid = cleaned_couplet_1['Column1'].apply(lambda x: bool(pattern.match(x))).sum()
print(f"The number of eligible rows for Column1: {column1_valid}")

# Check the number of eligible rows for Column2
column2_valid = cleaned_couplet_1['Column2'].apply(lambda x: bool(pattern.match(x))).sum()
print(f"The number of eligible rows for Column2: {column2_valid}")


The number of eligible rows for Column1: 346104
The number of eligible rows for Column2: 346104


In [53]:
valid_couplet_df = pd.DataFrame({
    'Column1': cleaned_couplet_1['Column1'][cleaned_couplet_1['Column1'].apply(lambda x: bool(pattern.match(x)))].tolist(),
    'Column2': cleaned_couplet_1['Column2'][cleaned_couplet_1['Column2'].apply(lambda x: bool(pattern.match(x)))].tolist()
})

print(f"The length of the merged data is: {len(valid_couplet_df)}")


The length of the merged data is: 346104


In [55]:
print(valid_couplet_df)
print(cleaned_couplet_2)

        Column1  Column2
0       晚风摇树树还挺  晨露润花花更红
1       愿景天成无墨迹  万方乐奏有于阗
2       丹枫江冷人初去  绿柳堤新燕复来
3       闲来野钓人稀处  兴起高歌酒醉中
4       投石向天跟命斗  闭门问卷与时争
...         ...      ...
346099  寺镇摩尼青色宝  山飞舍利紫祥光
346100  呼饭为斋禅十足  借鸡生蛋利三分
346101  半榻诗书盈陋室  一墙字画靓寒庐
346102  借角青山埋姓字  掬壶明月洗尘心
346103  志踏云梯能揽月  坚磨铁棒可成针

[346104 rows x 2 columns]
        Column1  Column2
0       晚风摇树树还挺  晨露润花花更红
1       愿景天成无墨迹  万方乐奏有于阗
2       丹枫江冷人初去  绿柳堤新燕复来
4       闲来野钓人稀处  兴起高歌酒醉中
6       投石向天跟命斗  闭门问卷与时争
...         ...      ...
741087  寺镇摩尼青色宝  山飞舍利紫祥光
741089  呼饭为斋禅十足  借鸡生蛋利三分
741091  半榻诗书盈陋室  一墙字画靓寒庐
741092  借角青山埋姓字  掬壶明月洗尘心
741095  志踏云梯能揽月  坚磨铁棒可成针

[346097 rows x 2 columns]


In [57]:
# Find the difference set with `pd.merge`.
missing_rows = valid_couplet_df.merge(cleaned_couplet_2, on=['Column1', 'Column2'], how='left', indicator=True)
missing_rows = missing_rows[missing_rows['_merge'] == 'left_only']

print(f"Number of missing rows: {len(missing_rows)}")
print(missing_rows[['Column1', 'Column2']])


Number of missing rows: 219422
        Column1  Column2
12535   静夜残风添寂寞  才雄能应人际情
12536   性懒全抛世俗事  水月调弦淡雅风
12537   山泉出句清廉韵  红枫霜重雁乌寒
12538   碧海潮生鱼龙跃  背后常多算计人
12539   尘中哪有笔直路  文近天然雅自生
...         ...      ...
330760  抱手围棋观胜负  绿水青山室外琴
330761  红云碧宇天然画  九州织梦绣图新
330762  百会征联文集雅  虎大了无狼狗欺
330764  人穷时有蚊虫咬  一轮蟾影恰当帘
330765  四面岚光俱入座  有时云峤听钩辀

[219422 rows x 2 columns]


## 3.3. Reset Row Index

In [60]:
# Reset Row Index
cleaned_couplet_3 = cleaned_couplet_2.reset_index(drop=True)

In [62]:
print(cleaned_couplet_3)

        Column1  Column2
0       晚风摇树树还挺  晨露润花花更红
1       愿景天成无墨迹  万方乐奏有于阗
2       丹枫江冷人初去  绿柳堤新燕复来
3       闲来野钓人稀处  兴起高歌酒醉中
4       投石向天跟命斗  闭门问卷与时争
...         ...      ...
346092  寺镇摩尼青色宝  山飞舍利紫祥光
346093  呼饭为斋禅十足  借鸡生蛋利三分
346094  半榻诗书盈陋室  一墙字画靓寒庐
346095  借角青山埋姓字  掬壶明月洗尘心
346096  志踏云梯能揽月  坚磨铁棒可成针

[346097 rows x 2 columns]


## 3.4. Remove duplicate lines

In [65]:
# Find duplicate rows and get their location and number of repetitions
def find_duplicates(df):
    # Find all duplicate rows
    duplicates = df[df.duplicated(keep=False)]
    
    # Count the number of repetitions per line
    duplicate_counts = df.duplicated(keep=False).value_counts()
    
    # Getting the index of duplicate rows
    duplicate_indices = duplicates.index.tolist()
    
    # Print information of duplicate lines
    print("Information of duplicate lines:")
    for index in duplicate_indices:
        # Find identical rows
        duplicate_rows = df[df.apply(lambda row: row.equals(df.loc[index]), axis=1)]
        
        print(f"\n The information of thw duplicate line {index}")
        print(duplicate_rows)
        print(f"This row duplicated {len(duplicate_rows)} times")
        print("-" * 50)
    
    # Return the total number of duplicate rows and details
    return {
        'total_duplicates': len(duplicates),
        'duplicate_indices': duplicate_indices
    }

In [67]:
#####
#####
#WARNING!!! DO NOT RUN!!! TIME CONSUMER!!!
#####
#####

# duplicate_info = find_duplicates(cleaned_couplet_3)
# print("\n The number of duplicates :", duplicate_info['total_duplicates'])

In [69]:
# Group counting by identical rows
duplicate_counts = cleaned_couplet_3.duplicated(keep=False)
detail_duplicates = cleaned_couplet_3[duplicate_counts]

# Print the number of duplicate lines
print("Total number of duplicate lines:", len(detail_duplicates))

# Print the number of times each repeating line occurs
print("\n Number of occurrences of the repeated rows:")
print(detail_duplicates.groupby(detail_duplicates.columns.tolist()).size())

Total number of duplicate lines: 2296

 Number of occurrences of the repeated rows:
Column1  Column2
一世尘缘难舍弃  三生爱恋不归来    2
一丝惆怅无端起  几许苍凉暗上来    2
一代名师人逝去  千秋华表鹤飞来    2
一任岁月随风去  不辞春光带雨来    2
一别挥手从军去  几度焚林放火来    2
                   ..
龙腾华夏宏图起  燕舞新春福气来    2
龙腾沧海卷雪去  虎踞灵山迎春来    2
龙腾虎跃忍疼去  猴年马月终会来    2
龙蟠沧海贺岁去  虎踞灵山迎春来    2
龙门有兆春风起  泉眼无声活水来    2
Length: 1148, dtype: int64


In [71]:
# Remove duplicate rows and keep the first occurrence
cleaned_couplet_4 = cleaned_couplet_3.drop_duplicates()

# Reset indexes
cleaned_couplet_4 = cleaned_couplet_4.reset_index(drop=True)

In [73]:
print('cleaned_couplet_4(df) has ',cleaned_couplet_4.shape[0],'rows')

cleaned_couplet_4(df) has  344949 rows


In [75]:
print(cleaned_couplet_4)

        Column1  Column2
0       晚风摇树树还挺  晨露润花花更红
1       愿景天成无墨迹  万方乐奏有于阗
2       丹枫江冷人初去  绿柳堤新燕复来
3       闲来野钓人稀处  兴起高歌酒醉中
4       投石向天跟命斗  闭门问卷与时争
...         ...      ...
344944  寺镇摩尼青色宝  山飞舍利紫祥光
344945  呼饭为斋禅十足  借鸡生蛋利三分
344946  半榻诗书盈陋室  一墙字画靓寒庐
344947  借角青山埋姓字  掬壶明月洗尘心
344948  志踏云梯能揽月  坚磨铁棒可成针

[344949 rows x 2 columns]


## 3.5. Further Check

### 3.5.1. Whether all 7 Chinese characters

In [85]:
def count_chinese_characters(s):
    # Use regular expressions to match all Chinese characters.
    chinese_characters = re.findall(r'[\u4e00-\u9fff]', s)
    # Returns the number of Chinese characters matched
    return len(chinese_characters)

In [87]:
# Apply the counter function
chinese_count_df = pd.DataFrame({
    'Column1_chinese_count': cleaned_couplet_4['Column1'].apply(count_chinese_characters),
    'Column2_chinese_count': cleaned_couplet_4['Column2'].apply(count_chinese_characters)
})

In [89]:
# Check both columns
all_7_both_columns = all((chinese_count_df['Column1_chinese_count'] == 7) & 
                         (chinese_count_df['Column2_chinese_count'] == 7))
print("The two columns have all elements of 7 Chinese character:", all_7_both_columns)

The two columns have all elements of 7 Chinese character: True


### 3.5.2.  Determine whether a string contains non-Hanzi characters

In [92]:
def has_non_chinese_characters(s):
    # Regular Expression Matching for Non-Chinese Characters
    # [^\u4e00-\u9fff] matches all characters that are not Chinese characters
    return bool(re.search(r'[^\u4e00-\u9fff]', s))

In [94]:
non_chinese_df = pd.DataFrame({
    'Column1_has_non_chinese': cleaned_couplet_4['Column1'].apply(has_non_chinese_characters),
    'Column2_has_non_chinese': cleaned_couplet_4['Column2'].apply(has_non_chinese_characters)
})

In [96]:
non_chinese_both_columns = all((non_chinese_df['Column1_has_non_chinese'] == 7) & 
                               (non_chinese_df['Column2_has_non_chinese'] == 7))
print("Non-Chinese character exists:", non_chinese_both_columns)

Non-Chinese character exists: False


In [98]:
print(cleaned_couplet_4)

        Column1  Column2
0       晚风摇树树还挺  晨露润花花更红
1       愿景天成无墨迹  万方乐奏有于阗
2       丹枫江冷人初去  绿柳堤新燕复来
3       闲来野钓人稀处  兴起高歌酒醉中
4       投石向天跟命斗  闭门问卷与时争
...         ...      ...
344944  寺镇摩尼青色宝  山飞舍利紫祥光
344945  呼饭为斋禅十足  借鸡生蛋利三分
344946  半榻诗书盈陋室  一墙字画靓寒庐
344947  借角青山埋姓字  掬壶明月洗尘心
344948  志踏云梯能揽月  坚磨铁棒可成针

[344949 rows x 2 columns]


# 4. Mark Verb

## 4.1. Package 'jieba' (dropped)

In [41]:
# # 定义一个函数，用于标记动词为 1，其他词性为 0
# def tag_verbs(sentence):
#     # 使用 jieba 分词和词性标注
#     words = pseg.cut(sentence)
    
#     # 结果列表，初始全为 0
#     result = [0] * len(sentence)
    
#     # 遍历分词结果，根据词性标记
#     for word, flag in words:
#         if flag == 'v':  # 动词的词性标记为 'v'
#             for i, char in enumerate(sentence):
#                 if char in word:  # 如果字符属于当前词
#                     result[i] = 1
    
#     return result

In [42]:
# # 对上联和下联分别标记动词
# cleaned_couplet_4['Column1_Tag'] = cleaned_couplet_4['Column1'].apply(tag_verbs)
# cleaned_couplet_4['Column2_Tag'] = cleaned_couplet_4['Column2'].apply(tag_verbs)

# # 查看结果
# print(cleaned_couplet_4)

## 4.2. Package 'ltp'

### 4.2.1. Defining function for labeling verb

In [105]:
ltp = LTP()  # Loading the default model

  state_dict = torch.load(model_file, map_location=map_location)


In [107]:
def tag_verbs_by_character(sentence):
    # Split sentences into individual characters
    characters = list(sentence)
    # Segmentation and lexical labeling of entire sentences using LTP
    result = ltp.pipeline([sentence], tasks=["cws", "pos"])
    seg = result.cws[0]  # Segmentation result
    pos = result.pos[0]  # Lexical annotation result

    # Create verbatim lexical annotation result
    tags = []
    for char in characters:
        # Check whether the current Chinese character appears in the segmentation result
        if char in seg:
            # If the kanji is found, get its lexical property
            ## kanji ??? Chinese Character
            index = seg.index(char)
            tag = 1 if pos[index] == "v" else 0
        else:
            # If the kanji is not tagged by the LTP segmentation result, it defaults to a non-verb.
            tag = 0
        tags.append(tag)
    return tags

In [109]:
tag_verbs_by_character("志踏云梯能揽月")

[0, 1, 0, 0, 1, 1, 0]

In [111]:
# Defining Parallel Processing Functions
def parallel_apply(func, data, num_processes=None):
    """
    Function for parallel processing
    - func: Function to be applied
    - data: data set
    - num_processes: Number of processes (default is number of CPU cores)
    """
    if num_processes is None:
        num_processes = cpu_count()  # Uses all CPU cores of the system
    with Pool(processes=num_processes) as pool:
        results = pool.map(func, data)
    return results

In [113]:
# Parallel processing of two columns
# cleaned_couplet_4['Column1_Verbs'] = parallel_apply(tag_verbs_by_character, cleaned_couplet_4['Column1'].tolist())
# cleaned_couplet_4['Column2_Verbs'] = parallel_apply(tag_verbs_by_character, cleaned_couplet_4['Column2'].tolist())

# print(cleaned_couplet_4)

### 4.2.1. Small batch run

In [116]:
couplet_0_10000 = cleaned_couplet_4.head(10000)

In [58]:
couplet_10001_20000 = cleaned_couplet_4.iloc[10000:20000]  # Note that the index starts at 0

In [64]:
couplet_20001_30000 = cleaned_couplet_4.iloc[20000:30000]  # Note that the index starts at 0

In [68]:
couplet_30001_40000 = cleaned_couplet_4.iloc[30000:40000]  # Note that the index starts at 0

In [71]:
print(couplet_30001_40000)

       Column1  Column2
30000  镜里鬓霜为墨换  笔底文章因才活
30001  黄莺紫燕迎春使  白雪红梅贺岁图
30002  学士班联旧清秘  翰林风月小澄怀
30003  桂馥须邀嘉客赏  桔黄应待国宾尝
30004  花娇惹妒群蜂踩  果老夸张打马云
...        ...      ...
39995  寒泉漱月清秋味  白石游虾古色香
39996  田家好客三沽酒  游子怀乡几弄弦
39997  把盏纵横天下事  埋头旦暮主人心
39998  超脱三界外难也  放浪五行中易乎
39999  南山树绿弥清韵  古塔霞红溢雅情

[10000 rows x 2 columns]


In [72]:
# Mark the verbs for each of the former and later couplets
couplet_30001_40000['Column1_Tag'] = couplet_30001_40000['Column1'].apply(tag_verbs_by_character)
couplet_30001_40000['Column2_Tag'] = couplet_30001_40000['Column2'].apply(tag_verbs_by_character)

# Print result
print(couplet_30001_40000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  couplet_30001_40000['Column1_Tag'] = couplet_30001_40000['Column1'].apply(tag_verbs_by_character)


       Column1  Column2            Column1_Tag            Column2_Tag
30000  镜里鬓霜为墨换  笔底文章因才活  [0, 0, 0, 0, 1, 0, 1]  [0, 0, 0, 0, 0, 0, 0]
30001  黄莺紫燕迎春使  白雪红梅贺岁图  [0, 0, 0, 0, 0, 0, 1]  [0, 0, 0, 0, 0, 0, 0]
30002  学士班联旧清秘  翰林风月小澄怀  [0, 0, 0, 0, 0, 0, 0]  [0, 0, 0, 0, 0, 0, 0]
30003  桂馥须邀嘉客赏  桔黄应待国宾尝  [0, 0, 0, 1, 0, 0, 1]  [0, 0, 1, 1, 0, 0, 1]
30004  花娇惹妒群蜂踩  果老夸张打马云  [0, 0, 1, 1, 0, 0, 1]  [0, 0, 0, 0, 0, 0, 0]
...        ...      ...                    ...                    ...
39995  寒泉漱月清秋味  白石游虾古色香  [0, 0, 1, 0, 0, 0, 0]  [0, 0, 0, 0, 0, 0, 0]
39996  田家好客三沽酒  游子怀乡几弄弦  [0, 0, 0, 0, 0, 0, 0]  [0, 0, 0, 0, 0, 1, 0]
39997  把盏纵横天下事  埋头旦暮主人心  [0, 0, 0, 0, 0, 0, 0]  [0, 0, 0, 0, 0, 0, 0]
39998  超脱三界外难也  放浪五行中易乎  [1, 1, 0, 0, 0, 0, 0]  [0, 0, 0, 0, 0, 0, 0]
39999  南山树绿弥清韵  古塔霞红溢雅情  [0, 0, 0, 0, 1, 0, 0]  [0, 0, 0, 0, 1, 0, 0]

[10000 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  couplet_30001_40000['Column2_Tag'] = couplet_30001_40000['Column2'].apply(tag_verbs_by_character)


In [73]:
couplet_30001_40000.to_csv('词性数据/couplet_30001_40000.csv', index=False)

In [60]:
# Setting the start and end indexes
start_index = 200000  
end_index = cleaned_couplet_4.shape[0]  # Total number of rows of data
step = 10000  # Step size for each process

for i in range(start_index, end_index, step):
    # Dynamic generation of chunked data
    chunk = cleaned_couplet_4.iloc[i:i + step]
    
    # Apply the tag_verbs_by_character Function
    chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
    chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)
    
    # Dynamically generated file names
    output_path = f'词性数据/couplet_{i + 1}_{min(i + step, end_index)}.csv'
    
    # Save data
    chunk.to_csv(output_path, index=False)
    
    print(f"Saved: {output_path}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_200001_210000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_210001_220000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_220001_230000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_230001_240000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_240001_250000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_250001_260000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_260001_270000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_270001_280000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_280001_290000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_290001_300000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_300001_310000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_310001_320000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_320001_330000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_330001_340000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(tag_verbs_by_character)


Saved: 词性数据/couplet_340001_344949.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(tag_verbs_by_character)


# 5. Segmentation

## 5.1. Define the Segmentation function

In [121]:
# Input Sentences
sentence = ["清华大学是一所著名的高等学府。"]

# Segmentation using the pipeline method
result = ltp.pipeline(sentence, tasks=["cws"])  # Performing the Segmentation Task

# Extracting Segmentation Results
seg = result.cws[0]  # Get the result of the first sentence's disambiguation
print("Segmentation Result:", seg)

Segmentation Result: ['清华大学', '是', '一', '所', '著名', '的', '高等', '学府', '。']


In [125]:
# Defining encoding functions
def encode_sentence(sentence):
    """
    Sentences are segmented and coded:
    The first word of each word is labeled 1 and the rest of the word is labeled 0.
    """
    # Segmentation
    result = ltp.pipeline([sentence], tasks=["cws"])
    seg = result.cws[0]  # Segmentation result

    # Initialize the code list
    #encoding = [0] * len(sentence)
    encoding = [0] * 7 
    ###
    ### I've changed it directly to 7. If you want to run the following test code, change the encoding back to the one above.

    # Iterate over the result of the word segmentation, marking the encoding
    start_idx = 0
    for word in seg:
        # Mark the first Chinese character of the current expression as 1
        encoding[start_idx] = 1
        # Move the index position to skip the length of the current word
        start_idx += len(word)
    
    # Converting a list of codes into a string
    return ''.join(map(str, encoding))



In [127]:
# example sentence
sentence = "清华大学是一所著名的高等学府"

# call function
encoded = encode_sentence(sentence)
print("Encoding results:", encoded)

编码结果: 10001111011010


In [129]:
# example sentence
sentence = "黄莺紫燕迎春使"

# call function
encoded = encode_sentence(sentence)
print("Encoding results:", encoded)

编码结果: 1010101


## 5.2. Small batch run

In [132]:
couplet_0_10000 = cleaned_couplet_4.head(10000)

In [134]:
print(couplet_0_10000)

      Column1  Column2
0     晚风摇树树还挺  晨露润花花更红
1     愿景天成无墨迹  万方乐奏有于阗
2     丹枫江冷人初去  绿柳堤新燕复来
3     闲来野钓人稀处  兴起高歌酒醉中
4     投石向天跟命斗  闭门问卷与时争
...       ...      ...
9995  无常最是风云变  淡定一如泰岳闲
9996  泻玉吻梅香雪海  熔金落日醉春江
9997  且凭鹤驾寻沧海  忍送文星上碧天
9998  第一香应名士品  初三月是美人修
9999  诗风到处生春景  禅趣来时悟妙机

[10000 rows x 2 columns]


In [136]:
couplet_0_10000['Column1_Tag'] = couplet_0_10000['Column1'].apply(encode_sentence)
couplet_0_10000['Column2_Tag'] = couplet_0_10000['Column2'].apply(encode_sentence)

KeyboardInterrupt: 

In [83]:
print(couplet_0_10000)

      Column1  Column2 Column1_Tag Column2_Tag
0     晚风摇树树还挺  晨露润花花更红     1011111     1011011
1     愿景天成无墨迹  万方乐奏有于阗     1101110     1001111
2     丹枫江冷人初去  绿柳堤新燕复来     1001111     1001011
3     闲来野钓人稀处  兴起高歌酒醉中     1111111     1010101
4     投石向天跟命斗  闭门问卷与时争     1111111     1000111
...       ...      ...         ...         ...
9995  无常最是风云变  淡定一如泰岳闲     1011100     1011101
9996  泻玉吻梅香雪海  熔金落日醉春江     1101111     1010110
9997  且凭鹤驾寻沧海  忍送文星上碧天     1111110     1111110
9998  第一香应名士品  初三月是美人修     1011110     1001100
9999  诗风到处生春景  禅趣来时悟妙机     1010110     1011110

[10000 rows x 4 columns]


In [84]:
couplet_0_10000.to_csv('分词数据/couplet_0_10000.csv', index=False)

In [48]:
# Setting the start and end indexes
start_index = 130000  
end_index = cleaned_couplet_4.shape[0]  # Total number of rows of data
step = 10000  # Step size for each process

for i in range(start_index, end_index, step):
    # Dynamic generation of chunked data
    chunk = cleaned_couplet_4.iloc[i:i + step]
    
    # Applying the tag_verbs_by_character Function
    chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
    chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)
    
    # Dynamically generated file names
    output_path = f'分词数据/分词couplet_{i + 1}_{min(i + step, end_index)}.csv'
    
    # Save data
    chunk.to_csv(output_path, index=False)
    
    print(f"Saved: {output_path}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_130001_140000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_140001_150000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_150001_160000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_160001_170000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_170001_180000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_180001_190000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_190001_200000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_200001_210000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_210001_220000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_220001_230000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_230001_240000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_240001_250000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_250001_260000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_260001_270000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_270001_280000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_280001_290000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_290001_300000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_300001_310000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_310001_320000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_320001_330000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)


Saved: 分词数据/分词couplet_330001_340000.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column1_Tag'] = chunk['Column1'].apply(encode_sentence)


Saved: 分词数据/分词couplet_340001_344949.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Column2_Tag'] = chunk['Column2'].apply(encode_sentence)
