In [1]:
from datasets import load_dataset

# Download the ChatGPT paraphrase corpus and take its training split.
dataset = load_dataset("humarin/chatgpt-paraphrases")
train_data = dataset['train']

# Keep only sentence-level records that originate from CNN news.
cnn_sentence_records = [
    record
    for record in train_data
    if record['category'] == 'sentence' and record['source'] == 'cnn_news'
]
texts = [record['text'] for record in cnn_sentence_records]
paraphrases_list = [record['paraphrases'] for record in cnn_sentence_records]

# Assemble the two parallel lists into a frame and persist it without the index.
import pandas as pd
df = pd.DataFrame({'text': texts, 'paraphrases': paraphrases_list})
df.to_csv('input_sentences.csv', index=False)

In [2]:
# Preview the first rows of the filtered text/paraphrases frame.
df.head()

Unnamed: 0,text,paraphrases
0,"Matt Chorley, Mailonline Political Editor .",['The Political Editor of Mailonline is Matt C...
1,Universities minister David Willetts said all ...,"[""According to David Willetts, universities ca..."
2,Universities cannot be expected to march forei...,"['David Willetts, a senior Tory minister, caut..."
3,The universities minister told MailOnline lect...,"[""According to MailOnline, the universities mi..."
4,But he insisted they cannot be forced to track...,['He maintained that they cannot be compelled ...


In [None]:
# Convert the string representation of a list into a real list and keep its first element.
import ast


def first_paraphrase(value):
    """Return the first paraphrase stored in one ``paraphrases`` cell.

    Args:
        value: Either the string form of a Python list (e.g. "['a', 'b']"),
            an actual list/tuple, or an already-extracted sentence.

    Returns:
        The first list element, or ``value`` unchanged when it is a string
        that does not look like a list literal.

    Raises:
        ValueError, SyntaxError: if a bracketed string is not a valid literal.
    """
    if isinstance(value, str):
        candidate = value.strip()
        # Only bracketed strings are parsed, so re-running this cell on
        # already-extracted sentences leaves them untouched instead of crashing.
        if candidate.startswith('[') and candidate.endswith(']'):
            # literal_eval only accepts Python literals, unlike eval, which
            # would execute arbitrary expressions found in the dataset text.
            return ast.literal_eval(candidate)[0]
        return value
    return value[0]


df['paraphrases'] = df['paraphrases'].apply(first_paraphrase)
df.head()

In [12]:
# Load the generated modifications; rows the CSV parser cannot handle are skipped.
df_output = pd.read_csv(
    'output_sentences_all.csv',
    encoding='utf-8',
    on_bad_lines='skip',
)
# Show (rows, columns) of what was loaded.
df_output.shape

(11629, 4)

In [13]:
# Preview the first rows of the loaded output file.
df_output.head()

Unnamed: 0,Original Sentence,Modified Sentence,Modification Type,semantic_relationship
0,"Matt Chorley, Mailonline Political Editor .",Corley is a political editor for Mailonline.,RD,Predicted Relationship: entailment with 98.31%...
1,"Matt Chorley, Mailonline Political Editor .",Mailonline's persuasion editor is Matt Chorley.,hypernym,Predicted Relationship: entailment with 85.41%...
2,"Matt Chorley, Mailonline Political Editor .",Matt Chorley isn't a political editor at Mailo...,polarity_negation,Predicted Relationship: contradiction with 99....
3,"Matt Chorley, Mailonline Political Editor .",Mailonline political editor Matt Chorley,RS,Predicted Relationship: entailment with 99.95%...
4,"Matt Chorley, Mailonline Political Editor .",Mailonline's political editor Matt Chorley is ...,antonym,Predicted Relationship: neutral with 88.81% co...


In [17]:
# First cell: clean the data by dropping rows whose semantic_relationship is a float.
# Flag every row whose semantic_relationship value is a float instance.
float_mask = df_output['semantic_relationship'].map(lambda value: isinstance(value, float))
problematic_rows = df_output.loc[float_mask]
problematic_count = len(problematic_rows)

if problematic_count == 0:
    print("No float values found in semantic_relationship column")
else:
    print(f"Found {problematic_count} problematic rows with float values")
    print("\nSample of problematic rows:")
    preview_columns = ['Original Sentence', 'Modified Sentence', 'Modification Type', 'semantic_relationship']
    print(problematic_rows[preview_columns].head())

    # Keep only the rows that were not flagged.
    df_output = df_output[~float_mask]
    print(f"\nAfter removing float values, DataFrame shape: {df_output.shape}")

Found 37 problematic rows with float values

Sample of problematic rows:
                                      Original Sentence  \
3621  A team of scientists has just finished what mi...   
3622  A team of scientists has just finished what mi...   
7340  The fact that Syd Leroux comes in and changes ...   
7341  The fact that Syd Leroux comes in and changes ...   
7342  The fact that Syd Leroux comes in and changes ...   

                                      Modified Sentence  \
3621  Scientists have just finished surveiling and m...   
3622  The unscientific team just finished surveiling...   
7340                                                 RD   
7341                                           hypernym   
7342                                  polarity_negation   

                                      Modification Type semantic_relationship  
3621  Predicted Relationship: contradiction with 99....                   NaN  
3622  Predicted Relationship: contradiction with 99....  

In [18]:
# Show the (rows, columns) of df_output at this point in the notebook.
df_output.shape

(11592, 4)

In [19]:
def extract_confidence(text):
    """Extract the contradiction confidence (in percent) from a prediction string.

    Args:
        text: A value such as
            "Predicted Relationship: contradiction with 99.93% confidence".
            Non-string values (for example NaN from a malformed CSV row)
            are accepted and treated as "no contradiction".

    Returns:
        The number between "contradiction with" and the following "%" as a
        float, or 0 when the text is not a string, is not a contradiction
        prediction, or the number cannot be parsed.
    """
    marker = 'contradiction with'
    # Guard against non-strings first: `in` on a float would raise TypeError.
    if not isinstance(text, str) or marker not in text:
        return 0
    try:
        return float(text.split(marker)[1].split('%')[0].strip())
    except ValueError:
        # The text after the marker was not a number.
        return 0

test_text = "Predicted Relationship: contradiction with 99.93% confidence"
print(f"Test result: {extract_confidence(test_text)}%")

Test result: 99.93%


In [20]:
# Third cell: keep only contradiction predictions with more than 90% confidence.
relationship_text = df_output['semantic_relationship']
mentions_contradiction = relationship_text.str.contains('contradiction', na=False)
is_confident = relationship_text.apply(extract_confidence) > 90
mask = mentions_contradiction & is_confident

high_confidence_contradictions = df_output[mask]

print(f"Found {len(high_confidence_contradictions)} rows with high confidence contradictions")
print("\nSample of filtered data:")
preview_columns = ['Original Sentence', 'semantic_relationship']
print(high_confidence_contradictions[preview_columns].head())

Found 3051 rows with high confidence contradictions

Sample of filtered data:
                                    Original Sentence  \
2         Matt Chorley, Mailonline Political Editor .   
8   Universities minister David Willetts said all ...   
9   Universities minister David Willetts said all ...   
10  Universities cannot be expected to march forei...   
12  Universities cannot be expected to march forei...   

                                semantic_relationship  
2   Predicted Relationship: contradiction with 99....  
8   Predicted Relationship: contradiction with 97....  
9   Predicted Relationship: contradiction with 99....  
10  Predicted Relationship: contradiction with 99....  
12  Predicted Relationship: contradiction with 98....  


In [None]:
# Fourth cell: check the format of the paraphrases column in the original df.
import ast


def _first_list_item(value):
    """Return the first element of a list or list-literal string.

    Strings that do not look like a list literal are returned unchanged, so
    the check also works when the column already holds plain sentences.
    """
    if isinstance(value, str):
        candidate = value.strip()
        if candidate.startswith('[') and candidate.endswith(']'):
            # literal_eval parses literals only; eval would execute the text.
            return ast.literal_eval(candidate)[0]
        return value
    return value[0]


print("Original df paraphrases format:")
print(df['paraphrases'].head())
print("\nAfter eval:")
print(df['paraphrases'].apply(_first_list_item).head())

In [22]:
# Fourth cell: check the format of the paraphrases column in the original df.
paraphrase_column = df['paraphrases']
print("Original df paraphrases format:")
print(paraphrase_column.head())

# Inspect the concrete type and content of the first row.
first_row = paraphrase_column.iloc[0]
print("\nFirst row type and content:")
print(f"Type: {type(first_row)}")
print(f"Content: {first_row}")

# Print the first value as-is (useful if it is already a plain string).
print("\nTrying to use the string directly:")
print(first_row)

Original df paraphrases format:
0    The Political Editor of Mailonline is Matt Cho...
1    According to David Willetts, universities can ...
2    David Willetts, a senior Tory minister, cautio...
3    According to MailOnline, the universities mini...
4    He maintained that they cannot be compelled to...
Name: paraphrases, dtype: object

First row type and content:
Type: <class 'str'>
Content: The Political Editor of Mailonline is Matt Chorley.

Trying to use the string directly:
The Political Editor of Mailonline is Matt Chorley.


In [23]:
# Fourth cell: check the format of the paraphrases column in the original df.
paraphrase_column = df['paraphrases']
print("Original df paraphrases format:")
print(paraphrase_column.head())

# Confirm the column dtype and count the Python type of each value.
print("\nData type check:")
print(paraphrase_column.dtype)
value_types = paraphrase_column.apply(type)
print(value_types.value_counts())

Original df paraphrases format:
0    The Political Editor of Mailonline is Matt Cho...
1    According to David Willetts, universities can ...
2    David Willetts, a senior Tory minister, cautio...
3    According to MailOnline, the universities mini...
4    He maintained that they cannot be compelled to...
Name: paraphrases, dtype: object

Data type check:
object
paraphrases
<class 'str'>    80076
Name: count, dtype: int64


In [24]:
# Fifth cell: build the result DataFrame.
# Build the text -> paraphrase lookup once instead of scanning df for every row.
# keep='first' mirrors the original `.iloc[0]` choice when a text occurs more
# than once; missing texts are dropped because they can never match.
unique_texts = df.dropna(subset=['text']).drop_duplicates(subset='text', keep='first')
paraphrase_by_text = dict(zip(unique_texts['text'], unique_texts['paraphrases']))

result_rows = []
for _, row in high_confidence_contradictions.iterrows():
    original_sentence = row['Original Sentence']

    # Rows whose original sentence is absent from df are skipped.
    if original_sentence in paraphrase_by_text:
        result_rows.append({
            'original_sentence': original_sentence,
            'modified_sentence': row['Modified Sentence'],
            'modification_type': row['Modification Type'],
            'semantic_relationship': row['semantic_relationship'],
            'paraphrase': paraphrase_by_text[original_sentence]  # already a plain string
        })

result_df = pd.DataFrame(result_rows)
print(f"Final result shape: {result_df.shape}")
print("\nSample of final results:")
print(result_df.head())

Final result shape: (3001, 5)

Sample of final results:
                                   original_sentence  \
0        Matt Chorley, Mailonline Political Editor .   
1  Universities minister David Willetts said all ...   
2  Universities minister David Willetts said all ...   
3  Universities cannot be expected to march forei...   
4  Universities cannot be expected to march forei...   

                                   modified_sentence  modification_type  \
0  Matt Chorley isn't a political editor at Mailo...  polarity_negation   
1  Home Minister David Blankts said the universit...                 RS   
2  University Minister David Blankts said the uni...            antonym   
3  Senior Conservative minister David Blankts war...                 RD   
4  No senior Conservative minister David Blankets...  polarity_negation   

                               semantic_relationship  \
0  Predicted Relationship: contradiction with 99....   
1  Predicted Relationship: contradiction wit

In [25]:
# Generated-output CSVs: the combined file followed by packs 1 through 5.
output_files = ['output_sentences_all.csv'] + [
    f'output_pack{pack_number}.csv' for pack_number in range(1, 6)
]

# Empty accumulator for result records.
all_results = list()
print("Files to process:", output_files)

Files to process: ['output_sentences_all.csv', 'output_pack1.csv', 'output_pack2.csv', 'output_pack3.csv', 'output_pack4.csv', 'output_pack5.csv']


In [26]:
# Second cell: read the first file to test the processing flow.
test_file = output_files[0]
print(f"Testing with file: {test_file}")

# Read the file, skipping rows the CSV parser cannot handle.
df_output_temp = pd.read_csv(test_file, encoding='utf-8', on_bad_lines='skip')
print(f"Initial shape: {df_output_temp.shape}")

# Drop rows whose semantic_relationship value is a float.
float_mask = df_output_temp['semantic_relationship'].map(lambda value: isinstance(value, float))
float_row_count = float_mask.sum()
if float_row_count > 0:
    print(f"Found {float_row_count} rows with float values")
    df_output_temp = df_output_temp[~float_mask]
    print(f"Shape after removing float values: {df_output_temp.shape}")

# Show a sample of the cleaned data.
print("\nSample of cleaned data:")
print(df_output_temp.head())

Testing with file: output_sentences_all.csv
Initial shape: (11629, 4)
Found 37 rows with float values
Shape after removing float values: (11592, 4)

Sample of cleaned data:
                             Original Sentence  \
0  Matt Chorley, Mailonline Political Editor .   
1  Matt Chorley, Mailonline Political Editor .   
2  Matt Chorley, Mailonline Political Editor .   
3  Matt Chorley, Mailonline Political Editor .   
4  Matt Chorley, Mailonline Political Editor .   

                                   Modified Sentence  Modification Type  \
0       Corley is a political editor for Mailonline.                 RD   
1    Mailonline's persuasion editor is Matt Chorley.           hypernym   
2  Matt Chorley isn't a political editor at Mailo...  polarity_negation   
3           Mailonline political editor Matt Chorley                 RS   
4  Mailonline's political editor Matt Chorley is ...            antonym   

                               semantic_relationship  
0  Predicted Relatio

In [27]:
# Third cell: test the high-confidence contradiction filter.
relationship_text = df_output_temp['semantic_relationship']
mentions_contradiction = relationship_text.str.contains('contradiction', na=False)
is_confident = relationship_text.apply(extract_confidence) > 90
mask = mentions_contradiction & is_confident

high_confidence_temp = df_output_temp[mask]
print(f"Found {len(high_confidence_temp)} high confidence contradictions in {test_file}")
print("\nSample of high confidence contradictions:")
preview_columns = ['Original Sentence', 'semantic_relationship']
print(high_confidence_temp[preview_columns].head())

Found 3051 high confidence contradictions in output_sentences_all.csv

Sample of high confidence contradictions:
                                    Original Sentence  \
2         Matt Chorley, Mailonline Political Editor .   
8   Universities minister David Willetts said all ...   
9   Universities minister David Willetts said all ...   
10  Universities cannot be expected to march forei...   
12  Universities cannot be expected to march forei...   

                                semantic_relationship  
2   Predicted Relationship: contradiction with 99....  
8   Predicted Relationship: contradiction with 97....  
9   Predicted Relationship: contradiction with 99....  
10  Predicted Relationship: contradiction with 99....  
12  Predicted Relationship: contradiction with 98....  


In [28]:
# Fourth cell: test the result construction.
test_results = []
for _, candidate in high_confidence_temp.head().iterrows():  # only the first few rows
    sentence = candidate['Original Sentence']
    source_matches = df.loc[df['text'] == sentence]

    # Skip candidates whose original sentence is not present in df.
    if source_matches.empty:
        continue

    test_results.append(dict(
        original_sentence=sentence,
        modified_sentence=candidate['Modified Sentence'],
        modification_type=candidate['Modification Type'],
        semantic_relationship=candidate['semantic_relationship'],
        paraphrase=source_matches.iloc[0]['paraphrases'],
        source_file=test_file,
    ))

print("Test results sample:")
pd.DataFrame(test_results).head()

Test results sample:


Unnamed: 0,original_sentence,modified_sentence,modification_type,semantic_relationship,paraphrase,source_file
0,"Matt Chorley, Mailonline Political Editor .",Matt Chorley isn't a political editor at Mailo...,polarity_negation,Predicted Relationship: contradiction with 99....,The Political Editor of Mailonline is Matt Cho...,output_sentences_all.csv
1,Universities minister David Willetts said all ...,Home Minister David Blankts said the universit...,RS,Predicted Relationship: contradiction with 97....,"According to David Willetts, universities can ...",output_sentences_all.csv
2,Universities minister David Willetts said all ...,University Minister David Blankts said the uni...,antonym,Predicted Relationship: contradiction with 99....,"According to David Willetts, universities can ...",output_sentences_all.csv
3,Universities cannot be expected to march forei...,Senior Conservative minister David Blankts war...,RD,Predicted Relationship: contradiction with 99....,"David Willetts, a senior Tory minister, cautio...",output_sentences_all.csv
4,Universities cannot be expected to march forei...,No senior Conservative minister David Blankets...,polarity_negation,Predicted Relationship: contradiction with 98....,"David Willetts, a senior Tory minister, cautio...",output_sentences_all.csv


In [29]:
# Fifth cell: process all files.
# Build the text -> paraphrase lookup once, outside both loops, instead of
# scanning the whole df for every candidate row. keep='first' mirrors the
# original `.iloc[0]` choice for repeated texts; missing texts never match.
unique_texts = df.dropna(subset=['text']).drop_duplicates(subset='text', keep='first')
paraphrase_by_text = dict(zip(unique_texts['text'], unique_texts['paraphrases']))

for file in output_files:
    print(f"\nProcessing {file}...")
    try:
        # Read the file, skipping rows the CSV parser cannot handle.
        df_output_temp = pd.read_csv(file, on_bad_lines='skip', encoding='utf-8')
        initial_shape = df_output_temp.shape

        # Drop rows whose semantic_relationship value is a float.
        float_mask = df_output_temp['semantic_relationship'].apply(lambda x: isinstance(x, float))
        if float_mask.any():
            df_output_temp = df_output_temp[~float_mask]

        # Keep contradiction predictions with more than 90% confidence.
        mask = (df_output_temp['semantic_relationship'].str.contains('contradiction', na=False)) & \
               (df_output_temp['semantic_relationship'].apply(extract_confidence) > 90)

        high_confidence_temp = df_output_temp[mask]

        # Build result records; rows without a matching source text are skipped.
        for _, row in high_confidence_temp.iterrows():
            original_sentence = row['Original Sentence']

            if original_sentence in paraphrase_by_text:
                all_results.append({
                    'original_sentence': original_sentence,
                    'modified_sentence': row['Modified Sentence'],
                    'modification_type': row['Modification Type'],
                    'semantic_relationship': row['semantic_relationship'],
                    'paraphrase': paraphrase_by_text[original_sentence],
                    'source_file': file
                })

        print(f"File processed: {initial_shape} -> {len(high_confidence_temp)} high confidence samples")

    except Exception as e:
        # Best-effort per file: report the failure and continue with the next one.
        print(f"Error processing {file}: {str(e)}")


Processing output_sentences_all.csv...
File processed: (11629, 4) -> 3051 high confidence samples

Processing output_pack1.csv...
File processed: (8340, 4) -> 1696 high confidence samples

Processing output_pack2.csv...
File processed: (8220, 4) -> 1741 high confidence samples

Processing output_pack3.csv...
File processed: (8155, 4) -> 1738 high confidence samples

Processing output_pack4.csv...
File processed: (8010, 4) -> 1705 high confidence samples

Processing output_pack5.csv...
File processed: (8075, 4) -> 1732 high confidence samples


In [30]:
# Sixth cell: create the final DataFrame and look at summary statistics.
# Fully identical records are collapsed into one.
final_df = pd.DataFrame(all_results).drop_duplicates()

print("Final DataFrame information:")
print(f"Shape: {final_df.shape}")

print("\nSamples from each source file:")
per_file_counts = final_df['source_file'].value_counts()
print(per_file_counts)

print("\nSample of final results:")
print(final_df.head())

Final DataFrame information:
Shape: (11456, 6)

Samples from each source file:
source_file
output_sentences_all.csv    3001
output_pack3.csv            1709
output_pack2.csv            1703
output_pack5.csv            1699
output_pack1.csv            1676
output_pack4.csv            1668
Name: count, dtype: int64

Sample of final results:
                                   original_sentence  \
0        Matt Chorley, Mailonline Political Editor .   
1  Universities minister David Willetts said all ...   
2  Universities minister David Willetts said all ...   
3  Universities cannot be expected to march forei...   
4  Universities cannot be expected to march forei...   

                                   modified_sentence  modification_type  \
0  Matt Chorley isn't a political editor at Mailo...  polarity_negation   
1  Home Minister David Blankts said the universit...                 RS   
2  University Minister David Blankts said the uni...            antonym   
3  Senior Conservative

In [31]:
# Seventh cell: save the results; the file name encodes the frame's rows x columns.
row_count, column_count = final_df.shape
output_filename = f"chatgpt_test_samples_{row_count}x{column_count}.csv"
final_df.to_csv(output_filename, index=False)
print(f"Results saved to {output_filename}")

Results saved to chatgpt_test_samples_11456x6.csv


In [33]:
# First cell: read the PAWS source file and the generated output_paws file.
# Source sentence pairs.
paws_source = pd.read_csv('paws_x_en_test_sentences.csv')
print("Source file shape:", paws_source.shape)
print("\nSource file sample:")
print(paws_source.head())

# Generated modifications; rows the CSV parser cannot handle are skipped.
df_paws = pd.read_csv(
    'output_paws.csv',
    encoding='utf-8',
    on_bad_lines='skip',
)
print("\nOutput paws shape:", df_paws.shape)
print("\nOutput paws sample:")
print(df_paws.head())

Source file shape: (2000, 2)

Source file sample:
                                           sentence1  \
0  The exception was between late 2005 and 2009 w...   
1  The Tabaci River is a tributary of the River L...   
2  He played with the A-level Kane County Cougars...   
3  Winarsky is a member of the IEEE , Phi Beta Ka...   
4  In 1938 he became the government anthropologis...   

                                           sentence2  
0  The exception was between late 2005 and 2009 ,...  
1  The Leurda River is a tributary of the River T...  
2  He played in 1993 with the A - Level Portland ...  
3  Winarsky is a member of ACM , the IEEE , the P...  
4  In 1938 he became the Government Anthropologis...  

Output paws shape: (9995, 4)

Output paws sample:
                                   Original Sentence  \
0  The Tabaci River is a tributary of the River L...   
1  The Tabaci River is a tributary of the River L...   
2  The Tabaci River is a tributary of the River L...   
3  The T

In [34]:
# Second cell: drop rows whose semantic_relationship value is a float.
float_mask = df_paws['semantic_relationship'].map(lambda value: isinstance(value, float))
float_row_count = float_mask.sum()

if float_row_count == 0:
    print("No float values found in semantic_relationship column")
else:
    print(f"Found {float_row_count} rows with float values")
    print("\nSample of problematic rows:")
    preview_columns = ['Original Sentence', 'Modified Sentence', 'Modification Type', 'semantic_relationship']
    print(df_paws[float_mask][preview_columns].head())

    # Remove the flagged rows.
    df_paws = df_paws[~float_mask]
    print(f"\nAfter removing float values, DataFrame shape: {df_paws.shape}")

Found 2 rows with float values

Sample of problematic rows:
                                      Original Sentence  \
7703  He eventually established himself in northwest...   
7704  He eventually established himself in northwest...   

                                      Modified Sentence  \
7703  He was established in northwest Italy where he...   
7704  Eventually he was disproven in northwest Italy...   

                                      Modification Type semantic_relationship  
7703  Predicted Relationship: entailment with 82.40%...                   NaN  
7704  Predicted Relationship: contradiction with 94....                   NaN  

After removing float values, DataFrame shape: (9993, 4)


In [35]:
# Third cell: keep only contradiction predictions with more than 90% confidence.
relationship_text = df_paws['semantic_relationship']
mentions_contradiction = relationship_text.str.contains('contradiction', na=False)
is_confident = relationship_text.apply(extract_confidence) > 90
mask = mentions_contradiction & is_confident

high_confidence_contradictions = df_paws[mask]

print(f"Found {len(high_confidence_contradictions)} rows with high confidence contradictions")
print("\nSample of filtered data:")
preview_columns = ['Original Sentence', 'semantic_relationship']
print(high_confidence_contradictions[preview_columns].head())

Found 1737 rows with high confidence contradictions

Sample of filtered data:
                                    Original Sentence  \
14  Winarsky is a member of the IEEE , Phi Beta Ka...   
17  In 1938 he became the government anthropologis...   
23  Billy Billy Batson appeared in the first four ...   
26  The solar approach to this requirement is the ...   
44     Holly was musically influenced by Elton John .   

                                semantic_relationship  
14  Predicted Relationship: contradiction with 96....  
17  Predicted Relationship: contradiction with 94....  
23  Predicted Relationship: contradiction with 97....  
26  Predicted Relationship: contradiction with 97....  
44  Predicted Relationship: contradiction with 97....  


In [36]:
# Fourth cell: build the PAWS result DataFrame.
# Build the sentence1 -> sentence2 lookup once instead of scanning paws_source
# for every row. keep='first' mirrors the original `.iloc[0]` choice when a
# sentence1 occurs more than once; missing sentence1 values never match.
unique_pairs = paws_source.dropna(subset=['sentence1']).drop_duplicates(subset='sentence1', keep='first')
sentence2_by_sentence1 = dict(zip(unique_pairs['sentence1'], unique_pairs['sentence2']))

paws_results = []
for _, row in high_confidence_contradictions.iterrows():
    original_sentence = row['Original Sentence']

    # Rows whose original sentence is not a sentence1 in the source are skipped.
    if original_sentence in sentence2_by_sentence1:
        paws_results.append({
            'original_sentence': original_sentence,
            'modified_sentence': row['Modified Sentence'],
            'modification_type': row['Modification Type'],
            'semantic_relationship': row['semantic_relationship'],
            'paraphrase': sentence2_by_sentence1[original_sentence],  # sentence2 serves as the paraphrase
            'source_file': 'output_paws.csv'
        })

paws_df = pd.DataFrame(paws_results)
print(f"Final paws result shape: {paws_df.shape}")
print("\nSample of final results:")
print(paws_df.head())

Final paws result shape: (1737, 6)

Sample of final results:
                                   original_sentence  \
0  Winarsky is a member of the IEEE , Phi Beta Ka...   
1  In 1938 he became the government anthropologis...   
2  Billy Billy Batson appeared in the first four ...   
3  The solar approach to this requirement is the ...   
4     Holly was musically influenced by Elton John .   

                                   modified_sentence  modification_type  \
0  Winarsky is a non-member of IEEE Phi Beta Kapp...            antonym   
1  He became an anthropologist for the government...  polarity_negation   
2  Billy Billy Batson appeared in the first 4 iss...                 RS   
3  This requirement is an approach to using sun c...           hypernym   
4  Ellington John was an unmusical influence on H...            antonym   

                               semantic_relationship  \
0  Predicted Relationship: contradiction with 96....   
1  Predicted Relationship: contradictio

In [37]:
# Fifth cell: save the PAWS results; the file name encodes the frame's rows x columns.
row_count, column_count = paws_df.shape
paws_output_filename = f"chatgpt_test_samples_paws_{row_count}x{column_count}.csv"
paws_df.to_csv(paws_output_filename, index=False)
print(f"PAWS results saved to {paws_output_filename}")

PAWS results saved to chatgpt_test_samples_paws_1737x6.csv


In [38]:
# Statistical Analysis Cell
# 1. Read previously saved results
# NOTE(review): the file name is hard-coded, including the "11456x6" shape
# suffix that the save cell derives from final_df.shape; if the data changes,
# this will read a stale file or fail. Consider reusing `output_filename`.
chatgpt_df = pd.read_csv('chatgpt_test_samples_11456x6.csv')

# 2. Function to extract confidence and relationship type
def extract_relationship_and_confidence(text):
    """Split a prediction string into its relationship label and confidence.

    Args:
        text: A value such as
            "Predicted Relationship: contradiction with 99.93% confidence".

    Returns:
        pd.Series of two items: the relationship label (str) and the
        confidence in percent (float). ``['unknown', 0.0]`` is returned when
        the value is not a string or does not follow the expected format.
    """
    try:
        # Label sits between "Relationship: " and " with".
        rel_type = text.split('Relationship: ')[1].split(' with')[0]
        # Confidence is the number between "with " and "%".
        confidence = float(text.split('with ')[1].split('%')[0])
    except (AttributeError, TypeError, IndexError, ValueError):
        # Only parsing failures map to the fallback; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        return pd.Series(['unknown', 0.0])
    return pd.Series([rel_type, confidence])

# 3. Parse every prediction string into two new columns
parsed_predictions = chatgpt_df['semantic_relationship'].apply(extract_relationship_and_confidence)
chatgpt_df[['relationship_type', 'confidence']] = parsed_predictions

# 4. Keep the samples whose confidence exceeds 90%
high_conf_df = chatgpt_df[chatgpt_df['confidence'] > 90]
high_conf_total = len(high_conf_df)


def _print_share_table(heading, counts, total):
    """Print a heading, then one "label: count (share%)" line per entry of counts."""
    print(heading)
    for label, count in counts.items():
        print(f"{label}: {count} ({count/total*100:.2f}%)")


# 5. Statistical analysis
print("=== Data Funnel Analysis ===")
print(f"Total original samples: {len(chatgpt_df)}")
print(f"High confidence samples (>90%): {high_conf_total} ({high_conf_total/len(chatgpt_df)*100:.2f}%)")

rel_type_stats = high_conf_df['relationship_type'].value_counts()
_print_share_table(
    "\n=== Relationship Type Distribution for High Confidence Samples ===",
    rel_type_stats,
    high_conf_total,
)

mod_type_stats = high_conf_df['modification_type'].value_counts()
_print_share_table(
    "\n=== Modification Type Distribution for High Confidence Samples ===",
    mod_type_stats,
    high_conf_total,
)

print("\n=== Cross Analysis of Relationship Types and Modification Types ===")
# Row-normalised cross table, scaled to percentages.
cross_stats = 100 * pd.crosstab(
    high_conf_df['relationship_type'],
    high_conf_df['modification_type'],
    normalize='index',
)
print("\nPercentage of modification types for each relationship type (%):")
print(cross_stats)

# 6. Statistics by data source
source_stats = high_conf_df['source_file'].value_counts()
_print_share_table(
    "\n=== Sample Distribution by Data Source ===",
    source_stats,
    high_conf_total,
)

=== 数据漏斗分析 ===
原始样本总数: 11456
高置信度样本数 (>90%): 11456 (100.00%)

=== 高置信度样本的关系类型分布 ===
contradiction: 11456 (100.00%)

=== 高置信度样本的修改类型分布 ===
polarity_negation: 4182 (36.50%)
antonym: 3478 (30.36%)
RS: 1757 (15.34%)
RD: 1100 (9.60%)
hypernym: 939 (8.20%)

=== 关系类型与修改类型的交叉分析 ===

每种关系类型中各修改类型的占比（%）：
modification_type        RD         RS    antonym  hypernym  polarity_negation
relationship_type                                                             
contradiction      9.601955  15.336941  30.359637  8.196578          36.504888

=== 各数据源的样本分布 ===
output_sentences_all.csv: 3001 (26.20%)
output_pack3.csv: 1709 (14.92%)
output_pack2.csv: 1703 (14.87%)
output_pack5.csv: 1699 (14.83%)
output_pack1.csv: 1676 (14.63%)
output_pack4.csv: 1668 (14.56%)


In [41]:
# Calculate distribution of all relationship types
# 1. Read results from all output files.
# Collect the frames and concatenate once: concatenating inside the loop
# re-copies the accumulated rows on every iteration. ignore_index gives the
# combined frame unique row labels instead of repeating each file's 0..n-1.
file_frames = [
    pd.read_csv(file, on_bad_lines='skip', encoding='utf-8')
    for file in output_files
]
all_files_df = pd.concat(file_frames, ignore_index=True)

# 2. Clean float values
float_mask = all_files_df['semantic_relationship'].apply(lambda x: isinstance(x, float))
# Take an explicit copy so the column assignment below writes to an
# independent frame rather than to a filtered slice.
all_files_df = all_files_df[~float_mask].copy()

# 3. Extract relationship and confidence
all_files_df[['relationship_type', 'confidence']] = all_files_df['semantic_relationship'].apply(extract_relationship_and_confidence)

# 4. Group statistics by confidence; each range is (exclusive lower, inclusive upper]
confidence_ranges = [(0, 50), (50, 70), (70, 90), (90, 100)]

print("=== Overall Data Analysis ===")
print(f"Total samples: {len(all_files_df)}")

for min_conf, max_conf in confidence_ranges:
    conf_mask = (all_files_df['confidence'] > min_conf) & (all_files_df['confidence'] <= max_conf)
    conf_df = all_files_df[conf_mask]

    print(f"\n=== Distribution for confidence {min_conf}-{max_conf}% ===")
    print(f"Sample count: {len(conf_df)} ({len(conf_df)/len(all_files_df)*100:.2f}%)")

    rel_type_stats = conf_df['relationship_type'].value_counts()
    for rel_type, count in rel_type_stats.items():
        print(f"{rel_type}: {count} ({count/len(conf_df)*100:.2f}%)")

    # For high confidence samples, show additional modification type distribution
    if min_conf == 90:
        print("\nModification type distribution for high confidence samples:")
        mod_type_stats = conf_df['Modification Type'].value_counts()
        for mod_type, count in mod_type_stats.items():
            print(f"{mod_type}: {count} ({count/len(conf_df)*100:.2f}%)")

=== 总体数据分析 ===
总样本数: 43963

=== 置信度 0-50% 的分布 ===
样本数: 780 (1.77%)
contradiction: 279 (35.77%)
entailment: 264 (33.85%)
neutral: 237 (30.38%)

=== 置信度 50-70% 的分布 ===
样本数: 3548 (8.07%)
entailment: 1350 (38.05%)
contradiction: 1183 (33.34%)
neutral: 1015 (28.61%)

=== 置信度 70-90% 的分布 ===
样本数: 5355 (12.18%)
entailment: 2164 (40.41%)
contradiction: 1916 (35.78%)
neutral: 1275 (23.81%)

=== 置信度 90-100% 的分布 ===
样本数: 34280 (77.97%)
entailment: 20015 (58.39%)
contradiction: 11663 (34.02%)
neutral: 2602 (7.59%)

高置信度样本的修改类型分布：
polarity_negation: 7365 (21.48%)
RD: 7216 (21.05%)
antonym: 6818 (19.89%)
hypernym: 6684 (19.50%)
RS: 6197 (18.08%)
