In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


# Patterns are compiled once at import time because clean_text is applied
# to every row of the dataset.
_REFERENTIAL_TAG = re.compile(r'<referential>([^<]+)</referential>')
_ANY_TAG = re.compile(r'<[^>]+>')
_REPEATED_QUOTES = re.compile(r'"{2,}')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text):
    """Strip markup tags from a sentence and tidy its quotes and whitespace.

    The steps are applied in this order:

    1. ``<referential>...</referential>`` wrappers are removed while the
       enclosed text is kept.
    2. Any remaining ``<...>`` tag is deleted.
    3. A run of two or more double quotes collapses to a single ``"``.
    4. Each run of whitespace collapses to one space, and leading/trailing
       whitespace is trimmed.

    Args:
        text: The raw sentence. ``None`` and float NaN (what pandas produces
            for an empty cell) are treated as an empty sentence; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned sentence as a ``str``.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids depending on pandas/numpy inside the helper.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = _REFERENTIAL_TAG.sub(r'\1', text)
    text = _ANY_TAG.sub('', text)
    text = _REPEATED_QUOTES.sub('"', text)
    text = _WHITESPACE_RUN.sub(' ', text)
    return text.strip()

# Load the tab-separated dataset, clean every sentence, and write the result
# out as CSV.
INPUT_PATH = 'test.tsv'
OUTPUT_PATH = 'cleaned_dataset_test.csv'
COLUMNS = ['id', 'sentence_id', 'sentence', 'label']
DEFAULT_LABEL = 'INNOCUOUS'  # Label assigned to rows that do not carry one

try:
    df = pd.read_csv(INPUT_PATH, sep='\t', header=None, names=COLUMNS)
except Exception as e:
    print(f"Error reading TSV file: {e}")
    print("Trying alternative parsing method...")

    # Fallback: split each line on tabs by hand, reading the same input file.
    # Lines with fewer than three fields are skipped.
    rows = []
    with open(INPUT_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue
            label = parts[3] if len(parts) > 3 else DEFAULT_LABEL
            rows.append(dict(zip(COLUMNS, [*parts[:3], label])))

    # Passing the column names keeps the frame well-formed (and the
    # 'sentence' lookup below valid) even when no line could be parsed.
    df = pd.DataFrame(rows, columns=COLUMNS)

# Because read_csv is given four column names, a file with only three fields
# per row still produces a four-column frame whose 'label' is NaN; a check on
# the column count can never detect that case, so fill the gaps instead.
df['label'] = df['label'].astype(object).fillna(DEFAULT_LABEL)

df['sentence'] = df['sentence'].apply(clean_text)

df.to_csv(OUTPUT_PATH, index=False)
print(f"\nCleaned dataset saved to '{OUTPUT_PATH}'")




Cleaned dataset saved to 'cleaned_dataset.csv'
