In [1]:
import pandas as pd
import re

### **1 - Reading & Exploring Data :**

In [54]:
# Load the three MIND splits (train / valid / test) from their CSV files.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/Initial-Data/MIND_train.csv",
        "Data/Initial-Data/MIND_valid.csv",
        "Data/Initial-Data/MIND_test.csv",
    )
)

In [55]:
# List the column names of the test split.
test_df.columns

Index(['impression_id', 'history', 'candidate', 'label', 'history_news_id',
       'history_title', 'history_category', 'history_subvert',
       'history_abstract', 'candidate_news_id', 'candidate_title',
       'candidate_category', 'candidate_subvert', 'candidate_abstract'],
      dtype='object')

In [56]:
# Number of newline-separated entries in each row's 'candidate' text.
# A missing value is read by pandas as float NaN, which is truthy, so a plain
# truthiness check would still call .split on it; test for str explicitly so
# both NaN and the empty string count as 0.
test_df['total'] = test_df['candidate'].apply(
    lambda x: len(x.split('\n')) if isinstance(x, str) and x else 0
)

In [57]:
# Smallest value of the 'total' column across the test split.
test_df['total'].min()

np.int64(10)

In [58]:
# Print the raw 'history' text of the training row with index label 0
# (print is used so the embedded newlines render as separate lines).
print(train_df['history'][0])

H1: Panera Bread worker fired after TikTok exposed frozen mac and cheese Category=>foodanddrink;foodnews
H2: A Texas mom is going to prison after putting her son through unnecessary medical procedures Category=>news;newscrime
H3: Emily Ratajkowski Is Being Sued for $150,000 Over an Instagram Photo Category=>movies;movies-celebrity
H4: Joe Biden reportedly denied Communion at a South Carolina church because of his stance on abortion Category=>news;newspolitics
H5: Mississippi woman found after being missing for days by writing S.O.S. with rocks Category=>news;newsus


In [59]:
# (rows, columns) of the train, valid and test splits, shown as one tuple.
train_df.shape, valid_df.shape, test_df.shape

((180, 16), (20, 16), (400, 15))

In [60]:
# Count missing values per column in the test split.
test_df.isna().sum()

impression_id         0
history               0
candidate             0
label                 0
history_news_id       0
history_title         0
history_category      0
history_subvert       0
history_abstract      1
candidate_news_id     0
candidate_title       0
candidate_category    0
candidate_subvert     0
candidate_abstract    0
total                 0
dtype: int64

In [61]:
# Column dtypes of the training split.
train_df.dtypes

impression_id          int64
history               object
candidate             object
label                 object
history_news_id       object
history_title         object
history_category      object
history_subvert       object
history_abstract      object
candidate_news_id     object
candidate_title       object
candidate_category    object
candidate_subvert     object
candidate_abstract    object
Description           object
COT                   object
dtype: object

In [62]:
# List the test split's columns again (now including the derived 'total').
test_df.columns

Index(['impression_id', 'history', 'candidate', 'label', 'history_news_id',
       'history_title', 'history_category', 'history_subvert',
       'history_abstract', 'candidate_news_id', 'candidate_title',
       'candidate_category', 'candidate_subvert', 'candidate_abstract',
       'total'],
      dtype='object')

### **2 - Getting Needed Columns :**

In [63]:
# Keep only the columns used by the rest of the notebook.
# .copy() makes each result an independent frame: later cells assign columns
# on these frames, and doing that on a bare df[[...]] selection is the
# pattern pandas reports with SettingWithCopyWarning.
train_df = train_df[['history',"candidate","label","Description","COT"]].copy()
valid_df = valid_df[['history',"candidate","label","Description","COT"]].copy()
test_df = test_df[['history',"candidate","label"]].copy()

### **3 - Preprocessing Data :**

In [64]:
def reformat_text(text):
    """Rewrite 'Category=>cat;subcat' entries of a multi-line text as prose.

    Each newline-separated line of the form
    ``H<n>: <headline> Category=><category>;<subcategory>`` (or ``C<n>: ...``)
    becomes ``<prefix> <headline> which belongs to the category of <category>
    and subcategory of <subcategory>``. Lines that do not have this form are
    kept unchanged.

    Args:
        text: String holding one entry per line.

    Returns:
        str: The text with every matching line rewritten, lines joined by
        ``"\\n"`` in their original order.
    """
    entry_re = re.compile(r"^(H\d+:|C\d+:)\s*(.*?)\s*Category=>(.*?);(.*)$")

    def rewrite(entry):
        # Convert one line, or hand it back untouched when it does not match.
        found = entry_re.match(entry)
        if found is None:
            return entry
        tag, title, cat, subcat = found.groups()
        return f"{tag} {title} which belongs to the category of {cat} and subcategory of {subcat}"

    return "\n".join(rewrite(entry) for entry in text.split("\n"))

In [65]:
def update(x):
    """Remove the embedded ``news`` token from 'subcategory of <name>' phrases.

    For every ``subcategory of <name>`` in ``x`` whose ``<name>`` (a run of
    word characters) contains ``news``, the first ``news`` is dropped, e.g.
    ``newscrime`` -> ``crime`` and ``foodnews`` -> ``food``. A name that is
    exactly ``news`` is therefore left empty. Other text is returned as is.

    Args:
        x: String to clean.

    Returns:
        str: ``x`` with the substitutions applied.
    """
    subcategory_with_news = re.compile(r'\bsubcategory of (\w*?)news(\w*)\b')
    cleaned = subcategory_with_news.sub(r'subcategory of \1\2', x)
    return cleaned

In [66]:
# Rewrite the "Category=>..." suffixes into prose in both text columns of
# every split.
for split_df in (train_df, valid_df, test_df):
    for text_col in ('history', 'candidate'):
        split_df[text_col] = split_df[text_col].apply(reformat_text)

In [67]:
# Strip the embedded "news" token from the subcategory names in both text
# columns of every split.
for split_df in (train_df, valid_df, test_df):
    for text_col in ('history', 'candidate'):
        split_df[text_col] = split_df[text_col].apply(update)

In [None]:
def process_cot(row):
    """Clean one row's chain-of-thought text and extract its final labels.

    Args:
        row: Mapping-like object (for example a DataFrame row) with a
            ``'COT'`` entry.

    Returns:
        tuple[str, str]: ``(cot, targets)``. ``cot`` is the cleaned text and
        ``targets`` is a ``', '``-joined string of the ``C<n>`` tokens found
        after the last ``:`` or ``=`` in that text (empty when there are
        none). A non-string ``COT`` value (such as NaN for a missing cell)
        yields ``('', '')``.
    """
    raw = row['COT']
    if not isinstance(raw, str):
        return '', ''

    # Remove a leading "result =" prefix as a whole. str.lstrip('result =')
    # treats its argument as a set of characters, so it would also eat the
    # beginning of the text itself whenever that starts with any of those
    # letters (e.g. "result = user reads" -> "ads").
    cot = re.sub(r'^\s*result\s*=\s*', '', raw)

    # Turn literal backslash-n sequences into real line breaks.
    cot = cot.replace('\\n', '\n ')

    # Trim surrounding spaces and double quotes.
    cot = cot.strip(' "')

    # The answer is taken to be whatever follows the last ':' or '='.
    last_part = re.split(r'[:=]', cot)[-1].strip()
    labels = re.findall(r'\bC\d+\b', last_part)

    return cot, ', '.join(labels)

In [None]:
# Replace 'COT' with its cleaned text and add the extracted 'Targets' column
# for the two splits that carry a COT column.
for split_df in (train_df, valid_df):
    cleaned_cot, target_labels = zip(*split_df.apply(process_cot, axis=1))
    split_df['COT'] = cleaned_cot
    split_df['Targets'] = target_labels

### **4 - Saving Results :**

In [None]:
# Write the preprocessed splits to CSV without the index column.
# NOTE(review): the test split is written to Data/ while train and valid go to
# Data/MIND-Preprocessed/ — confirm this is intentional.
for split_df, destination in (
    (train_df, "Data/MIND-Preprocessed/train.csv"),
    (valid_df, "Data/MIND-Preprocessed/valid.csv"),
    (test_df, "Data/test.csv"),
):
    split_df.to_csv(destination, index=False)