In [114]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Download punkt_tab if needed.
try:
    nltk.data.find('tokenizers/punkt_tab/english/pickle')
except LookupError:
    nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/noxiusk/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [115]:
import pandas as pd
from collections import defaultdict
import csv

def escape_and_flatten_text(text):
    if not isinstance(text, str):
        return text
    text = text.replace('"', '""')             # Escape double quotes
    text = text.replace('\n', ' ').replace('\r', ' ')  # Flatten newlines
    return f'"{text}"'                          # Wrap in quotes

def write_custom_csv(df, filepath, quote_columns):
    with open(filepath, 'w', encoding='utf-8', newline='') as f:
        # Header
        f.write(','.join(df.columns) + '\n')
        
        for _, row in df.iterrows():
            row_data = []
            for col in df.columns:
                val = row[col]
                if col in quote_columns:
                    val = escape_and_flatten_text(val)
                else:
                    val = str(val)
                row_data.append(val)
            f.write(','.join(row_data) + '\n')


In [116]:
def process_train_labeled_file(file_path, labeled=True, name='processed_posts.csv'):
    # Read the Excel file
    df = pd.read_excel(file_path)
    # Store [title] and [suicide risk label] for each row as a dictionary
    user_data = []
    new_user_data = []

    idx = None
    text = None
    label = None
    for _, row in df.iterrows():
        idx = row['Unnamed: 0'] if not pd.isna(row['Unnamed: 0']) else idx
        text = row['title'] 
        if labeled:
            label = row['suicide risk label'] if not pd.isna(row['Unnamed: 0']) else label
            user_data.append({'idx': int(idx), 'text': text, 'label': label})
        else:
            user_data.append({'idx': int(idx), 'text': text})


    # Group by idx and concatenate texts
    grouped_texts = defaultdict(list)
    labels = {}

    for entry in user_data:
        i = entry['idx']
        grouped_texts[i].append(entry['text'])
        if labeled == True:
            labels[i] = entry['label']

    for i in sorted(grouped_texts.keys()):
        concatenated_text = " ".join(grouped_texts[i])
        if labeled == True:
            new_user_data.append({'index': i, 'post': concatenated_text, 'post_risk': labels[i]})
        else:
            new_user_data.append({'index': i, 'post': concatenated_text})

    output_df = pd.DataFrame(new_user_data)
    #output_df.to_csv(name, index=False)
    write_custom_csv(output_df, name, quote_columns=['post'])

    print("CSV file 'processed_posts.csv' has been creaeted.")

In [117]:
file_path = "Dataset(post_level).xlsx"

process_train_labeled_file(file_path, labeled=True, name="posts_with_labels.csv")

CSV file 'processed_posts.csv' has been creaeted.


In [118]:
def process_test_labeled_file(file_path, labeled=True, name='posts_with_labels.csv'):
    # Read the Excel file
    df = pd.read_excel(file_path)
    # Store [title] and [suicide risk label] for each row as a dictionary
    user_data = []
    new_user_data = []

    idx = None
    text = None
    label = None
    for _, row in df.iterrows():
        idx = row['Unnamed: 0']
        text = row['post']
        if labeled:
            label = row['post risk']
            user_data.append({'idx': int(idx), 'post': text, 'post_risk': label})
        else:
            user_data.append({'idx': int(idx), 'post': text})

    output_df = pd.DataFrame(user_data)
    #output_df.to_csv(name, index=False, quoting=1)
    write_custom_csv(output_df, name, quote_columns=['post'])
    

    print("CSV file 'processed_posts.csv' has been created.")

In [119]:
file_path = "test_100_label_competition.xlsx"
process_test_labeled_file(file_path, labeled=True, name="posts_with_labels_test.csv")

CSV file 'processed_posts.csv' has been created.


In [120]:
def process_train_unlabeled_file(file_path, labeled=True, name='posts_with_labels.csv'):
    # Read the Excel file
    df = pd.read_excel(file_path)
    # Store [title] and [suicide risk label] for each row as a dictionary
    user_data = []
    new_user_data = []

    idx = None
    text = None
    label = None
    for _, row in df.iterrows():
        idx = row['index']
        text = row['post']
        if labeled:
            label = row['post risk']
            user_data.append({'idx': int(idx), 'post': text, 'post_risk': label})
        else:
            user_data.append({'idx': int(idx), 'post': text})

    output_df = pd.DataFrame(user_data)
    #output_df.to_csv(name, index=False, quoting=1)
    write_custom_csv(output_df, name, quote_columns=['post'])
    

    print("CSV file 'processed_posts.csv' has been created.")

In [121]:
file_path = "1500_competition.xlsx"
process_train_unlabeled_file(file_path, labeled=False, name="posts_without_labels_train.csv")

CSV file 'processed_posts.csv' has been created.
