In [67]:
import nltk
import pandas as pd
from tqdm import tqdm

# Fetch the Punkt tokenizer data that nltk.sent_tokenize / nltk.word_tokenize rely on.
# Recent NLTK releases (3.9+) look the models up under "punkt_tab" instead of the
# pickled "punkt" package, so both are requested to keep the notebook runnable on a
# fresh environment regardless of the installed NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aryanahadinia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [68]:
# Load the spreadsheet, keep only rows without any missing value, renumber the
# index from 0, and discard the "Unnamed: 0" column read from the file.
data_df = (
    pd.read_excel("../data/dataset.xlsx")
    .dropna()
    .reset_index(drop=True)
    .drop(columns=["Unnamed: 0"])
)
print(data_df.shape)
data_df.head()

(1983, 50)


Unnamed: 0,participant_id,collective,contrast,goal,goals2,list,metaphor,moral,question,story,...,final_text,overall_sentiment_all,positive_sentiment_all,negative_sentiment_all,neutra_sentiment_all,mixed_sentiment_all,targets,text_length_all,prolific_score,prolific_indicator_all
0,5e1cf0eb65b6d3071f489de9,0.35,1.07,0.43,0.32,6.96,0.94,2.36,0.01,0.46,...,Hello everyone. Thank you. Taking the time to ...,POSITIVE,0.9569,0.0007,0.0417,0.0007,HIGH,771.0,100.0,2
1,55d06fd334e9060012e5781c,0.3,0.67,0.3,0.2,2.83,0.71,0.22,0.01,0.6,...,"Hi, I am Kathy. I'd love to be considered for ...",NEUTRAL,0.1587,0.0055,0.835,0.0009,MED,424.0,99.0,2
2,615586b009f801c3f2d4af8d,0.18,0.74,0.16,0.26,3.4,1.1,1.09,0.01,0.37,...,uh yeah I I think I would be the best candidat...,POSITIVE,0.8051,0.0164,0.1747,0.0039,MED,449.0,100.0,2
3,5847e60f73170700013697c6,0.14,2.14,0.27,0.12,3.05,0.49,0.46,0.0,1.09,...,Hello. Um I've of course a fair amount of expe...,POSITIVE,0.5761,0.1185,0.2484,0.057,HIGH,611.0,100.0,2
4,6086a11397234e7f83e4e793,0.9,4.76,0.86,0.22,7.92,0.56,2.95,0.01,0.19,...,"Okay, so I would like to thank you for giving ...",POSITIVE,0.8515,0.0016,0.1456,0.0013,HIGH,611.0,100.0,2


In [69]:
def tokenize_text(text):
    """Return the word-level tokens of ``text`` as produced by ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens


def create_chunks(text, min_tokens=300):
    """Group the sentences of ``text`` into chunks of at least ``min_tokens`` tokens.

    Sentences are taken in order from ``nltk.tokenize.sent_tokenize`` and
    accumulated. As soon as the running token count (measured with
    ``tokenize_text``) reaches ``min_tokens``, the accumulated sentences are
    joined with single spaces and emitted as one chunk. Sentences are never
    split across chunks.

    Args:
        text: The text to split.
        min_tokens: Token count at which the chunk being built is closed.

    Returns:
        A list of chunk strings in document order. The last chunk holds
        whatever sentences were left over and may therefore contain fewer
        than ``min_tokens`` tokens. The list is empty when no sentences are
        found.
    """
    chunks = []
    pending_sentences = []
    pending_tokens = 0

    for sentence in nltk.tokenize.sent_tokenize(text):
        pending_sentences.append(sentence)
        pending_tokens += len(tokenize_text(sentence))

        if pending_tokens < min_tokens:
            continue

        # Threshold reached: close this chunk and start a fresh one.
        chunks.append(" ".join(pending_sentences))
        pending_sentences, pending_tokens = [], 0

    # Flush the leftover sentences as a final (possibly short) chunk.
    if pending_sentences:
        chunks.append(" ".join(pending_sentences))

    return chunks

In [70]:
# Build one [participant_id, list_of_chunks] entry per row of data_df,
# chunking each participant's final_text with create_chunks.
participant_texts = zip(data_df["participant_id"], data_df["final_text"])
all_chunks = [
    [participant_id, create_chunks(text)]
    for participant_id, text in tqdm(participant_texts, total=len(data_df))
]

100%|██████████| 1983/1983 [00:03<00:00, 514.82it/s]


In [71]:
# Flatten the per-participant chunk lists into one record per chunk;
# chunk_id is the chunk's position within its participant's list.
chunk_records = [
    (participant_id, chunk_id, chunk)
    for participant_id, chunks in all_chunks
    for chunk_id, chunk in enumerate(chunks)
]

# Split the records into the three parallel column lists.
all_participants_ids = [record[0] for record in chunk_records]
all_chunks_ids = [record[1] for record in chunk_records]
all_chunks_texts = [record[2] for record in chunk_records]

In [72]:
# One row per chunk, built from the three parallel lists.
chunk_columns = {
    "participant_id": all_participants_ids,
    "chunk_id": all_chunks_ids,
    "chunk_text": all_chunks_texts,
}
chunked_data_df = pd.DataFrame(chunk_columns)
chunked_data_df.head()

Unnamed: 0,participant_id,chunk_id,chunk_text
0,5e1cf0eb65b6d3071f489de9,0,Hello everyone. Thank you. Taking the time to ...
1,5e1cf0eb65b6d3071f489de9,1,"Again, as I talked about those great organizat..."
2,5e1cf0eb65b6d3071f489de9,2,So I'm pretty experienced at the moment. I'm c...
3,55d06fd334e9060012e5781c,0,"Hi, I am Kathy. I'd love to be considered for ..."
4,55d06fd334e9060012e5781c,1,"It's been about five days, but I am reminding ..."


In [73]:
# Attach every participant-level column to each of that participant's chunks.
# validate="one_to_many" makes pandas raise MergeError if a participant_id occurs
# more than once in data_df; without it such duplicates would silently pair each
# duplicated row with every chunk of that id, inflating the row count and mixing
# one row's metadata with another row's text.
final_df = pd.merge(
    data_df,
    chunked_data_df,
    on="participant_id",
    how="inner",
    validate="one_to_many",
)
# The full transcript is superseded by the per-chunk text column.
final_df = final_df.drop(columns=["final_text"])
final_df.shape

(5362, 51)

In [74]:
# Keep only chunks that contain at least 50 whitespace-separated words.
chunk_word_counts = final_df["chunk_text"].str.split().str.len()
final_df = final_df[chunk_word_counts >= 50]
final_df.shape

(4988, 51)

In [75]:
# Persist the filtered, chunk-level dataset.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets an
# unnamed first column holding the (non-contiguous, post-filter) index values —
# confirm downstream readers expect that before switching to index=False.
chunked_dataset_path = "../data/chunked_dataset.csv"
final_df.to_csv(chunked_dataset_path)