In [141]:
%run "../read_and_write_docs.py"

In [142]:
%run "../preprocessing.py"

usage: preprocessing.py [-h] --file_path FILE_PATH --output_file_path
                        OUTPUT_FILE_PATH [--num_words NUM_WORDS]
preprocessing.py: error: the following arguments are required: --file_path, --output_file_path


SystemExit: 2

In [143]:
%run "../combine_sentences.py"

usage: combine_sentences.py [-h] --file_path FILE_PATH --output_file_path
                            OUTPUT_FILE_PATH
                            [--length_threshold LENGTH_THRESHOLD]
                            [--threshold_type {char,word}]
combine_sentences.py: error: the following arguments are required: --file_path, --output_file_path


SystemExit: 2

## Read Files

First we read the new temporary metadata file and the raw file. The aim is to filter the (doc_id, author_id) combinations listed in the metadata out of the raw file.

In [144]:
import pandas as pd
import numpy as np

In [145]:
# Directory holding the blogger "raw_error_fix" files; every input and output
# path in this notebook is built by appending a file name to it.
blogger_loc = "../../../../datasets/blogger/raw_error_fix/"

# Input files: the raw documents and the temporary metadata table.
raw_loc = blogger_loc + "raw.jsonl"
temp_metadata_loc = blogger_loc + "metadata_temp.jsonl"

In [146]:
# Load the temporary metadata table. Later cells index it as a DataFrame with
# sample_id, doc_id_y, author_id_y and topic_y columns.
# read_jsonl_file is not defined in this notebook - presumably it comes from the
# %run of read_and_write_docs.py in the first cell (confirm).
temp_meta = read_jsonl_file(temp_metadata_loc)

In [147]:
# Load the raw documents. Later cells index it as a DataFrame with id,
# author_id, topic and word_count columns.
# read_jsonl_file is not defined in this notebook - presumably it comes from the
# %run of read_and_write_docs.py in the first cell (confirm).
raw = read_jsonl_file(raw_loc)

## Filtering

First remove the (doc_id, author_id) combinations that appear in the metadata, then keep only documents with a word count of at least 1000.

In [148]:
# Anti-join: keep the raw documents whose (doc_id, author_id) pair does NOT
# appear in the temporary metadata, restricted to documents of >= 1000 words.
key_cols = ['doc_id', 'author_id']

# (doc_id_y, author_id_y) pairs from the metadata, renamed to raw's column names.
unique_combinations = temp_meta[['doc_id_y', 'author_id_y']].rename(
    columns={'doc_id_y': 'doc_id', 'author_id_y': 'author_id'}
)

# raw calls its document key 'id'; rename it on the frame itself so the merge
# keys line up (a no-op when the column is already named 'doc_id').
raw.rename(columns={'id': 'doc_id'}, inplace=True)

# Left merge with an indicator column: '_merge' is 'left_only' for raw rows
# that have no matching pair in the metadata.
merged_df = pd.merge(raw, unique_combinations, how='left', on=key_cols, indicator=True)

not_in_metadata = merged_df['_merge'].eq('left_only')
long_enough = merged_df['word_count'].ge(1000)

filtered_raw = merged_df.loc[not_in_metadata & long_enough].drop(columns=['_merge'])

## Create Metadata Table - Same Author

First we build the same-author half of the metadata. We restrict the temporary metadata to authors who still have at least one other document in the filtered raw data, and then sample a single same-author document for each selected row.

In [149]:
# Authors of the metadata ("_y") documents.
author_id_y_list = temp_meta['author_id_y'].unique()

# Filtered raw documents written by one of those authors, and the authors
# that actually have such a document available.
has_metadata_author = filtered_raw['author_id'].isin(author_id_y_list)
matching_raw = filtered_raw.loc[has_metadata_author]
matching_author_ids = matching_raw['author_id'].unique()

# Half of the metadata rows (rounded) are to become same-author pairs.
n_same = round(len(temp_meta) / 2)

# Metadata rows whose author has a usable document, capped at the first n_same.
can_pair_same_author = temp_meta['author_id_y'].isin(matching_author_ids)
same_author_temp_meta = temp_meta.loc[can_pair_same_author].head(n_same)

In [150]:
# Pair every selected metadata row with one randomly drawn document written by
# the same author. sample() is called without random_state, so the draws follow
# NumPy's global RNG, which is seeded here for reproducibility.
np.random.seed(42)

output_columns = ['sample_id', 'doc_id_x', 'doc_id_y', 'author_id_x', 'author_id_y', 'topic_x', 'topic_y']

# (metadata column to fill, raw column to read), in the order they are written.
known_fields = (('doc_id_x', 'doc_id'), ('author_id_x', 'author_id'), ('topic_x', 'topic'))

updated_rows = []
for _, meta_row in same_author_temp_meta.iterrows():
    # All remaining raw documents by this row's author.
    by_same_author = filtered_raw['author_id'] == meta_row['author_id_y']
    drawn = filtered_raw.loc[by_same_author].sample(n=1)

    # Copy the metadata row and fill in the "_x" side from the drawn document.
    paired_row = meta_row.copy()
    for target, source in known_fields:
        paired_row[target] = drawn[source].values[0]
    updated_rows.append(paired_row)

# One output row per metadata row, restricted to the metadata columns.
updated_same_author_temp_meta = pd.DataFrame(updated_rows)[output_columns]

## Create Metadata Table - Diff Author

Now we create the different-author, same-topic table: we remove every author already present in the metadata from the raw data, then sample one row for each remaining metadata entry, making sure the topic matches each time.

In [151]:
# Metadata rows that were not selected for the same-author half.
in_same_author_half = temp_meta['sample_id'].isin(same_author_temp_meta['sample_id'])
diff_author_temp_meta = temp_meta.loc[~in_same_author_half]

# Raw documents whose author does not appear anywhere in the metadata.
author_in_metadata = filtered_raw['author_id'].isin(temp_meta['author_id_y'])
temp_filtered_raw = filtered_raw.loc[~author_in_metadata]

In [152]:
# Pair every remaining metadata row with one randomly drawn document on the
# same topic by an author that does not appear in the metadata. sample() is
# called without random_state, so the draws follow NumPy's global RNG, which is
# seeded here for reproducibility.
np.random.seed(42)

# Initialize an empty list to store the updated rows
updated_rows = []

# Work on a copy so drawn documents can be removed without touching temp_filtered_raw
temp_filtered_raw_copy = temp_filtered_raw.copy()

# Loop through each row in diff_author_temp_meta
for _, row in diff_author_temp_meta.iterrows():
    sample_id = row['sample_id']
    topic_y = row['topic_y']

    # Candidate documents: same topic, not drawn by an earlier iteration
    filtered_df = temp_filtered_raw_copy[temp_filtered_raw_copy['topic'] == topic_y]

    # Fail with an actionable message instead of the generic error sample()
    # raises on an empty frame when a topic has no (remaining) candidates.
    if filtered_df.empty:
        raise ValueError(
            f"No unused different-author document left for topic {topic_y!r} "
            f"(sample_id={sample_id})"
        )

    # Randomly sample one row from the candidates
    sampled_row = filtered_df.sample(n=1)

    # Extract the relevant information
    doc_id_x = sampled_row['doc_id'].values[0]
    author_id_x = sampled_row['author_id'].values[0]
    topic_x = sampled_row['topic'].values[0]

    # Copy the metadata row and fill in the "_x" side from the drawn document
    updated_row = row.copy()
    updated_row['doc_id_x'] = doc_id_x
    updated_row['author_id_x'] = author_id_x
    updated_row['topic_x'] = topic_x

    # Append the updated row to the list
    updated_rows.append(updated_row)

    # Remove the drawn document so it cannot be selected again
    temp_filtered_raw_copy = temp_filtered_raw_copy.drop(sampled_row.index)

# Convert the list of updated rows to a DataFrame
updated_diff_author_temp_meta = pd.DataFrame(updated_rows)

updated_diff_author_temp_meta = updated_diff_author_temp_meta[['sample_id', 'doc_id_x', 'doc_id_y', 'author_id_x', 'author_id_y', 'topic_x', 'topic_y']]

In [153]:
# Stack the different-author and same-author tables, flag whether each pair
# shares an author / a topic, and order the result by sample_id.
final_metadata = (
    pd.concat(
        [updated_diff_author_temp_meta, updated_same_author_temp_meta],
        ignore_index=True,
    )
    .assign(
        same_author=lambda pairs: pairs['author_id_x'] == pairs['author_id_y'],
        same_topic=lambda pairs: pairs['topic_x'] == pairs['topic_y'],
    )
    .sort_values(by='sample_id')
    .reset_index(drop=True)
)

final_metadata

Unnamed: 0,sample_id,doc_id_x,doc_id_y,author_id_x,author_id_y,topic_x,topic_y,same_author,same_topic
0,1,618134,16188,1516660,3952922,Student,Student,False,True
1,2,17857,17850,3321827,3321827,Technology,Technology,True,True
2,3,18512,18516,4160528,4160528,Student,Student,True,True
3,4,184544,26252,2258198,4169442,Engineering,Engineering,False,True
4,5,37301,37303,3084647,3084647,Student,Student,True,True
...,...,...,...,...,...,...,...,...,...
95,96,202101,623524,3405693,2927895,Technology,Technology,False,True
96,97,161192,669029,1278138,3093335,Education,Education,False,True
97,98,45629,671397,2002478,3419072,Science,Science,False,True
98,99,590738,676573,3678120,2876684,Technology,Technology,False,True


In [154]:
# Write the final metadata table to metadata.jsonl under blogger_loc.
# save_as_jsonl is not defined in this notebook - presumably it comes from the
# %run of read_and_write_docs.py in the first cell (confirm).
save_as_jsonl(final_metadata, f"{blogger_loc}metadata.jsonl")

## Create the Known Raw Table

In [155]:
# Step 1: Collect the distinct (doc_id_x, author_id_x) pairs from the final
# metadata, renamed to raw's column names. The pairs are de-duplicated because
# one document can be paired with several samples, and a repeated key would
# make the merge below emit that raw row once per repetition.
unique_combinations_known = (
    final_metadata[['doc_id_x', 'author_id_x']]
    .rename(columns={'doc_id_x': 'doc_id', 'author_id_x': 'author_id'})
    .drop_duplicates()
)

# Make sure raw's document key is called 'doc_id' (a no-op when it already is)
raw.rename(columns={'id': 'doc_id'}, inplace=True)

# Step 2: Left merge with an indicator column; '_merge' is 'both' for raw rows
# whose (doc_id, author_id) pair appears in the metadata
known_raw = raw.merge(unique_combinations_known, on=['doc_id', 'author_id'], how='left', indicator=True)

# Step 3: Keep only the raw rows that are referenced by the metadata
known_raw = known_raw[known_raw['_merge'] == 'both'].drop(columns=['_merge'])

In [156]:
# Display the known raw rows (the notebook renders the cell's last expression).
known_raw

Unnamed: 0,doc_id,author_id,gender,age,topic,sign,date,text,word_count
17857,17857,3321827,male,38,Technology,Sagittarius,"13,May,2004","Is it just me, or are liberals unab...",2012
18512,18512,4160528,female,16,Student,Leo,"09,August,2004",MaNy NiTeS i'Ve CrIeD fRoM ...,11502
29123,29123,3011326,male,23,Maritime,Taurus,"17,March,2004",Readings I'm a Democrat. I'm a...,1237
36608,36608,3347922,female,23,indUnk,Virgo,"01,August,2004",First thing first how i'm fe...,2806
37301,37301,3084647,male,15,Student,Pisces,"10,July,2004","The next morning arrived, bringing with it ...",1367
...,...,...,...,...,...,...,...,...,...
618134,618134,1516660,male,17,Student,Cancer,"30,May,2004","hrm...I'm bored. Yup,you got that r...",1038
653547,653547,3444305,male,15,Student,Aquarius,"23,July,2004",Well I have been really busy lately...,1201
656699,656699,3347383,male,39,Internet,Aries,"21,May,2004",Part one of my 'Millionaire' experience...,1087
659396,659396,2587254,male,15,Student,Libra,"14,February,2004",2/14/04 Happy Valentine's Day!...,1034


In [157]:
# Write the known raw rows to known_raw.jsonl under blogger_loc.
# save_as_jsonl is not defined in this notebook - presumably it comes from the
# %run of read_and_write_docs.py in the first cell (confirm).
save_as_jsonl(known_raw, f"{blogger_loc}known_raw.jsonl")

## Preprocess the Known Raw Data

In [158]:
# Prepare known_raw for the preprocessing helpers: drop word_count and rename
# the document key back to 'id'. errors='ignore' and the non-inplace rename
# make this cell safe to re-run; a plain drop raises KeyError on the second
# run because word_count is already gone.
known_raw = (
    known_raw
    .drop(columns='word_count', errors='ignore')
    .rename(columns={'doc_id': 'id'})
)

# apply_sentence_split / split_rows_by_word_count are not defined in this
# notebook - presumably they come from the %run of preprocessing.py (confirm).
# NOTE(review): num_words=250 looks like a per-row word limit - confirm in the helper.
known_preprocessed = apply_sentence_split(known_raw)
known_preprocessed = split_rows_by_word_count(known_preprocessed, num_words=250)

In [160]:
# Write the preprocessed rows to known_preprocessed.jsonl under blogger_loc.
# save_as_jsonl is not defined in this notebook - presumably it comes from the
# %run of read_and_write_docs.py in the first cell (confirm).
save_as_jsonl(known_preprocessed, f"{blogger_loc}known_preprocessed.jsonl")

## Concatenate to Chunk Sentences

In [161]:
# Combine the preprocessed rows into chunks with a length threshold of 500 and
# threshold_type='word' - presumably chunks of roughly 500 words (confirm).
# concatenate_sentences is not defined in this notebook - presumably it comes
# from the %run of combine_sentences.py, whose CLI also accepts 'char'.
known_concat = concatenate_sentences(known_preprocessed, length_threshold=500, threshold_type='word')

In [162]:
# Write the combined chunks to known_combined.jsonl under blogger_loc.
# save_as_jsonl is not defined in this notebook - presumably it comes from the
# %run of read_and_write_docs.py in the first cell (confirm).
save_as_jsonl(known_concat, f"{blogger_loc}known_combined.jsonl")

## Final Known

In [163]:
# Keep the first chunk of every document, restrict it to the published
# columns, and rename 'id' back to 'doc_id'. This is written as one
# non-inplace chain: calling rename(inplace=True) on a column-subset frame
# can trigger pandas' SettingWithCopyWarning.
known_final_columns = ['id', 'author_id', 'gender', 'word_count', 'age', 'topic', 'sign', 'date', 'text']

known_final = (
    known_concat
    .drop_duplicates(subset='id', keep='first')
    .loc[:, known_final_columns]
    .rename(columns={'id': 'doc_id'})
)

In [164]:
# Display the final known table (the notebook renders the cell's last expression).
known_final

Unnamed: 0,doc_id,author_id,gender,word_count,age,topic,sign,date,text
0,17857,3321827,male,517,38,Technology,Sagittarius,"13,May,2004","Is it just me, or are liberals unable to make ..."
132,18512,4160528,female,556,16,Student,Leo,"09,August,2004","MaNy NiTeS iVe CrIeD fRoM tHe ThInGs U dO, fEl..."
881,29123,3011326,male,503,23,Maritime,Taurus,"17,March,2004",Readings Im a Democrat. Im a liberal. And Iv...
933,36608,3347922,female,521,23,indUnk,Virgo,"01,August,2004",First thing first how im feeling... shorty som...
1122,37301,3084647,male,510,15,Student,Pisces,"10,July,2004","The next morning arrived, bringing with it col..."
...,...,...,...,...,...,...,...,...,...
11378,618134,1516660,male,524,17,Student,Cancer,"30,May,2004","hrm...Im bored. Yup,you got that right, Im bor..."
11432,653547,3444305,male,515,15,Student,Aquarius,"23,July,2004",Well I have been really busy lately so sorry I...
11509,656699,3347383,male,500,39,Internet,Aries,"21,May,2004",Part one of my Millionaire experience aired la...
11592,659396,2587254,male,510,15,Student,Libra,"14,February,2004",2/14/04 Happy Valentines Day!...Pretty fun d...


In [165]:
# Write the final known table to known_final.jsonl under blogger_loc.
# save_as_jsonl is not defined in this notebook - presumably it comes from the
# %run of read_and_write_docs.py in the first cell (confirm).
save_as_jsonl(known_final, f"{blogger_loc}known_final.jsonl")