In [49]:
%run "../read_and_write_docs.py"

In [50]:
%run "../preprocessing.py"

usage: preprocessing.py [-h] --file_path FILE_PATH --output_file_path
                        OUTPUT_FILE_PATH [--num_words NUM_WORDS]
preprocessing.py: error: the following arguments are required: --file_path, --output_file_path


SystemExit: 2

In [51]:
%run "../combine_sentences.py"

usage: combine_sentences.py [-h] --file_path FILE_PATH --output_file_path
                            OUTPUT_FILE_PATH
                            [--length_threshold LENGTH_THRESHOLD]
                            [--threshold_type {char,word}]
combine_sentences.py: error: the following arguments are required: --file_path, --output_file_path


SystemExit: 2

In [52]:
import pandas as pd
import numpy as np

In [53]:
blogger_loc = "../../../../datasets/blogger/raw_error_fix/"

raw_loc = f"{blogger_loc}raw.jsonl"
metadata_loc = f"{blogger_loc}metadata.jsonl"

In [70]:
metadata = read_jsonl_file(metadata_loc)
raw = read_jsonl_file(raw_loc)

## Filter out Authors and Docs Shorter than 1000 Words

In [71]:
metadata_authors = pd.concat([metadata['author_id_x'], metadata['author_id_y']]).drop_duplicates().reset_index(drop=True).tolist()

In [74]:
raw.rename(columns={'id': 'doc_id'}, inplace=True)
filtered_raw = raw[raw['word_count'] >= 1000]
filtered_raw = filtered_raw[~filtered_raw['author_id'].isin(metadata_authors)]
filtered_raw = filtered_raw.drop('word_count', axis = 1)
filtered_raw['id'] = range(1, len(filtered_raw) + 1)

cols = ['id'] + [col for col in filtered_raw.columns if col != 'id']
filtered_raw = filtered_raw[cols]

In [75]:
filtered_raw

Unnamed: 0,id,doc_id,author_id,gender,age,topic,sign,date,text
2,1,2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
48,2,48,3581210,male,33,InvestmentBanking,Aquarius,"23,July,2004","Just so you know, this blog isn'..."
146,3,146,3705830,male,25,Non-Profit,Cancer,"24,July,2004",&nbsp; aside: you know you've done wel...
192,4,192,3389918,female,37,indUnk,Aquarius,"23,May,2004",I had this conversation with a man the ...
219,5,219,3429420,male,15,Student,Aquarius,"23,May,2004",The young elf ducked as a blade shimmer...
...,...,...,...,...,...,...,...,...,...
680937,11398,680937,1673216,male,17,Student,Taurus,"28,November,2003",First survey ever to be posted he...
680944,11399,680944,1673216,male,17,Student,Taurus,"01,December,2003",Donnie (1:00:06 AM): but the real...
680986,11400,680986,1673216,male,17,Student,Taurus,"17,June,2004",Good god...Never save this much h...
681014,11401,681014,3789932,female,24,Non-Profit,Capricorn,"28,July,2004",I know it was controversial.&nbsp;...


## Preprocess the Remaining Data

In [76]:
raw_preprocessed = apply_sentence_split(filtered_raw)
raw_preprocessed = split_rows_by_word_count(raw_preprocessed, num_words=250)

In [77]:
raw_concat = concatenate_sentences(raw_preprocessed, length_threshold=500, threshold_type='word')

In [93]:
raw_final = raw_concat.drop_duplicates(subset='id', keep='first')
raw_final = raw_final[['doc_id', 'author_id', 'gender', 'word_count', 'age', 'topic', 'sign', 'date', 'text']]

In [94]:
raw_final

Unnamed: 0,doc_id,author_id,gender,word_count,age,topic,sign,date,text
0,2,2059027,male,511,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde: MAAK JE ...
251,48,3581210,male,505,33,InvestmentBanking,Aquarius,"23,July,2004","Just so you know, this blog isnt about being p..."
354,146,3705830,male,520,25,Non-Profit,Cancer,"24,July,2004",&nbsp; aside: you know youve done well with y...
403,192,3389918,female,535,37,indUnk,Aquarius,"23,May,2004",I had this conversation with a man the other e...
431,219,3429420,male,502,15,Student,Aquarius,"23,May,2004",The young elf ducked as a blade shimmered past...
...,...,...,...,...,...,...,...,...,...
1206882,680937,1673216,male,506,17,Student,Taurus,"28,November,2003",First survey ever to be posted here. 1)The sin...
1207315,680944,1673216,male,507,17,Student,Taurus,"01,December,2003",Donnie (1:00:06 AM): but the real question is....
1207529,680986,1673216,male,503,17,Student,Taurus,"17,June,2004",Good god...Never save this much homework to st...
1207691,681014,3789932,female,504,24,Non-Profit,Capricorn,"28,July,2004",I know it was controversial.&nbsp; I know he s...


In [95]:
save_as_jsonl(raw_final, f"{blogger_loc}raw_final.jsonl")

## Create the Impostors

In [96]:
metadata

Unnamed: 0,sample_id,doc_id_x,doc_id_y,author_id_x,author_id_y,topic_x,topic_y,same_author,same_topic
0,1,618134,16188,1516660,3952922,Student,Student,False,True
1,2,17857,17850,3321827,3321827,Technology,Technology,True,True
2,3,18512,18516,4160528,4160528,Student,Student,True,True
3,4,184544,26252,2258198,4169442,Engineering,Engineering,False,True
4,5,37301,37303,3084647,3084647,Student,Student,True,True
...,...,...,...,...,...,...,...,...,...
95,96,202101,623524,3405693,2927895,Technology,Technology,False,True
96,97,161192,669029,1278138,3093335,Education,Education,False,True
97,98,45629,671397,2002478,3419072,Science,Science,False,True
98,99,590738,676573,3678120,2876684,Technology,Technology,False,True


In [100]:
def sample_impostors(metadata, raw, num_impostors, seed=None):
    result = []
    
    for sample_id in metadata['sample_id'].unique():
        # Take a sample of num_impostors from the raw dataframe
        sample = raw.sample(n=num_impostors, replace=False, random_state=seed)
        
        # Create a DataFrame with the sample_id and the sampled rows
        sample['sample_id'] = sample_id
        
        # Append to the result list
        result.append(sample)
    
    # Concatenate all the samples into a single DataFrame
    result_df = pd.concat(result).reset_index(drop=True)

    result_df = result_df[['sample_id', 'doc_id', 'author_id', 'gender', 'word_count', 'age', 'topic', 'sign', 'date', 'text']]
    
    return result_df

In [103]:
impostors_raw = sample_impostors(metadata, raw_final, 1000, seed=42)

In [104]:
impostors_raw

Unnamed: 0,sample_id,doc_id,author_id,gender,word_count,age,topic,sign,date,text
0,1,465573,3371144,male,502,16,indUnk,Sagittarius,"03,June,2004","Eh, Ill just start on Houleys post. I really d..."
1,1,125713,3509247,male,500,17,Student,Scorpio,"08,June,2004","Hey guys, heres the quick view of my first two..."
2,1,244640,2730505,male,501,24,Technology,Cancer,"06,August,2004",Sorry for a ridiculously long post but this wa...
3,1,667699,3473351,female,504,33,Law,Scorpio,"26,July,2004",Too Many Memories Take me back Too many memo...
4,1,110372,1084668,female,511,15,Student,Capricorn,"13,June,2004",i ... im .. wat u call ... sad too many thing...
...,...,...,...,...,...,...,...,...,...,...
99995,100,459262,3504914,male,525,16,Student,Libra,"25,June,2004",Quotation that Expresses This Post : I write e...
99996,100,305314,470861,male,502,27,indUnk,Cancer,"07,January,2003",This is a momentus (momentous?) occassion. I a...
99997,100,203771,2839449,female,528,26,Biotech,Aries,"04,August,2004","urlLink July 23, 2004, Newsday Can college ..."
99998,100,243633,171591,male,535,36,Arts,Capricorn,"03,May,2003",Day 4 - Rome to Urbania One of those journeys...


In [107]:
save_as_jsonl(impostors_raw, f"{blogger_loc}general_impostors_raw.jsonl")

In [105]:
impostors_final = impostors_raw[['sample_id', 'doc_id', 'text']]

In [106]:
impostors_final

Unnamed: 0,sample_id,doc_id,text
0,1,465573,"Eh, Ill just start on Houleys post. I really d..."
1,1,125713,"Hey guys, heres the quick view of my first two..."
2,1,244640,Sorry for a ridiculously long post but this wa...
3,1,667699,Too Many Memories Take me back Too many memo...
4,1,110372,i ... im .. wat u call ... sad too many thing...
...,...,...,...
99995,100,459262,Quotation that Expresses This Post : I write e...
99996,100,305314,This is a momentus (momentous?) occassion. I a...
99997,100,203771,"urlLink July 23, 2004, Newsday Can college ..."
99998,100,243633,Day 4 - Rome to Urbania One of those journeys...


In [108]:
save_as_jsonl(impostors_final, f"{blogger_loc}general_impostors_final.jsonl")