In [1]:
import arguebuf as ag
import random

In [2]:
import os
def get_all_files(directory, extension=".json"):
    """Recursively collect the files under ``directory`` that have ``extension``.

    Directory entries are visited in sorted name order, so the returned list
    is identical on every run and platform (``os.listdir`` on its own makes no
    ordering guarantee).

    Args:
        directory: Path of the directory to search.
        extension: Extension to keep, including the leading dot. It is compared
            case-sensitively with the result of ``os.path.splitext``.
            Defaults to ``".json"``.

    Returns:
        A list of matching paths, each built by joining onto ``directory``.
        The contents of a sub-directory appear at the position where that
        sub-directory sorts among its siblings.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when
            ``directory`` does not exist).
    """
    file_list = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if os.path.isdir(filepath):
            # Descend into the sub-directory and splice its matches in place
            file_list.extend(get_all_files(filepath, extension))
        elif os.path.splitext(filepath)[1] == extension:
            file_list.append(filepath)
    return file_list

# Gather every JSON file found beneath the raw-data directory
data_directory = 'Full Un-anottated data/'
files = get_all_files(data_directory)

In [3]:
# Per-file lookup tables: file index -> {atom node id -> node label (the tweet)}
tweet_id_graph_dict = {}
# Parallel lists: position k holds the parent label and child label of one pair.
# Duplicates are NOT removed here; both lists simply grow by one per scheme node.
parent_child_tweets = {"parent": [], "child": []}

for file_index, graph_path in enumerate(files):
    # Register this file's lookup table under its position in `files`
    labels_by_id = {}
    tweet_id_graph_dict[file_index] = labels_by_id

    # Parse the file with the arguebuf library
    graph = ag.Graph.from_file(graph_path)

    # Record the label of every atom node under that node's id
    for atom in graph.atom_nodes.values():
        labels_by_id[atom.id] = atom.label

    # Each scheme-node key is split on ",": the first piece is used as the
    # child id and the second as the parent id. Both are resolved to labels
    # through this file's lookup table.
    for scheme_key in graph.scheme_nodes.keys():
        id_parts = scheme_key.split(',')
        child_tweet_id = id_parts[0]
        parent_tweet_id = id_parts[1]

        parent_child_tweets["parent"].append(labels_by_id[parent_tweet_id])
        parent_child_tweets["child"].append(labels_by_id[child_tweet_id])

In [4]:
import pandas as pd
# Build the pair table, keep one row per distinct (parent, child) pair,
# and renumber the surviving rows from 0
df = (
    pd.DataFrame(parent_child_tweets)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [5]:
# Shuffle the rows and renumber them from 0. The fixed random_state makes the
# permutation identical on every run, so Restart & Run All reproduces the
# same dataset.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
# Draw half of the rows at random; the fixed random_state keeps the split
# reproducible across runs
first_df = df.sample(frac=0.5, random_state=42)

# The other half: every row whose index label was not drawn above
second_df = df.drop(first_df.index)

In [7]:
# Tag every row of the first half (columns still in parent, child order) with label 1
first_df = first_df.assign(label=1)
first_df.head()

Unnamed: 0,parent,child,label
209622,⚠️BREAKING—Huge CDC study says #COVID19 not on...,"As best as I can tell, this excess deaths stor...",1
109081,.@realDonaldTrump was the clear winner in last...,You’re clearly delusional.,1
189860,"President Trump arrives in Fort Myers, Florida...",Death cult. How many today??,1
154200,“Learn” - a word still not understood by trump.,Damn right,1
78050,I’m voting for America #vote #election2020,You have no idea what real Americans look or s...,1


In [8]:
# Switch to the generic column names: parent -> input1, child -> input2
first_df = first_df.rename({"parent": "input1", "child": "input2"}, axis="columns")
first_df.head()

Unnamed: 0,input1,input2,label
209622,⚠️BREAKING—Huge CDC study says #COVID19 not on...,"As best as I can tell, this excess deaths stor...",1
109081,.@realDonaldTrump was the clear winner in last...,You’re clearly delusional.,1
189860,"President Trump arrives in Fort Myers, Florida...",Death cult. How many today??,1
154200,“Learn” - a word still not understood by trump.,Damn right,1
78050,I’m voting for America #vote #election2020,You have no idea what real Americans look or s...,1


In [9]:
# Preview the first five rows of the second half before it is transformed
second_df.head(n=5)

Unnamed: 0,parent,child
0,Today is your last chance to register to vote ...,Go AZ for Biden! We can change the color of th...
1,OMG! Who did this?!?! Best 2 minutes of your d...,Omg.. This is the best clip ever. 🎉
3,OMG! Who did this?!?! Best 2 minutes of your d...,Absolutely loved it awesome!
5,"BREAKING—New records reveal that, while Trump ...",its pretty fucked up but also I feel like if y...
7,President Trump won't have to recover from COV...,Have you taken your temp? Only someone serious...


In [10]:
# Swap the column order so the child comes first and the parent second
second_df = second_df.loc[:, ["child", "parent"]]
second_df.head()

Unnamed: 0,child,parent
0,Go AZ for Biden! We can change the color of th...,Today is your last chance to register to vote ...
1,Omg.. This is the best clip ever. 🎉,OMG! Who did this?!?! Best 2 minutes of your d...
3,Absolutely loved it awesome!,OMG! Who did this?!?! Best 2 minutes of your d...
5,its pretty fucked up but also I feel like if y...,"BREAKING—New records reveal that, while Trump ..."
7,Have you taken your temp? Only someone serious...,President Trump won't have to recover from COV...


In [11]:
# Here the child becomes input1 and the parent input2 (the reverse of the
# first half), and every row is tagged with label 0
second_df = second_df.rename({"child": "input1", "parent": "input2"}, axis="columns")
second_df = second_df.assign(label=0)
second_df.head()

Unnamed: 0,input1,input2,label
0,Go AZ for Biden! We can change the color of th...,Today is your last chance to register to vote ...,0
1,Omg.. This is the best clip ever. 🎉,OMG! Who did this?!?! Best 2 minutes of your d...,0
3,Absolutely loved it awesome!,OMG! Who did this?!?! Best 2 minutes of your d...,0
5,its pretty fucked up but also I feel like if y...,"BREAKING—New records reveal that, while Trump ...",0
7,Have you taken your temp? Only someone serious...,President Trump won't have to recover from COV...,0


In [12]:
# Stack the two labelled halves into one frame; each row keeps its index label
df = pd.concat((first_df, second_df), axis=0)
df

Unnamed: 0,input1,input2,label
209622,⚠️BREAKING—Huge CDC study says #COVID19 not on...,"As best as I can tell, this excess deaths stor...",1
109081,.@realDonaldTrump was the clear winner in last...,You’re clearly delusional.,1
189860,"President Trump arrives in Fort Myers, Florida...",Death cult. How many today??,1
154200,“Learn” - a word still not understood by trump.,Damn right,1
78050,I’m voting for America #vote #election2020,You have no idea what real Americans look or s...,1
...,...,...,...
305943,He called us losers for serving the country. T...,"My Vote is for what makes this country great, ...",0
305948,That's the problem with my fellow black brothe...,That’s because the best thing about the Republ...,0
305949,We start early voting tomorrow in SWFL. I’ll b...,Don’t wait for Election Day to vote. If you’re...,0
305955,If you want freedom of choice you're voting fo...,"My Vote is for what makes this country great, ...",0


In [13]:
# Order the rows by ascending index label, which mixes the two halves together
df = df.sort_index(axis=0, ascending=True)
df

Unnamed: 0,input1,input2,label
0,Go AZ for Biden! We can change the color of th...,Today is your last chance to register to vote ...,0
1,Omg.. This is the best clip ever. 🎉,OMG! Who did this?!?! Best 2 minutes of your d...,0
2,.@realDonaldTrump was the clear winner in last...,"covidbrain is real, but this guy was never too...",1
3,Absolutely loved it awesome!,OMG! Who did this?!?! Best 2 minutes of your d...,0
4,JUST IN: @KamalaHarris canceling her campaign ...,How can that be when they are following the CD...,1
...,...,...,...
305953,"POTUS still going in Janesville, Wisconsin!! #...",Isn't this the 3rd Peaceful Protest today for ...,1
305954,"My Vote is for what makes this country great, ...",Hmm ..all the things you mentioned-Biden suppo...,1
305955,If you want freedom of choice you're voting fo...,"My Vote is for what makes this country great, ...",0
305956,3) “Congressional Dems have grown so frustrate...,and she better not cave without these in place.,1


In [14]:
# Print a summary of the combined frame: index range, columns, non-null counts, dtypes, memory
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 305958 entries, 0 to 305957
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   input1  305958 non-null  object
 1   input2  305958 non-null  object
 2   label   305958 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 9.3+ MB


In [15]:
# Write the labelled pairs to data.csv, leaving the index out of the file
df.to_csv(path_or_buf="data.csv", index=False)