In [107]:
import pandas as pd
import re

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.max_colwidth = 100

In [108]:
def read_df():
    # Read complete non preprocessed dataset including submissions and comments
    
    df = pd.read_csv("../datasets/reddit_dump.csv", usecols=["author","author_fullname","created_utc","link_id","parent_id","score","subreddit","name"])
    return df

In [109]:
def drop_bots_from_comments(df):
    
    # Read scraped bot lists
    df_botranks_pasttimes = pd.read_csv("../datasets/botranks_pasttimes.csv")
    df_botranks = pd.read_csv("../datasets/botranks.csv")
    bots = pd.concat([df_botranks_pasttimes, df_botranks])    
    bots = bots[["bot_names","source"]]
    
    # Manually add bot found during research
    manual_added = {'bot_names': 'reddit-timestamp-bot', 'source': 'manually_added'}
    bots = bots.append(manual_added, ignore_index = True)
    
    # Filter to unique bots
    bot = bots.drop_duplicates(["bot_names"], keep="first")
    
    # Delete all bot comments
    df = df[~df['author'].isin(bot["bot_names"].tolist())]
    return df

# Read DF 

In [110]:
df = read_df()

  if (await self.run_code(code, result,  async_=asy)):


In [111]:
df = drop_bots_from_comments(df)

In [119]:
df

Unnamed: 0,author,author_fullname,created_utc,link_id,parent_id,score,subreddit,name
0,fixthismess,t2_fnnaj,1651435486,t3_ug6my9,t3_ug6my9,1,TwoXChromosomes,t1_i6y00op
1,seravivi,t2_148x1q,1651435486,t3_ufz7z8,t1_i6x05ks,1,TwoXChromosomes,t1_i6y00mx
2,SillySundae,t2_13722gob,1651435477,t3_ufz1tz,t3_ufz1tz,1,TwoXChromosomes,t1_i6xzzzq
3,GreyTartanTee,t2_bxckb6oc,1651435466,t3_ufz1tz,t1_i6xbi37,1,TwoXChromosomes,t1_i6xzz0k
4,Magdalan,t2_12yrq8,1651435459,t3_ug00f6,t1_i6x1rdg,1,TwoXChromosomes,t1_i6xzyet
...,...,...,...,...,...,...,...,...
18645652,jackson01105,t2_cjr1ikd2,1659226131,t3_wcb9qh,,1,Conservative,t3_wcb9qh
18645653,LocksmithOk9368,t2_akltk5pq,1659225974,t3_wcb7vh,,1,Conservative,t3_wcb7vh
18645654,kingmaster12345,t2_qoxvq11h,1659225884,t3_wcb6ru,,1,Conservative,t3_wcb6ru
18645655,B4NNED4LIFE,t2_p741blpy,1659225483,t3_wcb1ra,,1,Conservative,t3_wcb1ra


# Create final df where results gets merged too

author und author_fullname immer zusammen connected

In [121]:
# Build up dataframe to merge all resulting metrics

final_df = df[['author',"author_fullname"]]


In [122]:
# Filter undefined deleted users from the result dataframe, not from the original dataframe because they still are part of reply graphs

final_df = final_df[final_df["author"] != "[deleted]"]

In [123]:
# Filter undefined NaN users from the result dataframe

final_df = final_df[~final_df["author_fullname"].isna()]

In [124]:
# Delete duplicated users

final_df = final_df.drop_duplicates(keep="first")

In [125]:
final_df

Unnamed: 0,author,author_fullname
0,fixthismess,t2_fnnaj
1,seravivi,t2_148x1q
2,SillySundae,t2_13722gob
3,GreyTartanTee,t2_bxckb6oc
4,Magdalan,t2_12yrq8
...,...,...
18645650,Husklik,t2_hm2edcrx
18645652,jackson01105,t2_cjr1ikd2
18645653,LocksmithOk9368,t2_akltk5pq
18645654,kingmaster12345,t2_qoxvq11h


## calculate average of karma for the users (just comment karma, as defined in Morrison et al)

In [126]:
# Filter submissions by t3_ in nae and calculate mean with groupby
df_karma_avg = df[~df['name'].str.contains("t3_")].groupby(["author_fullname"])["score"].mean().reset_index(name="mean_karma")

In [127]:
df_karma_avg

Unnamed: 0,author_fullname,mean_karma
0,t2_1000b4kb,1.0
1,t2_1000n2,1.0
2,t2_10012x,1.0
3,t2_1001bs,1.0
4,t2_1001ge,1.0
...,...,...
1496781,t2_zzznp,1.0
1496782,t2_zzzq3,1.0
1496783,t2_zzzqx,1.0
1496784,t2_zzzwu,1.0


In [128]:
# Add average karma to the result dataframe

final_df = final_df.merge(df_karma_avg, how="left", left_on="author_fullname",right_on="author_fullname")

In [21]:
del df_karma_avg

## th - # of submitted posts (new threads) 

In [129]:
# Calculate amount of submissions per user by grouping on author_fullname

df_th = df[df['name'].str.contains("t3_")].groupby(["author_fullname"]).size().reset_index(name="th")

In [130]:
# Normalize results for easier interpretation

df_th["th"] = df_th["th"] / max(df_th["th"])

In [131]:
# Add th metric to the result dataframe

final_df = final_df.merge(df_th, how="left", left_on="author_fullname",right_on="author_fullname")

In [26]:
del df_th

## mean # of comments per post that the user engages in (mpth)

In [132]:
# Exclude submissions from dataframe and calculate amount of comments per user per link id/thread
mpth = df[~df['name'].str.contains("t3_")].groupby(by=["author_fullname","link_id"]).size().reset_index(name='count') 

In [133]:
# Calculate the average per user
mpth = mpth.groupby(by=["author_fullname"])["count"].mean().reset_index(name='mpth') 

In [134]:
# Add average to the result dataframe

final_df = final_df.merge(mpth, how="left", left_on="author_fullname",right_on="author_fullname")

In [30]:
del mpth

## standard deviation of comments per post that the user engages in (spth)

In [135]:
# Exclude submissions from dataframe and calculate amount of comments per user per link id/thread

spth = df[~df['name'].str.contains("t3_")].groupby(by=["author_fullname","link_id"]).size().reset_index(name='count') 

In [136]:
# Calculate the standard deviation of comments per thread per user

spth = spth.groupby(by=["author_fullname"])["count"].std().reset_index(name='spth') 

In [137]:
# Add standard deviation to the result dataframe

final_df = final_df.merge(spth, how="left", left_on="author_fullname",right_on="author_fullname")

In [34]:
del spth

##  # of comments submitted by the user that received at least one reply (pr)

In [138]:
# Exclude submissions from the dataset

df_filter_col = df[~df['name'].str.contains("t3_")][["name","parent_id","author_fullname","link_id"]]

In [139]:
# Merge dataframes by name and parent_id to get all possible replies

df_filter_col_merged = df_filter_col.merge(df_filter_col, how="inner", left_on=["name"], right_on=["parent_id"])

In [140]:
# Drop comment duplicates that achieved more than one reply by column name (unique for every user comment)

df_replies_unique = df_filter_col_merged.drop_duplicates(
  subset = ['name_x', 'author_fullname_x'],
  keep = 'first').reset_index(drop = True)

In [141]:
# Groupby author for df_filter_col to count the amount of comments every user wrote

df_all_occ = df_filter_col.groupby(["author_fullname"]).size().reset_index(name="Amount")

In [142]:
# Groupby author for df_replies to count the unique amount of authors replied to the user

df_replies = df_replies_unique.groupby(["author_fullname_x"]).size().reset_index(name="amount_one_rep")

In [143]:
pr = df_replies.merge(df_all_occ, how="outer", left_on="author_fullname_x", right_on="author_fullname")

In [144]:
pr['amount_one_rep'] = pr['amount_one_rep'].fillna(0)

# Calculate the ratio between the comments that achieved at least one reply and the amount of total comments
pr["pr"] = pr["amount_one_rep"] / pr["Amount"]
pr = pr[["author_fullname","pr"]]

In [145]:
# Add pr metric to the result dataframe 

final_df = final_df.merge(pr, how="left", left_on="author_fullname",right_on="author_fullname")

In [43]:
del df_filter_col, df_filter_col_merged, df_replies_unique, df_all_occ, df_replies, pr, 

# Bidirectional Communication

## At  first calculating the bidirectional communications for users that replied to the submission, because it has a fix starting point

In [146]:
# Filter dataframe for comments that directly reply to the submission

df_comments_filtered = df[~df['name'].str.contains("t3_")][df[~df['name'].str.contains("t3_")]['parent_id'].str.match('t3_')]

In [147]:
# Filter dataframe to submissions

df_submissions = df[df['name'].str.contains("t3_")]

In [148]:
# Merge reply and submission

bidir_sub_ink = df_submissions.merge(df_comments_filtered, how="inner", left_on="name",right_on="parent_id", suffixes=("_1", "_2"))

In [149]:
# Delete communication when both user names are [deleted] so the values cannot be assigned to a user name

bidir_sub_ink = bidir_sub_ink.loc[(bidir_sub_ink['author_1'] != "[deleted]")  &  (bidir_sub_ink['author_2'] != "[deleted]")]

In [150]:
# Delete rows when user replied himself

bidir_sub_ink = bidir_sub_ink.loc[(bidir_sub_ink['author_1'] != bidir_sub_ink['author_2'])]

In [151]:
# Reducing df dimensions for computation

bidir_sub_ink = bidir_sub_ink.filter(items=['author_1','name_1','author_2','name_2'])

In [152]:
# Filter dataframe on comments that replied to comments

df_comments_filtered_on_comm = df[~df['name'].str.contains("t3_")][df[~df['name'].str.contains("t3_")]['parent_id'].str.match('t1_')]
df_comments_filtered_on_comm = df_comments_filtered_on_comm.filter(items=['author','name','parent_id'])

In [153]:
# Merge dataframes by name-parent_id and author (submission) - author to ensure that comment was written by the author that contributed the submission

bidir_sub_ink = bidir_sub_ink.merge(df_comments_filtered_on_comm, how="inner", left_on=["name_2","author_1"], right_on=["parent_id","author"], suffixes=(f"_2", f"_3"))
bidir_sub_ink = bidir_sub_ink.rename(columns={'name': f'name_3', 'parent_id': 'parent_id_3', "author" : "author_3"})

In [154]:
# Count the amount of users in author_2 by groupby

bidir_amount_with_sub = bidir_sub_ink.groupby(["author_2"]).size().reset_index(name="Bidir")

In [155]:
# Rename column to merge dataframe with final_df 

bidir_amount_with_sub = bidir_amount_with_sub.rename(columns={"author_2": "author"})

In [156]:
# Bidirectional communications get merged with final_df
final_df = final_df.merge(bidir_amount_with_sub, how="left", left_on="author",right_on="author")

In [157]:
final_df

Unnamed: 0,author,author_fullname,mean_karma,th,mpth,spth,pr,Bidir
0,fixthismess,t2_fnnaj,1.0,,1.000000,0.000000,0.153846,
1,seravivi,t2_148x1q,1.0,,2.400000,3.130495,0.416667,
2,SillySundae,t2_13722gob,1.0,,1.666667,1.118034,0.200000,
3,GreyTartanTee,t2_bxckb6oc,1.0,,1.000000,,0.000000,
4,Magdalan,t2_12yrq8,1.0,,2.073529,2.300656,0.333333,
...,...,...,...,...,...,...,...,...
1599021,Husklik,t2_hm2edcrx,,0.000345,,,,
1599022,jackson01105,t2_cjr1ikd2,,0.000345,,,,
1599023,LocksmithOk9368,t2_akltk5pq,,0.000345,,,,
1599024,kingmaster12345,t2_qoxvq11h,,0.000345,,,,


In [158]:
# df to filter already counted bidir

preparation_bidir = bidir_sub_ink[["author_2","name_2","author_3","name_3", "parent_id_3"]]

In [159]:
preparation_bidir

Unnamed: 0,author_2,name_2,author_3,name_3,parent_id_3
0,ThroarkAway,t1_i6xxer5,Overall-Blood1940,t1_i6xxk82,t1_i6xxer5
1,SpaceCase206,t1_i6xwqos,Overall-Blood1940,t1_i6xwsz0,t1_i6xwqos
2,bluemercutio,t1_i6xvjmk,Ichewthecereal,t1_i6xw40b,t1_i6xvjmk
3,airaqua,t1_i6xuvyd,Ichewthecereal,t1_i6xvtxf,t1_i6xuvyd
4,AzulineAmphisbaena,t1_i6xubzr,Ichewthecereal,t1_i6xw821,t1_i6xubzr
...,...,...,...,...,...
738648,mountaincabinlife,t1_iibzvx3,hiskias,t1_iic10yg,t1_iibzvx3
738649,--SpentBrass--,t1_iibzhxg,hiskias,t1_iic0te6,t1_iibzhxg
738650,jraps26,t1_iibzfkx,hiskias,t1_iic0xvv,t1_iibzfkx
738651,ReturningDukky,t1_iibzctu,hiskias,t1_iibztwc,t1_iibzctu


In [None]:
del df_comments_filtered, df_submissions, bidir_sub_ink,df_comments_filtered_on_comm, bidir_amount_with_sub ,bidir_for_sub

## proportion of a user’s peers where bi-directional communication exists (bin)

In [160]:
# Extract only comments from dataframe and exclude submissions

df_comments = df[~df['name'].str.contains("t3_")][["author","parent_id","name"]]

In [161]:
df_comments

Unnamed: 0,author,parent_id,name
0,fixthismess,t3_ug6my9,t1_i6y00op
1,seravivi,t1_i6x05ks,t1_i6y00mx
2,SillySundae,t3_ufz1tz,t1_i6xzzzq
3,GreyTartanTee,t1_i6xbi37,t1_i6xzz0k
4,Magdalan,t1_i6x1rdg,t1_i6xzyet
...,...,...,...
18611225,[deleted],t1_iib2tvp,t1_iibiudw
18611226,GulagInmate1973,t1_iibb8e7,t1_iibitua
18611227,[deleted],t3_wcajwu,t1_iibiqpr
18611228,Rogue-Ai-01,t3_wc57e8,t1_iibiq1l


In [162]:
# Merge comments by name and parent_id

df_bidir = df_comments.merge(df_comments, how="inner", left_on=["name"], right_on=["parent_id"], suffixes=(f"_1", f"_2"))

In [163]:
df_bidir

Unnamed: 0,author_1,parent_id_1,name_1,author_2,parent_id_2,name_2
0,fixthismess,t3_ug6my9,t1_i6y00op,wozxox3,t1_i6y00op,t1_i6yebi4
1,gfkjhsdfjhgsdjghf,t1_i6xp12n,t1_i6xzxla,Technical-Finding681,t1_i6xzxla,t1_i6y11vu
2,NoPlum2175,t1_i6xsk2h,t1_i6xzxgj,Starchasm,t1_i6xzxgj,t1_i6zfwnn
3,NoPlum2175,t1_i6xsk2h,t1_i6xzxgj,LucyWritesSmut,t1_i6xzxgj,t1_i6yod0l
4,NoPlum2175,t1_i6xsk2h,t1_i6xzxgj,HelenGonne,t1_i6xzxgj,t1_i6y20kg
...,...,...,...,...,...,...
6891132,rentfreeinyohead,t1_iib6l2s,t1_iibixt6,[deleted],t1_iibixt6,t1_iibz1ud
6891133,WuFlu_Tang_Clan,t3_wcap1a,t1_iibix5p,Grossegurke,t1_iibix5p,t1_iies329
6891134,WuFlu_Tang_Clan,t3_wcap1a,t1_iibix5p,wiredog369,t1_iibix5p,t1_iibt1iw
6891135,Rogue-Ai-01,t3_wc57e8,t1_iibiq1l,ImOnTheInstanet,t1_iibiq1l,t1_iid7zim


In [164]:
preparation_bidir_test = preparation_bidir[["name_2","name_3"]].rename(columns={"name_2": "name_1", "name_3": "name_2"})

In [165]:
preparation_bidir_test["marker"] = 1

In [166]:
# Set up dataframe with a marker to exclude all pairs of name_1 and name_2 that were already counted earlier
preparation_bidir_test

Unnamed: 0,name_1,name_2,marker
0,t1_i6xxer5,t1_i6xxk82,1
1,t1_i6xwqos,t1_i6xwsz0,1
2,t1_i6xvjmk,t1_i6xw40b,1
3,t1_i6xuvyd,t1_i6xvtxf,1
4,t1_i6xubzr,t1_i6xw821,1
...,...,...,...
738648,t1_iibzvx3,t1_iic10yg,1
738649,t1_iibzhxg,t1_iic0te6,1
738650,t1_iibzfkx,t1_iic0xvv,1
738651,t1_iibzctu,t1_iibztwc,1


In [167]:
# Merge the marker to the dataframe

df_bidir_filtered = df_bidir.merge(preparation_bidir_test, how="left", left_on=["name_1","name_2"], right_on=["name_1","name_2"])

In [168]:
df_bidir_filtered

Unnamed: 0,author_1,parent_id_1,name_1,author_2,parent_id_2,name_2,marker
0,fixthismess,t3_ug6my9,t1_i6y00op,wozxox3,t1_i6y00op,t1_i6yebi4,
1,gfkjhsdfjhgsdjghf,t1_i6xp12n,t1_i6xzxla,Technical-Finding681,t1_i6xzxla,t1_i6y11vu,
2,NoPlum2175,t1_i6xsk2h,t1_i6xzxgj,Starchasm,t1_i6xzxgj,t1_i6zfwnn,
3,NoPlum2175,t1_i6xsk2h,t1_i6xzxgj,LucyWritesSmut,t1_i6xzxgj,t1_i6yod0l,
4,NoPlum2175,t1_i6xsk2h,t1_i6xzxgj,HelenGonne,t1_i6xzxgj,t1_i6y20kg,
...,...,...,...,...,...,...,...
6891132,rentfreeinyohead,t1_iib6l2s,t1_iibixt6,[deleted],t1_iibixt6,t1_iibz1ud,
6891133,WuFlu_Tang_Clan,t3_wcap1a,t1_iibix5p,Grossegurke,t1_iibix5p,t1_iies329,
6891134,WuFlu_Tang_Clan,t3_wcap1a,t1_iibix5p,wiredog369,t1_iibix5p,t1_iibt1iw,
6891135,Rogue-Ai-01,t3_wc57e8,t1_iibiq1l,ImOnTheInstanet,t1_iibiq1l,t1_iid7zim,


In [169]:
#filtering the bidirs that are already captured in bidir with submission

df_bidir_filtered = df_bidir_filtered[df_bidir_filtered["marker"].isna()]

In [170]:
# Delete communication when both user names are [deleted] so the values cannot be assigned to a user name

df_bidir_filtered = df_bidir_filtered.loc[(df_bidir_filtered[f'author_1'] != "[deleted]")  &  (df_bidir_filtered[f'author_2'] != "[deleted]")]

In [171]:
# Ensure to filter when author_1 replied himself

df_bidir_filtered = df_bidir_filtered.loc[(df_bidir_filtered[f'author_1'] != df_bidir_filtered[f'author_2'])]

In [172]:
df_bidir_filtered = df_bidir_filtered[["author_1","parent_id_1","name_1","author_2","parent_id_2","name_2"]]

In [173]:
#df_comments_f = df_comments[["author","created_utc","parent_id","name"]]

In [174]:
# Merge additional comments to the dataframe, now reply graph has length 3

df_bidir_filtered = df_bidir_filtered.merge(df_comments, how="inner", left_on=["name_2","author_1"], right_on=["parent_id","author"], suffixes=(f"_2", f"_3"))
df_bidir_filtered = df_bidir_filtered.rename(columns={'name': f'name_3', 'parent_id': 'parent_id_3', "author" : "author_3", "created_utc": "created_utc_3"})

In [175]:
df_bidir_filtered

Unnamed: 0,author_1,parent_id_1,name_1,author_2,parent_id_2,name_2,author_3,parent_id_3,name_3
0,haileycolp,t1_i6xz6op,t1_i6xzerv,levelit,t1_i6xzerv,t1_i6y9kxd,haileycolp,t1_i6y9kxd,t1_i7i3jk0
1,ValksCries,t1_i6xu4j0,t1_i6xz244,Brawnhilde,t1_i6xz244,t1_i6y0n84,ValksCries,t1_i6y0n84,t1_i6y1n5r
2,Xerisca,t3_ug601t,t1_i6xyr8e,throwRAstickypast,t1_i6xyr8e,t1_i6yj1zc,Xerisca,t1_i6yj1zc,t1_i6yotsy
3,AzulineAmphisbaena,t1_i6xyfi7,t1_i6xyoh1,MeandMyPelvicfloor,t1_i6xyoh1,t1_i6xyzn8,AzulineAmphisbaena,t1_i6xyzn8,t1_i6xz4qk
4,OnlyNeverAlwaysSure,t1_i6xw52v,t1_i6xyod2,creuter,t1_i6xyod2,t1_i6yqyph,OnlyNeverAlwaysSure,t1_i6yqyph,t1_i6zh46m
...,...,...,...,...,...,...,...,...,...
2157601,Dudes-Abide,t3_wc6u9m,t1_iibkxxk,Big-Employer4543,t1_iibkxxk,t1_iiduz74,Dudes-Abide,t1_iiduz74,t1_iidzthc
2157602,Jorel_Antonius,t3_wcah1v,t1_iibkltf,PyonPyonCal,t1_iibkltf,t1_iibpr6x,Jorel_Antonius,t1_iibpr6x,t1_iibtyr8
2157603,The-chingarito,t3_wc5qwj,t1_iibjv2m,sureynot13,t1_iibjv2m,t1_iibx2q1,The-chingarito,t1_iibx2q1,t1_iibyeku
2157604,GrandpaHardcore,t1_ii9mdvw,t1_iibje49,Alert_Salt7048,t1_iibje49,t1_iibpm4s,GrandpaHardcore,t1_iibpm4s,t1_iibrh6w


In [176]:
n = 4

In [177]:
df_test = df_bidir_filtered.copy()

In [178]:
df_comments = df_comments[~df_comments['parent_id'].str.contains("t3_")]

In [179]:
# Additionally merge two more possbile responses to every reply graph

while True:
    
    print(n)
    
    df_test = df_test.merge(df_comments, how="left", left_on=[f"name_{n-1}",f"author_{n-2}"], right_on=["parent_id","author"], suffixes=(f"_{n-1}", f"_{n}"))
    df_test = df_test.rename(columns={'name': f'name_{n}', 'parent_id': f'parent_id_{n}', "author" : f"author_{n}"})
    
    if n == 5:
        break
        
    n += 1

4
5


In [180]:
df_test

Unnamed: 0,author_1,parent_id_1,name_1,author_2,parent_id_2,name_2,author_3,parent_id_3,name_3,author_4,parent_id_4,name_4,author_5,parent_id_5,name_5
0,haileycolp,t1_i6xz6op,t1_i6xzerv,levelit,t1_i6xzerv,t1_i6y9kxd,haileycolp,t1_i6y9kxd,t1_i7i3jk0,,,,,,
1,ValksCries,t1_i6xu4j0,t1_i6xz244,Brawnhilde,t1_i6xz244,t1_i6y0n84,ValksCries,t1_i6y0n84,t1_i6y1n5r,,,,,,
2,Xerisca,t3_ug601t,t1_i6xyr8e,throwRAstickypast,t1_i6xyr8e,t1_i6yj1zc,Xerisca,t1_i6yj1zc,t1_i6yotsy,throwRAstickypast,t1_i6yotsy,t1_i7duf08,Xerisca,t1_i7duf08,t1_i7e3z61
3,AzulineAmphisbaena,t1_i6xyfi7,t1_i6xyoh1,MeandMyPelvicfloor,t1_i6xyoh1,t1_i6xyzn8,AzulineAmphisbaena,t1_i6xyzn8,t1_i6xz4qk,,,,,,
4,OnlyNeverAlwaysSure,t1_i6xw52v,t1_i6xyod2,creuter,t1_i6xyod2,t1_i6yqyph,OnlyNeverAlwaysSure,t1_i6yqyph,t1_i6zh46m,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2200646,Dudes-Abide,t3_wc6u9m,t1_iibkxxk,Big-Employer4543,t1_iibkxxk,t1_iiduz74,Dudes-Abide,t1_iiduz74,t1_iidzthc,,,,,,
2200647,Jorel_Antonius,t3_wcah1v,t1_iibkltf,PyonPyonCal,t1_iibkltf,t1_iibpr6x,Jorel_Antonius,t1_iibpr6x,t1_iibtyr8,PyonPyonCal,t1_iibtyr8,t1_iibvau1,,,
2200648,The-chingarito,t3_wc5qwj,t1_iibjv2m,sureynot13,t1_iibjv2m,t1_iibx2q1,The-chingarito,t1_iibx2q1,t1_iibyeku,,,,,,
2200649,GrandpaHardcore,t1_ii9mdvw,t1_iibje49,Alert_Salt7048,t1_iibje49,t1_iibpm4s,GrandpaHardcore,t1_iibpm4s,t1_iibrh6w,,,,,,


In [181]:
# Filter down by name_1 just being in column name_1 and name_2 just being in column name_2, to ensure that its the start of the reply graph and not within

index_list = df_test.index[df_test[f"name_1"].isin(df_test[f"name_3"].tolist()) & df_test[f"name_2"].isin(df_test[f"name_4"].tolist())]
df_test = df_test.drop(df_test.index[index_list]).reset_index(drop=True)

In [182]:
# Drop rows when author_1 and author_3 are the same in multiple columns, because then author_1 replied minimum 2 times to author 2 before the communication went on 

df_test = df_test.drop_duplicates(["author_1","name_1","parent_id_1","author_2","name_2","parent_id_2", "author_3","parent_id_3"], keep="first")

In [183]:
bidir_amount_no_sub = df_test.groupby(["author_2"]).size().reset_index(name="Bidir_no_sub")

In [184]:
bidir_amount_no_sub

Unnamed: 0,author_2,Bidir_no_sub
0,------------------f,1
1,----------_______---,4
2,--------3,1
3,------sb,1
4,-----1,2
...,...,...
375013,zzz_red,3
375014,zzz_sleepy_bird_zzz,7
375015,zzzap,1
375016,zzzzebras,1


In [185]:
bidir_amount_no_sub = bidir_amount_no_sub.rename(columns={'author_2': f'author'})

In [186]:
final_df = final_df.merge(bidir_amount_no_sub, how="left", right_on="author",left_on="author")

In [187]:
final_df

Unnamed: 0,author,author_fullname,mean_karma,th,mpth,spth,pr,Bidir,Bidir_no_sub
0,fixthismess,t2_fnnaj,1.0,,1.000000,0.000000,0.153846,,
1,seravivi,t2_148x1q,1.0,,2.400000,3.130495,0.416667,,1.0
2,SillySundae,t2_13722gob,1.0,,1.666667,1.118034,0.200000,,1.0
3,GreyTartanTee,t2_bxckb6oc,1.0,,1.000000,,0.000000,,
4,Magdalan,t2_12yrq8,1.0,,2.073529,2.300656,0.333333,,18.0
...,...,...,...,...,...,...,...,...,...
1599021,Husklik,t2_hm2edcrx,,0.000345,,,,,
1599022,jackson01105,t2_cjr1ikd2,,0.000345,,,,,
1599023,LocksmithOk9368,t2_akltk5pq,,0.000345,,,,,
1599024,kingmaster12345,t2_qoxvq11h,,0.000345,,,,,


In [188]:
final_df[["Bidir","Bidir_no_sub"]] = final_df[["Bidir","Bidir_no_sub"]].fillna(value=0)

In [189]:
final_df

Unnamed: 0,author,author_fullname,mean_karma,th,mpth,spth,pr,Bidir,Bidir_no_sub
0,fixthismess,t2_fnnaj,1.0,,1.000000,0.000000,0.153846,0.0,0.0
1,seravivi,t2_148x1q,1.0,,2.400000,3.130495,0.416667,0.0,1.0
2,SillySundae,t2_13722gob,1.0,,1.666667,1.118034,0.200000,0.0,1.0
3,GreyTartanTee,t2_bxckb6oc,1.0,,1.000000,,0.000000,0.0,0.0
4,Magdalan,t2_12yrq8,1.0,,2.073529,2.300656,0.333333,0.0,18.0
...,...,...,...,...,...,...,...,...,...
1599021,Husklik,t2_hm2edcrx,,0.000345,,,,0.0,0.0
1599022,jackson01105,t2_cjr1ikd2,,0.000345,,,,0.0,0.0
1599023,LocksmithOk9368,t2_akltk5pq,,0.000345,,,,0.0,0.0
1599024,kingmaster12345,t2_qoxvq11h,,0.000345,,,,0.0,0.0


In [81]:
del df_comments, df_bidir, preparation_bidir_test, preparation_bidir, df_bidir_filtered, df_test, bidir_amount_no_sub

# No bidirectional Communication - Calculation

## The search for no bidirectional communication was done per subreddit, cause of computational limitations when merging too many columns

In [195]:
def calculate_no_bidir_per_subreddit(df,subreddit):
    # Merge comments in specified subreddit to reply graph length 2
    rep_graph_len_2 = df[df["subreddit"] == subreddit][["author", "name", "parent_id"]].merge(
        df[(df["subreddit"] == subreddit) & (df["name"].str.contains("t1_")) & (df["author"] != "[deleted]")][["author", "name", "parent_id"]],
        how="inner", left_on=["name"], right_on=["parent_id"], suffixes=("_1", "_2"))
    
    # Merge comments in specified subreddit to reply graph length 3
    rep_graph_len_3 = rep_graph_len_2.merge(
        df[(df["subreddit"] == subreddit) & (df["parent_id"].str.contains("t1_")) & (df["name"].str.contains("t1_"))][
            ["author", "name", "parent_id"]], how="left", left_on=["name_2", "author_1"],
        right_on=["parent_id", "author"], suffixes=("_1", "_2"))
    rep_graph_len_3 = rep_graph_len_3.rename(columns={'name': f'name_3', 'parent_id': 'parent_id_3', "author": "author_3"})

    # Exclude rows where author_1 and author_3 are the same - otherwise would be bidirectional communication
    rep_graph_len_3 = rep_graph_len_3[rep_graph_len_3["author_1"] != rep_graph_len_3["author_3"]]

    # Merge dataframe from the right, to check if there was bidirectional communication beforehand (length: 4)
    rep_graph_len_4 = df[df["subreddit"] == subreddit][["author", "name", "parent_id"]].merge(rep_graph_len_3, how="right",
                                                                                  left_on=["name", "author"],
                                                                                  right_on=["parent_id_1", "author_2"],
                                                                                  suffixes=("_1", "_2"))
    rep_graph_len_4 = rep_graph_len_4.rename(columns={'name': f'name_0', 'parent_id': 'parent_id_0', "author": "author_0"})
    
    # Merge dataframe from the right, to check if there was bidirectional communication beforehand (length: 5)
    rep_graph_len_5 = df[df["subreddit"] == subreddit][["author", "name", "parent_id"]].merge(rep_graph_len_4, how="right",
                                                                                  left_on=["name", "author"],
                                                                                  right_on=["parent_id_0", "author_1"],
                                                                                  suffixes=("_1", "_2"))
    rep_graph_len_5 = rep_graph_len_5.rename(columns={'name': f'name_-1', 'parent_id': 'parent_id_-1', "author": "author_-1"})
    
    # Exclude rows with distance 2 where authors should not match - otherwise would be bidirectional communication
    rep_graph_len_5 = rep_graph_len_5[
        ~((rep_graph_len_5["author_-1"] == rep_graph_len_5["author_1"]) & (rep_graph_len_5["author_0"] == rep_graph_len_5["author_2"]) & (rep_graph_len_5["author_3"].isna()))]
    
    # Drop parent_id , not necessary anymore
    rep_graph_len_5 = rep_graph_len_5.drop(['parent_id_-1', 'parent_id_0', "parent_id_1", "parent_id_2", "parent_id_3"], axis=1)
    
    
    # filter all bidirectional communications that ended at author_2, making author_2 part of it (length: 6)

    rep_graph_len_6 = rep_graph_len_5.merge(
        df[(df["subreddit"] == subreddit) & (df["parent_id"].str.contains("t1_")) & (df["name"].str.contains("t1_"))][
            ["author", "name", "parent_id"]], how="left", left_on=["name_3", "author_2"],
        right_on=["parent_id", "author"], suffixes=("_1", "_2"))
    rep_graph_len_6 = rep_graph_len_6.rename(columns={'name': f'name_4', 'parent_id': 'parent_id_4', "author": "author_4"})

    
    # filter all bidirectional communications that ended at author_3, making author_3 part of it (length: 7)
    rep_graph_len_7 = rep_graph_len_6.merge(
        df[(df["subreddit"] == subreddit) & (df["parent_id"].str.contains("t1_")) & (df["name"].str.contains("t1_"))][
            ["author", "name", "parent_id"]], how="left", left_on=["name_4", "author_3"],
        right_on=["parent_id", "author"], suffixes=("_1", "_2"))
    rep_graph_len_7 = rep_graph_len_7.rename(columns={'name': f'name_5', 'parent_id': 'parent_id_5', "author": "author_5"})

    # Exclude rows with distance 2 where authors should not match - otherwise would be bidirectional communication
    rep_graph_len_7 = rep_graph_len_7[
        ~((rep_graph_len_7["author_2"] == rep_graph_len_7["author_4"]) & (rep_graph_len_7["author_3"] == rep_graph_len_7["author_5"]) & (rep_graph_len_7["author_1"].isna()))]

    
    rep_graph_len_7 = rep_graph_len_7.groupby("author_2").size().reset_index(name=f"No_bidir_{subreddit}")
    rep_graph_len_7 = rep_graph_len_7.rename(columns={"author_2": "author"})
    return rep_graph_len_7

In [197]:
df_no_bidir = df[['author', "author_fullname"]]
df_no_bidir = df_no_bidir[df_no_bidir["author"] != "[deleted]"]
df_no_bidir = df_no_bidir[~df_no_bidir["author_fullname"].isna()]
df_no_bidir = df_no_bidir.drop_duplicates(keep="first")


subreddits = ['teenagers','funny',
 'AskWomen',
 'AskMen',
 'Parenting',
 'science',
 'technology',
 'unpopularopinion',
 'Conservative',
 'TwoXChromosomes',
 'gardening']
for i in subreddits:
    print(i)
    df_no_bidir = df_no_bidir.merge(calculate_no_bidir_per_subreddit(df,i), how="left", on="author")

#print(no_bi_dir)
df_no_bidir = df_no_bidir.fillna(0)

teenagers
funny
AskWomen
AskMen
Parenting
science
technology
unpopularopinion
Conservative
TwoXChromosomes
gardening


In [198]:
df_no_bidir

Unnamed: 0,author,author_fullname,No_bidir_teenagers,No_bidir_funny,No_bidir_AskWomen,No_bidir_AskMen,No_bidir_Parenting,No_bidir_science,No_bidir_technology,No_bidir_unpopularopinion,No_bidir_Conservative,No_bidir_TwoXChromosomes,No_bidir_gardening
0,fixthismess,t2_fnnaj,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,9.0,0.0
1,seravivi,t2_148x1q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
2,SillySundae,t2_13722gob,0.0,0.0,0.0,4.0,0.0,0.0,3.0,1.0,0.0,3.0,0.0
3,GreyTartanTee,t2_bxckb6oc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Magdalan,t2_12yrq8,0.0,4.0,0.0,22.0,0.0,1.0,0.0,0.0,0.0,80.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599021,Husklik,t2_hm2edcrx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1599022,jackson01105,t2_cjr1ikd2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1599023,LocksmithOk9368,t2_akltk5pq,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1599024,kingmaster12345,t2_qoxvq11h,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [199]:
list_name = []
for i in subreddits:
    list_name.append("No_bidir_" + i)
    

In [200]:
# Sum up the amount of no bidirectional communication for every subreddit
df_no_bidir["no_bidir"] = df_no_bidir.loc[:,list_name].sum(axis=1)


In [201]:
df_no_bidir

Unnamed: 0,author,author_fullname,No_bidir_teenagers,No_bidir_funny,No_bidir_AskWomen,No_bidir_AskMen,No_bidir_Parenting,No_bidir_science,No_bidir_technology,No_bidir_unpopularopinion,No_bidir_Conservative,No_bidir_TwoXChromosomes,No_bidir_gardening,no_bidir
0,fixthismess,t2_fnnaj,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,9.0,0.0,12.0
1,seravivi,t2_148x1q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0
2,SillySundae,t2_13722gob,0.0,0.0,0.0,4.0,0.0,0.0,3.0,1.0,0.0,3.0,0.0,11.0
3,GreyTartanTee,t2_bxckb6oc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,Magdalan,t2_12yrq8,0.0,4.0,0.0,22.0,0.0,1.0,0.0,0.0,0.0,80.0,0.0,107.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599021,Husklik,t2_hm2edcrx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1599022,jackson01105,t2_cjr1ikd2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1599023,LocksmithOk9368,t2_akltk5pq,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1599024,kingmaster12345,t2_qoxvq11h,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [202]:
df_no_bidir = df_no_bidir.drop(list_name, axis=1)

In [204]:
df_no_bidir

Unnamed: 0,author,author_fullname,no_bidir
0,fixthismess,t2_fnnaj,12.0
1,seravivi,t2_148x1q,5.0
2,SillySundae,t2_13722gob,11.0
3,GreyTartanTee,t2_bxckb6oc,1.0
4,Magdalan,t2_12yrq8,107.0
...,...,...,...
1599021,Husklik,t2_hm2edcrx,0.0
1599022,jackson01105,t2_cjr1ikd2,0.0
1599023,LocksmithOk9368,t2_akltk5pq,0.0
1599024,kingmaster12345,t2_qoxvq11h,0.0


In [205]:
# Merge result to final_df

final_df = final_df.merge(df_no_bidir, how="left", right_on=["author","author_fullname"],left_on=["author","author_fullname"])

In [206]:
final_df

Unnamed: 0,author,author_fullname,mean_karma,th,mpth,spth,pr,Bidir,Bidir_no_sub,no_bidir
0,fixthismess,t2_fnnaj,1.0,,1.000000,0.000000,0.153846,0.0,0.0,12.0
1,seravivi,t2_148x1q,1.0,,2.400000,3.130495,0.416667,0.0,1.0,5.0
2,SillySundae,t2_13722gob,1.0,,1.666667,1.118034,0.200000,0.0,1.0,11.0
3,GreyTartanTee,t2_bxckb6oc,1.0,,1.000000,,0.000000,0.0,0.0,1.0
4,Magdalan,t2_12yrq8,1.0,,2.073529,2.300656,0.333333,0.0,18.0,107.0
...,...,...,...,...,...,...,...,...,...,...
1599021,Husklik,t2_hm2edcrx,,0.000345,,,,0.0,0.0,0.0
1599022,jackson01105,t2_cjr1ikd2,,0.000345,,,,0.0,0.0,0.0
1599023,LocksmithOk9368,t2_akltk5pq,,0.000345,,,,0.0,0.0,0.0
1599024,kingmaster12345,t2_qoxvq11h,,0.000345,,,,0.0,0.0,0.0


In [207]:
final_df[["Bidir","Bidir_no_sub","no_bidir"]] = final_df[["Bidir","Bidir_no_sub","no_bidir"]].fillna(value=0)

In [208]:
# Calculate the percentage of bidirectional communcation

final_df["bin"] = (final_df["Bidir"] + final_df["Bidir_no_sub"] )/ (final_df["Bidir"] + final_df["Bidir_no_sub"] + final_df["no_bidir"]) 

In [212]:
final_df

Unnamed: 0,author,author_fullname,mean_karma,th,mpth,spth,pr,Bidir,Bidir_no_sub,no_bidir,bin
0,fixthismess,t2_fnnaj,1.0,,1.000000,0.000000,0.153846,0.0,0.0,12.0,0.000000
1,seravivi,t2_148x1q,1.0,,2.400000,3.130495,0.416667,0.0,1.0,5.0,0.166667
2,SillySundae,t2_13722gob,1.0,,1.666667,1.118034,0.200000,0.0,1.0,11.0,0.083333
3,GreyTartanTee,t2_bxckb6oc,1.0,,1.000000,,0.000000,0.0,0.0,1.0,0.000000
4,Magdalan,t2_12yrq8,1.0,,2.073529,2.300656,0.333333,0.0,18.0,107.0,0.144000
...,...,...,...,...,...,...,...,...,...,...,...
1599021,Husklik,t2_hm2edcrx,,0.000345,,,,0.0,0.0,0.0,
1599022,jackson01105,t2_cjr1ikd2,,0.000345,,,,0.0,0.0,0.0,
1599023,LocksmithOk9368,t2_akltk5pq,,0.000345,,,,0.0,0.0,0.0,
1599024,kingmaster12345,t2_qoxvq11h,,0.000345,,,,0.0,0.0,0.0,


In [213]:
final_df[["bin"]] = final_df[["bin"]].fillna(value=0)

## proportion of posts participated in by the user where bidirectional communication exists (thbi)

In [214]:
df_sm = df[["name","parent_id","author","link_id"]]

In [215]:
# Merge comments to recreate reply graph

df_rep_graph_len_2 = df_sm.merge(df_sm, how="inner", left_on=["name"], right_on=["parent_id"], suffixes=(f"_1", f"_2"))

In [216]:
df_rep_graph_len_2

Unnamed: 0,name_1,parent_id_1,author_1,link_id_1,name_2,parent_id_2,author_2,link_id_2
0,t1_i6y00op,t3_ug6my9,fixthismess,t3_ug6my9,t1_i6yebi4,t1_i6y00op,wozxox3,t3_ug6my9
1,t1_i6xzxla,t1_i6xp12n,gfkjhsdfjhgsdjghf,t3_ufz1tz,t1_i6y11vu,t1_i6xzxla,Technical-Finding681,t3_ufz1tz
2,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,t1_i6zfwnn,t1_i6xzxgj,Starchasm,t3_ug383z
3,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,t1_i6yod0l,t1_i6xzxgj,LucyWritesSmut,t3_ug383z
4,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,t1_i6y20kg,t1_i6xzxgj,HelenGonne,t3_ug383z
...,...,...,...,...,...,...,...,...
12531255,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,t1_iic76my,t3_wcbpas,flabiger,t3_wcbpas
12531256,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,t1_iibuunb,t3_wcbpas,PB_Mack,t3_wcbpas
12531257,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,t1_iibn1nu,t3_wcbpas,ChunkyArsenio,t3_wcbpas
12531258,t3_wcbohq,,ChunkyArsenio,t3_wcbohq,t1_iibtuwq,t3_wcbohq,automatedengineer,t3_wcbohq


In [217]:
# Filter communications where user replies himself
df_rep_graph_len_2 = df_rep_graph_len_2.loc[(df_rep_graph_len_2[f'author_1'] != df_rep_graph_len_2[f'author_2'])]

In [218]:
df_rep_graph_len_2

Unnamed: 0,name_1,parent_id_1,author_1,link_id_1,name_2,parent_id_2,author_2,link_id_2
0,t1_i6y00op,t3_ug6my9,fixthismess,t3_ug6my9,t1_i6yebi4,t1_i6y00op,wozxox3,t3_ug6my9
1,t1_i6xzxla,t1_i6xp12n,gfkjhsdfjhgsdjghf,t3_ufz1tz,t1_i6y11vu,t1_i6xzxla,Technical-Finding681,t3_ufz1tz
2,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,t1_i6zfwnn,t1_i6xzxgj,Starchasm,t3_ug383z
3,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,t1_i6yod0l,t1_i6xzxgj,LucyWritesSmut,t3_ug383z
4,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,t1_i6y20kg,t1_i6xzxgj,HelenGonne,t3_ug383z
...,...,...,...,...,...,...,...,...
12531253,t3_wcbqo3,,nimobo,t3_wcbqo3,t1_iibnjbm,t3_wcbqo3,Verdict1923,t3_wcbqo3
12531254,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,t1_iicea3k,t3_wcbpas,NotAbot10011,t3_wcbpas
12531255,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,t1_iic76my,t3_wcbpas,flabiger,t3_wcbpas
12531256,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,t1_iibuunb,t3_wcbpas,PB_Mack,t3_wcbpas


In [219]:
# Merge additional comments (reply graph length:3)

df_rep_graph_len_3 = df_rep_graph_len_2.merge(df_sm, how="inner", left_on=["name_2","author_1"], right_on=["parent_id","author"], suffixes=(f"_2", f"_3"))
df_rep_graph_len_3 = df_rep_graph_len_3.rename(columns={'name': f'name_3', 'parent_id': 'parent_id_3', "author" : "author_3"})

In [220]:
df_rep_graph_len_3

Unnamed: 0,name_1,parent_id_1,author_1,link_id_1,name_2,parent_id_2,author_2,link_id_2,name_3,parent_id_3,author_3,link_id
0,t1_i6xzpzv,t3_ufu3xe,bubblebathmermaid,t3_ufu3xe,t1_i6yv3rw,t1_i6xzpzv,dingleballs717,t3_ufu3xe,t1_i6yx168,t1_i6yv3rw,bubblebathmermaid,t3_ufu3xe
1,t1_i6xzerv,t1_i6xz6op,haileycolp,t3_ufz1tz,t1_i6y9kxd,t1_i6xzerv,levelit,t3_ufz1tz,t1_i7i3jk0,t1_i6y9kxd,haileycolp,t3_ufz1tz
2,t1_i6xz244,t1_i6xu4j0,ValksCries,t3_ufz1tz,t1_i6y0n84,t1_i6xz244,Brawnhilde,t3_ufz1tz,t1_i6y1n5r,t1_i6y0n84,ValksCries,t3_ufz1tz
3,t1_i6xyr8e,t3_ug601t,Xerisca,t3_ug601t,t1_i6yj1zc,t1_i6xyr8e,throwRAstickypast,t3_ug601t,t1_i6yotsy,t1_i6yj1zc,Xerisca,t3_ug601t
4,t1_i6xyoh1,t1_i6xyfi7,AzulineAmphisbaena,t3_ug75b3,t1_i6xyzn8,t1_i6xyoh1,MeandMyPelvicfloor,t3_ug75b3,t1_i6xz4qk,t1_i6xyzn8,AzulineAmphisbaena,t3_ug75b3
...,...,...,...,...,...,...,...,...,...,...,...,...
3213158,t3_wcdkbb,,hiskias,t3_wcdkbb,t1_iibzvx3,t3_wcdkbb,mountaincabinlife,t3_wcdkbb,t1_iic10yg,t1_iibzvx3,hiskias,t3_wcdkbb
3213159,t3_wcdkbb,,hiskias,t3_wcdkbb,t1_iibzhxg,t3_wcdkbb,--SpentBrass--,t3_wcdkbb,t1_iic0te6,t1_iibzhxg,hiskias,t3_wcdkbb
3213160,t3_wcdkbb,,hiskias,t3_wcdkbb,t1_iibzfkx,t3_wcdkbb,jraps26,t3_wcdkbb,t1_iic0xvv,t1_iibzfkx,hiskias,t3_wcdkbb
3213161,t3_wcdkbb,,hiskias,t3_wcdkbb,t1_iibzctu,t3_wcdkbb,ReturningDukky,t3_wcdkbb,t1_iibztwc,t1_iibzctu,hiskias,t3_wcdkbb


In [221]:
# Filter dataframe to author_2 (has bidirectional communication) and link_id

df_rep_graph_len_3 = df_rep_graph_len_3[["author_2","link_id"]]

In [222]:
#  Add label "has_bidir" to differentiate later on
df_rep_graph_len_3["has_bidir"] = "has_bidir"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rep_graph_len_3["has_bidir"] = "has_bidir"


In [223]:
# Drop duplicates, only relevant if user had bidirectional communication in thread or not

df_rep_graph_len_3 = df_rep_graph_len_3.drop_duplicates()

In [224]:
df_rep_graph_len_3

Unnamed: 0,author_2,link_id,has_bidir
0,dingleballs717,t3_ufu3xe,has_bidir
1,levelit,t3_ufz1tz,has_bidir
2,Brawnhilde,t3_ufz1tz,has_bidir
3,throwRAstickypast,t3_ug601t,has_bidir
4,MeandMyPelvicfloor,t3_ug75b3,has_bidir
...,...,...,...
3213158,mountaincabinlife,t3_wcdkbb,has_bidir
3213159,--SpentBrass--,t3_wcdkbb,has_bidir
3213160,jraps26,t3_wcdkbb,has_bidir
3213161,ReturningDukky,t3_wcdkbb,has_bidir


In [225]:
df_rep_graph_len_3 = df_rep_graph_len_3.rename(columns={'author_2': f'author'})

In [226]:
results_per_thread_per_user = df_sm.merge(df_rep_graph_len_3, how="left", left_on=["link_id","author"], right_on=["link_id","author"])

In [227]:
results_per_thread_per_user

Unnamed: 0,name,parent_id,author,link_id,has_bidir
0,t1_i6y00op,t3_ug6my9,fixthismess,t3_ug6my9,
1,t1_i6y00mx,t1_i6x05ks,seravivi,t3_ufz7z8,
2,t1_i6xzzzq,t3_ufz1tz,SillySundae,t3_ufz1tz,
3,t1_i6xzz0k,t1_i6xbi37,GreyTartanTee,t3_ufz1tz,
4,t1_i6xzyet,t1_i6x1rdg,Magdalan,t3_ug00f6,
...,...,...,...,...,...
14165772,t3_wcb9qh,,jackson01105,t3_wcb9qh,
14165773,t3_wcb7vh,,LocksmithOk9368,t3_wcb7vh,
14165774,t3_wcb6ru,,kingmaster12345,t3_wcb6ru,
14165775,t3_wcb1ra,,B4NNED4LIFE,t3_wcb1ra,


In [228]:
results_per_thread_per_user = results_per_thread_per_user.fillna(0)

In [229]:
# When has_bidir NaN/0 user had no bidirectional communication there. Extracted these rows to no_bi_dir

no_bi_dir = results_per_thread_per_user[results_per_thread_per_user["has_bidir"] == 0]

In [230]:
no_bi_dir

Unnamed: 0,name,parent_id,author,link_id,has_bidir
0,t1_i6y00op,t3_ug6my9,fixthismess,t3_ug6my9,0
1,t1_i6y00mx,t1_i6x05ks,seravivi,t3_ufz7z8,0
2,t1_i6xzzzq,t3_ufz1tz,SillySundae,t3_ufz1tz,0
3,t1_i6xzz0k,t1_i6xbi37,GreyTartanTee,t3_ufz1tz,0
4,t1_i6xzyet,t1_i6x1rdg,Magdalan,t3_ug00f6,0
...,...,...,...,...,...
14165772,t3_wcb9qh,0,jackson01105,t3_wcb9qh,0
14165773,t3_wcb7vh,0,LocksmithOk9368,t3_wcb7vh,0
14165774,t3_wcb6ru,0,kingmaster12345,t3_wcb6ru,0
14165775,t3_wcb1ra,0,B4NNED4LIFE,t3_wcb1ra,0


In [231]:
# Ensure that every user in combination with every threads gets counted once

no_bi_dir = no_bi_dir.drop_duplicates(subset=["author","link_id"], keep="first")

In [232]:
no_bi_dir

Unnamed: 0,name,parent_id,author,link_id,has_bidir
0,t1_i6y00op,t3_ug6my9,fixthismess,t3_ug6my9,0
1,t1_i6y00mx,t1_i6x05ks,seravivi,t3_ufz7z8,0
2,t1_i6xzzzq,t3_ufz1tz,SillySundae,t3_ufz1tz,0
3,t1_i6xzz0k,t1_i6xbi37,GreyTartanTee,t3_ufz1tz,0
4,t1_i6xzyet,t1_i6x1rdg,Magdalan,t3_ug00f6,0
...,...,...,...,...,...
14165772,t3_wcb9qh,0,jackson01105,t3_wcb9qh,0
14165773,t3_wcb7vh,0,LocksmithOk9368,t3_wcb7vh,0
14165774,t3_wcb6ru,0,kingmaster12345,t3_wcb6ru,0
14165775,t3_wcb1ra,0,B4NNED4LIFE,t3_wcb1ra,0


In [233]:
# Calculate the amount of threads where the user had no bidirectional communication

no_bi_dir = no_bi_dir.groupby(["author"]).size().reset_index(name="Amount_no_bidir")

In [235]:
# Extract the threads, where a user had bidirectional communication

with_bi_dir = results_per_thread_per_user[results_per_thread_per_user["has_bidir"] != 0]

In [236]:
with_bi_dir = with_bi_dir.drop_duplicates(subset=["author","link_id"])

In [237]:
# Count amount of threads where user had bidirectional communication

with_bi_dir = with_bi_dir.groupby(["author"]).size().reset_index(name="Amount_bidir")

In [238]:
# Join with_bi_dir and no_bi_dir by author in dataframe 

thbi = no_bi_dir.merge(with_bi_dir, how="inner", left_on="author", right_on="author")

In [239]:
thbi

Unnamed: 0,author,Amount_no_bidir,Amount_bidir
0,----------_______---,12,4
1,--------3,2,1
2,-----1,15,2
3,-----_-_-_-_-_-----,6,3
4,-----deathgiver-----,1,1
...,...,...,...
353619,zzz_sleepy_bird_zzz,22,7
353620,zzzdeyzzz,2,1
353621,zzzkitten,4,1
353622,zzzzebras,2,1


In [240]:
# Calculate ratio of threads where user had bidirectional communication

thbi["thbi"] = thbi["Amount_bidir"] / (thbi["Amount_bidir"] + thbi["Amount_no_bidir"])

In [241]:
thbi  = thbi.drop(["Amount_no_bidir","Amount_bidir"], axis=1)

In [242]:
final_df = final_df.merge(thbi, how="left", right_on="author", left_on="author")

In [None]:
del df_sm,df_test,testi,no_bi_dir,with_bi_dir,thbi

## forum focus dispersion (entropy - lower if a user participates in few forums, higher if the user participates in many forums (ent)

In [243]:
# Calculate the amount of unique subreddits where a user participated

df_ent = df.groupby(["author","subreddit"]).size().reset_index(name="Count").groupby(["author"]).size().reset_index(name="amount")

In [244]:
# Calculate the ratio of subreddits where a user was engaged (11 subreddits where possible)

df_ent["entropy"] = df_ent["amount"] / 11

In [245]:
df_ent = df_ent.drop("amount", axis=1)

In [246]:
final_df = final_df.merge(df_ent, how="left", right_on="author", left_on="author")

In [247]:
final_df.to_csv("datasets/authors_presave.csv")

In [165]:
del df_ent

## in-degree (number of incoming edges from a user’s peers) (ind)

## Approach: Unique Personen  (response) / Unique Personen (No response)

In [248]:
#  Recreate reply graph - step 1 (length:1) 

df_reply_graph_len_1 = df[["name","parent_id","author", "link_id" , "subreddit", "created_utc"]].merge(df[["name","parent_id","author", "link_id" , "subreddit", "created_utc"]], left_on=["name","link_id"], right_on=["parent_id","link_id"], how="inner")

In [249]:
# Ensure that author_x and author_y is not the same user

df_reply_graph_len_1 = df_reply_graph_len_1[df_reply_graph_len_1["author_x"] != df_reply_graph_len_1["author_y"]]

In [250]:
df_reply_graph_len_1

Unnamed: 0,name_x,parent_id_x,author_x,link_id,subreddit_x,created_utc_x,name_y,parent_id_y,author_y,subreddit_y,created_utc_y
0,t1_i6y00op,t3_ug6my9,fixthismess,t3_ug6my9,TwoXChromosomes,1651435486,t1_i6yebi4,t1_i6y00op,wozxox3,TwoXChromosomes,1651441798
1,t1_i6xzxla,t1_i6xp12n,gfkjhsdfjhgsdjghf,t3_ufz1tz,TwoXChromosomes,1651435449,t1_i6y11vu,t1_i6xzxla,Technical-Finding681,TwoXChromosomes,1651435935
2,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,TwoXChromosomes,1651435448,t1_i6zfwnn,t1_i6xzxgj,Starchasm,TwoXChromosomes,1651460394
3,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,TwoXChromosomes,1651435448,t1_i6yod0l,t1_i6xzxgj,LucyWritesSmut,TwoXChromosomes,1651446411
4,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,TwoXChromosomes,1651435448,t1_i6y20kg,t1_i6xzxgj,HelenGonne,TwoXChromosomes,1651436360
...,...,...,...,...,...,...,...,...,...,...,...
12531253,t3_wcbqo3,,nimobo,t3_wcbqo3,Conservative,1659227551,t1_iibnjbm,t3_wcbqo3,Verdict1923,Conservative,1659227684
12531254,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,Conservative,1659227438,t1_iicea3k,t3_wcbpas,NotAbot10011,Conservative,1659241250
12531255,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,Conservative,1659227438,t1_iic76my,t3_wcbpas,flabiger,Conservative,1659237412
12531256,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,Conservative,1659227438,t1_iibuunb,t3_wcbpas,PB_Mack,Conservative,1659231258


In [251]:
# Exclude user names that were deleted

df_reply_graph_len_1 = df_reply_graph_len_1[df_reply_graph_len_1["author_x"] != "[deleted]"]

In [252]:
# Drop duplicates of combinations of author_x and author_y within the same thead, to just count one edge per responder per thread  

df_reply_graph_len_1 = df_reply_graph_len_1.drop_duplicates(subset=["author_x", "author_y", "link_id"])

In [253]:
# Calculate the unique incoming edges for author_x

df_amount_replies = df_reply_graph_len_1.groupby(["author_x","link_id"]).size().reset_index(name="Amount_replies_per_Thread").groupby("author_x")["Amount_replies_per_Thread"].sum().reset_index(name="Sum_replies")

In [254]:
df_amount_replies

Unnamed: 0,author_x,Sum_replies
0,------------------f,1
1,----------_______---,27
2,---------V---------,1
3,--------3,3
4,-------MANGO-------,1
...,...,...
741872,zzzzebras,1
741873,zzzzz94,2
741874,zzzzzacurry,3
741875,zzzzzxx,2


In [255]:
df_no_replies = df[["name","parent_id","author", "link_id" , "subreddit", "created_utc"]]

In [256]:
df_no_replies = df_no_replies.drop_duplicates(subset=["author", "link_id","subreddit"])

In [257]:
df_no_replies

Unnamed: 0,name,parent_id,author,link_id,subreddit,created_utc
0,t1_i6y00op,t3_ug6my9,fixthismess,t3_ug6my9,TwoXChromosomes,1651435486
1,t1_i6y00mx,t1_i6x05ks,seravivi,t3_ufz7z8,TwoXChromosomes,1651435486
2,t1_i6xzzzq,t3_ufz1tz,SillySundae,t3_ufz1tz,TwoXChromosomes,1651435477
3,t1_i6xzz0k,t1_i6xbi37,GreyTartanTee,t3_ufz1tz,TwoXChromosomes,1651435466
4,t1_i6xzyet,t1_i6x1rdg,Magdalan,t3_ug00f6,TwoXChromosomes,1651435459
...,...,...,...,...,...,...
18645652,t3_wcb9qh,,jackson01105,t3_wcb9qh,Conservative,1659226131
18645653,t3_wcb7vh,,LocksmithOk9368,t3_wcb7vh,Conservative,1659225974
18645654,t3_wcb6ru,,kingmaster12345,t3_wcb6ru,Conservative,1659225884
18645655,t3_wcb1ra,,B4NNED4LIFE,t3_wcb1ra,Conservative,1659225483


In [258]:
# Drop deleted users

df_no_replies = df_no_replies[df_no_replies["author"] != "[deleted]"]

In [259]:
df_no_replies

Unnamed: 0,name,parent_id,author,link_id,subreddit,created_utc
0,t1_i6y00op,t3_ug6my9,fixthismess,t3_ug6my9,TwoXChromosomes,1651435486
1,t1_i6y00mx,t1_i6x05ks,seravivi,t3_ufz7z8,TwoXChromosomes,1651435486
2,t1_i6xzzzq,t3_ufz1tz,SillySundae,t3_ufz1tz,TwoXChromosomes,1651435477
3,t1_i6xzz0k,t1_i6xbi37,GreyTartanTee,t3_ufz1tz,TwoXChromosomes,1651435466
4,t1_i6xzyet,t1_i6x1rdg,Magdalan,t3_ug00f6,TwoXChromosomes,1651435459
...,...,...,...,...,...,...
18645652,t3_wcb9qh,,jackson01105,t3_wcb9qh,Conservative,1659226131
18645653,t3_wcb7vh,,LocksmithOk9368,t3_wcb7vh,Conservative,1659225974
18645654,t3_wcb6ru,,kingmaster12345,t3_wcb6ru,Conservative,1659225884
18645655,t3_wcb1ra,,B4NNED4LIFE,t3_wcb1ra,Conservative,1659225483


In [260]:
# Calculate total edges/responses per thread for a user

df_total_replies = df_no_replies.groupby(["link_id","subreddit"]).size().reset_index(name="Amount_response_per_thread")

In [261]:
df_total_replies

Unnamed: 0,link_id,subreddit,Amount_response_per_thread
0,t3_10fa46,teenagers,1
1,t3_10llor,teenagers,2
2,t3_11jxau,teenagers,1
3,t3_11v4oi,teenagers,2
4,t3_120v55,teenagers,1
...,...,...,...
713697,t3_xbqpf,teenagers,1
713698,t3_y029c,gardening,1
713699,t3_yw4rq,teenagers,1
713700,t3_zh6ys,teenagers,4


In [262]:
df_total_replies = df_no_replies.merge(df_total_replies, how="inner", left_on="link_id",right_on="link_id")

In [263]:
df_total_replies

Unnamed: 0,name,parent_id,author,link_id,subreddit_x,created_utc,subreddit_y,Amount_response_per_thread
0,t1_i6y00op,t3_ug6my9,fixthismess,t3_ug6my9,TwoXChromosomes,1651435486,TwoXChromosomes,154
1,t1_i6xzoc3,t3_ug6my9,Hadespuppy,t3_ug6my9,TwoXChromosomes,1651435339,TwoXChromosomes,154
2,t1_i6xzcqr,t1_i6xwnfj,bulimicvegan,t3_ug6my9,TwoXChromosomes,1651435201,TwoXChromosomes,154
3,t1_i6xytz1,t1_i6xykir,Hanzo_The_Ninja,t3_ug6my9,TwoXChromosomes,1651434978,TwoXChromosomes,154
4,t1_i6xyt5y,t1_i6xsrgb,newwriter365,t3_ug6my9,TwoXChromosomes,1651434968,TwoXChromosomes,154
...,...,...,...,...,...,...,...,...
8791687,t3_wcb9qh,,jackson01105,t3_wcb9qh,Conservative,1659226131,Conservative,1
8791688,t3_wcb7vh,,LocksmithOk9368,t3_wcb7vh,Conservative,1659225974,Conservative,1
8791689,t3_wcb6ru,,kingmaster12345,t3_wcb6ru,Conservative,1659225884,Conservative,1
8791690,t3_wcb1ra,,B4NNED4LIFE,t3_wcb1ra,Conservative,1659225483,Conservative,1


In [264]:
# Calculate total amount of incoming edges from unique users

df_total_replies = df_total_replies.groupby("author")["Amount_response_per_thread"].sum().reset_index(name="All unique users")

In [265]:
df_total_replies

Unnamed: 0,author,All unique users
0,------------------16,4769
1,------------------GL,753
2,------------------f,293
3,---------------hw,9857
4,-----------1283,2007
...,...,...
1599021,zzzzzxx,3
1599022,zzzzzzen,4
1599023,zzzzzzz11101,1
1599024,zzzzzzzzzra,178


In [266]:
df_amount_replies = df_amount_replies.rename(columns={"author_x" : "author"})

In [267]:
final_df = final_df.merge(df_amount_replies, how="left", on="author")

In [268]:
final_df = final_df.merge(df_total_replies, how="left", on="author")

In [269]:
final_df[["Sum_replies","All unique users"]] = final_df[["Sum_replies","All unique users"]].fillna(0) 

In [270]:
final_df["indeg%"] = final_df["Sum_replies"] / final_df["All unique users"]

In [271]:
final_df[["indeg%"]] = final_df[["indeg%"]].fillna(0) 

In [272]:
final_df = final_df.drop(["Sum_replies","All unique users"],axis=1)

In [273]:
final_df = final_df.drop(["Bidir","Bidir_no_sub","no_bidir"],axis=1)

In [274]:
final_df

Unnamed: 0,author,author_fullname,mean_karma,th,mpth,spth,pr,bin,thbi,entropy,indeg%
0,fixthismess,t2_fnnaj,1.0,,1.000000,0.000000,0.153846,0.000000,,0.181818,0.001040
1,seravivi,t2_148x1q,1.0,,2.400000,3.130495,0.416667,0.166667,0.200000,0.090909,0.001332
2,SillySundae,t2_13722gob,1.0,,1.666667,1.118034,0.200000,0.083333,0.111111,0.363636,0.000103
3,GreyTartanTee,t2_bxckb6oc,1.0,,1.000000,,0.000000,0.000000,,0.090909,0.000000
4,Magdalan,t2_12yrq8,1.0,,2.073529,2.300656,0.333333,0.144000,0.220588,0.363636,0.001677
...,...,...,...,...,...,...,...,...,...,...,...
1599021,Husklik,t2_hm2edcrx,,0.000345,,,,0.000000,,0.090909,0.000000
1599022,jackson01105,t2_cjr1ikd2,,0.000345,,,,0.000000,,0.090909,0.000000
1599023,LocksmithOk9368,t2_akltk5pq,,0.000345,,,,0.000000,,0.090909,0.000000
1599024,kingmaster12345,t2_qoxvq11h,,0.000345,,,,0.000000,,0.090909,0.000000


## out-degree (number of outgoing edges to a user’s peers) (outdeg)


In [275]:
df_amount_replies = df[["name","parent_id","author", "link_id" , "subreddit", "created_utc"]]

In [276]:
# Remove deleted author names

df_amount_replies = df_amount_replies[df_amount_replies["author"] != "[deleted]"]

In [277]:
# Merge comments by name-parent_id and link_id

df_amount_replies = df_amount_replies.merge(df_amount_replies, left_on=["name","link_id"], right_on=["parent_id","link_id"], how="inner")

In [278]:
df_amount_replies

Unnamed: 0,name_x,parent_id_x,author_x,link_id,subreddit_x,created_utc_x,name_y,parent_id_y,author_y,subreddit_y,created_utc_y
0,t1_i6y00op,t3_ug6my9,fixthismess,t3_ug6my9,TwoXChromosomes,1651435486,t1_i6yebi4,t1_i6y00op,wozxox3,TwoXChromosomes,1651441798
1,t1_i6xzxla,t1_i6xp12n,gfkjhsdfjhgsdjghf,t3_ufz1tz,TwoXChromosomes,1651435449,t1_i6y11vu,t1_i6xzxla,Technical-Finding681,TwoXChromosomes,1651435935
2,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,TwoXChromosomes,1651435448,t1_i6zfwnn,t1_i6xzxgj,Starchasm,TwoXChromosomes,1651460394
3,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,TwoXChromosomes,1651435448,t1_i6yod0l,t1_i6xzxgj,LucyWritesSmut,TwoXChromosomes,1651446411
4,t1_i6xzxgj,t1_i6xsk2h,NoPlum2175,t3_ug383z,TwoXChromosomes,1651435448,t1_i6y20kg,t1_i6xzxgj,HelenGonne,TwoXChromosomes,1651436360
...,...,...,...,...,...,...,...,...,...,...,...
11809434,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,Conservative,1659227438,t1_iic76my,t3_wcbpas,flabiger,Conservative,1659237412
11809435,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,Conservative,1659227438,t1_iibuunb,t3_wcbpas,PB_Mack,Conservative,1659231258
11809436,t3_wcbpas,,ChunkyArsenio,t3_wcbpas,Conservative,1659227438,t1_iibn1nu,t3_wcbpas,ChunkyArsenio,Conservative,1659227452
11809437,t3_wcbohq,,ChunkyArsenio,t3_wcbohq,Conservative,1659227374,t1_iibtuwq,t3_wcbohq,automatedengineer,Conservative,1659230774


In [279]:
# Filter comments where author replied himself

df_amount_replies = df_amount_replies[df_amount_replies["author_x"] != df_amount_replies["author_y"]]

In [280]:
# Drop duplicates of edges between author_x and author_y within the same thread

df_amount_replies = df_amount_replies.drop_duplicates(subset=["author_x", "author_y", "link_id"])

In [281]:
# Calculate amount of replies to unique users by author_y 

df_amount_replies = df_amount_replies.groupby(["author_y","link_id"]).size().reset_index(name="Amount_replies_per_Thread").groupby("author_y")["Amount_replies_per_Thread"].sum().reset_index(name="Sum_replies")

In [282]:
df_amount_replies

Unnamed: 0,author_y,Sum_replies
0,------------------16,1
1,------------------f,2
2,---------------hw,2
3,-----------1283,1
4,----------_______---,20
...,...,...
1413189,zzzzebras,4
1413190,zzzzz94,6
1413191,zzzzzacurry,5
1413192,zzzzzzzzzra,1


In [283]:
df_amount_replies = df_amount_replies.rename(columns={"author_y" : "author"})

In [284]:
final_df = final_df.merge(df_amount_replies, how="left", on="author")

In [285]:
# Reuse total amount of edges per user 

final_df = final_df.merge(df_total_replies, how="left", on="author")

In [286]:
final_df[["Sum_replies","All unique users"]] = final_df[["Sum_replies","All unique users"]].fillna(0) 

In [287]:
final_df["outdeg%"] = final_df["Sum_replies"] / final_df["All unique users"]

In [288]:
final_df[["outdeg%"]] = final_df[["outdeg%"]].fillna(0) 

In [289]:
final_df = final_df.drop(["Sum_replies","All unique users"],axis=1)

In [290]:
final_df

Unnamed: 0,author,author_fullname,mean_karma,th,mpth,spth,pr,bin,thbi,entropy,indeg%,outdeg%
0,fixthismess,t2_fnnaj,1.0,,1.000000,0.000000,0.153846,0.000000,,0.181818,0.001040,0.001387
1,seravivi,t2_148x1q,1.0,,2.400000,3.130495,0.416667,0.166667,0.200000,0.090909,0.001332,0.002663
2,SillySundae,t2_13722gob,1.0,,1.666667,1.118034,0.200000,0.083333,0.111111,0.363636,0.000103,0.000617
3,GreyTartanTee,t2_bxckb6oc,1.0,,1.000000,,0.000000,0.000000,,0.090909,0.000000,0.000781
4,Magdalan,t2_12yrq8,1.0,,2.073529,2.300656,0.333333,0.144000,0.220588,0.363636,0.001677,0.003271
...,...,...,...,...,...,...,...,...,...,...,...,...
1599021,Husklik,t2_hm2edcrx,,0.000345,,,,0.000000,,0.090909,0.000000,0.000000
1599022,jackson01105,t2_cjr1ikd2,,0.000345,,,,0.000000,,0.090909,0.000000,0.000000
1599023,LocksmithOk9368,t2_akltk5pq,,0.000345,,,,0.000000,,0.090909,0.000000,0.000000
1599024,kingmaster12345,t2_qoxvq11h,,0.000345,,,,0.000000,,0.090909,0.000000,0.000000


In [291]:
final_df.to_csv("../datasets/author_metrics.csv")