In [41]:
import pandas as pd
import networkx as nx
from pathlib import Path

# Setup & Imports

In [42]:
DATA_DIR = Path("../../data/raw/")
filename_votes_first = "Votes_01052019_15052019.csv"
filename_votes_second = "Votes_16052019_31052019.csv"

filename_postings_first = "Postings_01052019_15052019.csv"
filename_postings_second = "Postings_01052019_15052019.csv"

# use this output-path for saving figures
FIG_OUTPUT_PATH = Path("../../reports/figures/votes/")

In [43]:
votes_first = pd.read_csv(DATA_DIR / filename_votes_first, sep=';')
votes_second = pd.read_csv(DATA_DIR / filename_votes_second, sep=';')

postings_first = pd.read_csv(DATA_DIR / filename_postings_first, sep=';')
postings_second = pd.read_csv(DATA_DIR / filename_postings_second, sep=';')

In [44]:
votes = pd.concat([votes_first, votes_second])
postings = pd.concat([postings_first, postings_second])

# Basic Exploration

In [48]:
postings.columns

Index(['ID_Posting', 'ID_Posting_Parent', 'ID_CommunityIdentity',
       'PostingHeadline', 'PostingComment', 'PostingCreatedAt', 'ID_Article',
       'ArticlePublishingDate', 'ArticleTitle', 'ArticleChannel',
       'ArticleRessortName', 'UserCommunityName', 'UserGender',
       'UserCreatedAt'],
      dtype='object')

In [59]:
postings = postings[["ID_Posting", "ID_CommunityIdentity",  "UserCommunityName", "UserGender"]]

In [60]:
postings.head()

Unnamed: 0,ID_Posting,ID_CommunityIdentity,UserCommunityName,UserGender
0,1041073586,671476,Ravenspower,
1,1041073839,566938,AlphaRomeo,m
2,1041073872,669286,Hpolditsch,
3,1041080734,671476,Ravenspower,
4,1041080828,671476,Ravenspower,


In [61]:
votes.head()

Unnamed: 0,ID_CommunityIdentity,ID_Posting,VoteNegative,VotePositive,VoteCreatedAt,UserCommunityName,UserGender,UserCreatedAt
0,675862,1041076570,1,0,2019-05-06 16:47:46.883,Heckscheibenwischer,m,2018-06-26 06:04:30.513
1,689023,1041076570,1,0,2019-05-01 22:19:06.240,Heinz Fettleber,,2019-03-08 21:23:11.463
2,24810,1041076745,0,1,2019-05-01 23:54:54.600,Bruce Campbell,m,2011-01-12 16:50:40.597
3,673781,1041076745,0,1,2019-05-01 20:59:29.910,Erdäpfelsack,,2018-05-29 07:13:49.350
4,24810,1041076831,0,1,2019-05-01 23:51:42.730,Bruce Campbell,m,2011-01-12 16:50:40.597


In [64]:
merged_df = pd.merge(votes, postings, on='ID_Posting', suffixes=("_source", "_target"))

In [65]:
merged_df.head()

Unnamed: 0,ID_CommunityIdentity_source,ID_Posting,VoteNegative,VotePositive,VoteCreatedAt,UserCommunityName_source,UserGender_source,UserCreatedAt,ID_CommunityIdentity_target,UserCommunityName_target,UserGender_target
0,675862,1041076570,1,0,2019-05-06 16:47:46.883,Heckscheibenwischer,m,2018-06-26 06:04:30.513,691035,Kopper Stefan,
1,675862,1041076570,1,0,2019-05-06 16:47:46.883,Heckscheibenwischer,m,2018-06-26 06:04:30.513,691035,Kopper Stefan,
2,689023,1041076570,1,0,2019-05-01 22:19:06.240,Heinz Fettleber,,2019-03-08 21:23:11.463,691035,Kopper Stefan,
3,689023,1041076570,1,0,2019-05-01 22:19:06.240,Heinz Fettleber,,2019-03-08 21:23:11.463,691035,Kopper Stefan,
4,68791,1041077081,1,0,2019-05-01 21:01:32.037,guitarero,m,2004-06-30 00:34:39.990,107237,hubsi990,m


In [34]:
positives = round(votes.VotePositive[votes.VotePositive==1].count()/len(votes), 2)

print(f"Votes positive: {positives} \nVotes negative: {round(1-positives, 2)}")

Votes positive: 0.8 
Votes negative: 0.2


In [55]:
votes.head()

Unnamed: 0,ID_CommunityIdentity,ID_Posting,VoteNegative,VotePositive,VoteCreatedAt,UserCommunityName,UserGender,UserCreatedAt
0,675862,1041076570,1,0,2019-05-06 16:47:46.883,Heckscheibenwischer,m,2018-06-26 06:04:30.513
1,689023,1041076570,1,0,2019-05-01 22:19:06.240,Heinz Fettleber,,2019-03-08 21:23:11.463
2,24810,1041076745,0,1,2019-05-01 23:54:54.600,Bruce Campbell,m,2011-01-12 16:50:40.597
3,673781,1041076745,0,1,2019-05-01 20:59:29.910,Erdäpfelsack,,2018-05-29 07:13:49.350
4,24810,1041076831,0,1,2019-05-01 23:51:42.730,Bruce Campbell,m,2011-01-12 16:50:40.597


In [50]:
votes.UserCommunityName.nunique()

14147

# Analysis

## Construct graphs