# User-Data

Create a dataset of Users containing all relevant attributes to assess assortative mixing

In [3]:
import pandas as pd
from pathlib import Path

Load all Data

In [62]:
INPUT_DIR = Path("../../data/raw/")
OUTPUT_DIR = Path("../../data/processed/")
filename_following = "Following_Ignoring_Relationships_01052019_31052019.csv"

filename_votes_first = "Votes_01052019_15052019.csv"
filename_votes_second = "Votes_16052019_31052019.csv"

filename_postings_first = "Postings_01052019_15052019.csv"
filename_postings_second = "Postings_01052019_15052019.csv"

In [8]:
relationships = pd.read_csv(INPUT_DIR / filename_following, sep=';')
votes_first = pd.read_csv(INPUT_DIR / filename_votes_first, sep=';')
votes_second = pd.read_csv(INPUT_DIR / filename_votes_second, sep=';')

postings_first = pd.read_csv(INPUT_DIR / filename_postings_first, sep=';')
postings_second = pd.read_csv(INPUT_DIR / filename_postings_second, sep=';')

Filter the data for relevant attributes

## Relationships

In [9]:
relationships.head()

Unnamed: 0,ID_CommunityIdentity,ID_CommunityIdentityConnectedTo,ID_CommunityConnectionType
0,1778,246490,1
1,5872,5872,1
2,9030,23875,1
3,9030,508504,1
4,10569,10569,1


In [10]:
relationships = relationships.drop(columns=['ID_CommunityConnectionType'])

In [44]:
unique_user_ids_following = pd.unique(relationships[['ID_CommunityIdentity', 'ID_CommunityIdentityConnectedTo']].values.ravel('K'))

In [45]:
len(unique_user_ids_following)

17485

In [46]:
type(unique_user_ids_following)

numpy.ndarray

## Votes

In [18]:
votes = pd.concat([votes_first, votes_second])

In [19]:
votes = votes.drop(columns=["ID_Posting", "VotePositive", "VoteNegative", "VoteCreatedAt", "UserCommunityName"])

In [20]:
votes.head()

Unnamed: 0,ID_CommunityIdentity,UserGender,UserCreatedAt
0,675862,m,2018-06-26 06:04:30.513
1,689023,,2019-03-08 21:23:11.463
2,24810,m,2011-01-12 16:50:40.597
3,673781,,2018-05-29 07:13:49.350
4,24810,m,2011-01-12 16:50:40.597


In [21]:
unique_user_ids_votes = pd.unique(votes["ID_CommunityIdentity"])

In [23]:
len(unique_user_ids_votes)

14147

## Postings

In [29]:
postings = pd.concat([postings_first, postings_second])

In [30]:
postings = postings.drop(columns=["ID_Posting", "ID_Article", "PostingHeadline", "PostingComment", "PostingCreatedAt", "ArticlePublishingDate", "ArticleRessortName", "ArticleTitle", "ArticleChannel", "UserCommunityName", "ID_Posting_Parent"])

In [31]:
postings.head()

Unnamed: 0,ID_CommunityIdentity,UserGender,UserCreatedAt
0,671476,,2018-04-14 13:42:28.470
1,566938,m,2015-08-28 17:07:41.110
2,669286,,2018-03-06 20:03:42.737
3,671476,,2018-04-14 13:42:28.470
4,671476,,2018-04-14 13:42:28.470


In [32]:
unique_user_ids_postings = pd.unique(postings["ID_CommunityIdentity"])

In [33]:
len(unique_user_ids_postings)

6650

## Combine

In [34]:
votes_and_postings = pd.concat([votes, postings])

In [35]:
votes_and_postings.head()

Unnamed: 0,ID_CommunityIdentity,UserGender,UserCreatedAt
0,675862,m,2018-06-26 06:04:30.513
1,689023,,2019-03-08 21:23:11.463
2,24810,m,2011-01-12 16:50:40.597
3,673781,,2018-05-29 07:13:49.350
4,24810,m,2011-01-12 16:50:40.597


In [36]:
len(votes_and_postings)

351540

In [37]:
votes_and_postings = votes_and_postings.drop_duplicates(subset="ID_CommunityIdentity")

In [38]:
len(votes_and_postings)

16119

In [40]:
votes_and_postings = votes_and_postings.reset_index(drop=True)

In [48]:
users = pd.merge(pd.DataFrame(unique_user_ids_following, columns=['ID_CommunityIdentity']), votes_and_postings, on='ID_CommunityIdentity', how='left')

In [50]:
len(users)

17485

In [58]:
users['UserCreatedAt'] = pd.to_datetime(users['UserCreatedAt'])

users['UserCreatedAt'] = users['UserCreatedAt'].dt.date

In [60]:
users.head()

Unnamed: 0,ID_CommunityIdentity,UserGender,UserCreatedAt
0,1778,,NaT
1,5872,m,2003-01-05
2,9030,,NaT
3,10569,,NaT
4,13391,,NaT


In [61]:
users.isnull().sum() / len(users)

ID_CommunityIdentity    0.000000
UserGender              0.479897
UserCreatedAt           0.368144
dtype: float64

In [63]:
filename_output = "user.csv"
users.to_csv(OUTPUT_DIR / filename_output, index=False)