# Import needed libraries

In [None]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path


# Extract the postings data

In [None]:
# Figures produced by this notebook should be saved below this directory
FIG_OUTPUT_PATH = Path("../../reports/figures/postings/")

# Raw postings exports: one CSV per half of May 2019, both located in DATA_DIR
DATA_DIR = Path("../../data/raw/")
filename_first, filename_second = (
    "Postings_01052019_15052019.csv",
    "Postings_16052019_31052019.csv",
)

In [None]:
def _read_postings(csv_name):
    """Load one ';'-separated postings export from DATA_DIR with every column read as text."""
    return pd.read_csv(DATA_DIR / csv_name, sep=';', dtype=str)


postings1_df = _read_postings(filename_first)
postings2_df = _read_postings(filename_second)

In [None]:
# Year against which the account age is measured (kept as a named constant so it is easy to change)
REFERENCE_YEAR = 2024

# Stack the two half-month exports into one frame. ignore_index=True gives every
# posting its own row label; without it the labels of both files (each starting
# at 0) would be repeated, which makes any label-based lookup ambiguous.
postings_df = pd.concat([postings1_df, postings2_df], ignore_index=True)

# Convert the creation date of the account to the age of the account (in years)
postings_df['UserCreatedAt'] = REFERENCE_YEAR - pd.to_datetime(postings_df['UserCreatedAt']).dt.year
postings_df = postings_df.rename(columns={'UserCreatedAt': 'AccountAge'})


In [None]:
# Count the distinct users in the dataset - each of them will be a node of the networks
len(pd.unique(postings_df['ID_CommunityIdentity']))

Now we create a new dataframe in which we store the attributes for each user, that will be the attributes of each node in the networks. 

For this analysis we are interested in:
- gender of the user
- age of the account

In [None]:
# Node attributes (one node = one user): keep one row per distinct
# (user id, gender, account age) combination found in the postings
attribute_columns = ['ID_CommunityIdentity', 'UserGender', 'AccountAge']
df_attributes = postings_df.loc[:, attribute_columns].drop_duplicates()

# Note from an earlier check: there are 21 different years of account creation
df_attributes.shape[0]

## Filter data according to the Article Channel

The idea is to measure assortative mixing by gender and possibly the age of the account considering the different topics of the articles under which users posted a comment. 

In [None]:
# Distinct article channels, followed by how many of them there are
channels = postings_df['ArticleChannel'].unique()

print(channels, len(channels), sep='\n')


In [None]:
# Get the list of unique sub-channels (ressorts) of the 'Inland' channel.
# The combined frame is used so that both half-month exports are covered,
# not only the first one.
is_inland = postings_df['ArticleChannel'] == 'Inland'
sub_channels = postings_df.loc[is_inland, 'ArticleRessortName'].unique()

print(sub_channels)
print(len(sub_channels))

Since they are all more or less related to politics, we will use 'Inland' to study assortative mixing by gender and age in politics. 
A further breakdown of the Inland channel could be considered, but this would require a more specific analysis.

(Judging from the results, I assume that 'Inland' stands for domestic politics.)

In [None]:
# Split the postings by the value of their 'ArticleChannel' column
grouped_df = postings_df.groupby('ArticleChannel')

# Map each channel name to an independent copy of that channel's postings
channel_dfs = {channel: channel_group.copy() for channel, channel_group in grouped_df}

In [None]:
# Report the (rows, columns) shape of every per-channel DataFrame
for channel in channel_dfs:
    df = channel_dfs[channel]
    print(f"Shape of DataFrame for {channel}: {df.shape}")

Measure the index for a subset of topics:
- Sport
- 'Wirtschaft' - Economy
- 'Kultur' - Culture
- 'Bildung' - Education
- 'Karriere' - Career
- 'Familie' - Family
- Inland (for politics)

In [None]:
# Channels to analyse. Remember to add 'Inland' - it is left out for now because it is slow.
selected_channels = ['Wirtschaft', 'Sport', 'Kultur', 'Bildung', 'Karriere', 'Familie']

# Keep only the per-channel DataFrames of the selected channels
filtered_dict = {name: channel_dfs[name] for name in channel_dfs if name in selected_channels}

## Build the networks

In [None]:
# Create dictionaries to store edge lists and weighted edge lists
edge_lists = {}
weighted_edge_lists = {}

# Create a dictionary to store, per channel, a DataFrame containing source (replying user),
# target (replied-to user) and weight (number of replies from source to target)
edges_dfs = {}


# Iterate through the dictionary of DataFrames
for channel, df in filtered_dict.items():
    # Map every posting id to its author once, instead of scanning the whole
    # frame for each reply. keep='first' picks the first row of a posting id.
    author_by_posting = (
        df.drop_duplicates(subset='ID_Posting', keep='first')
          .set_index('ID_Posting')['ID_CommunityIdentity']
    )

    # A reply is a posting that has a parent posting
    replies = df[df['ID_Posting_Parent'].notna()]
    reply_edges = pd.DataFrame({
        'source': replies['ID_CommunityIdentity'].to_numpy(),
        'target': replies['ID_Posting_Parent'].map(author_by_posting).to_numpy(),
    })

    # Skip replies whose author or parent author is unknown, e.g. because the
    # parent posting is not part of this channel's data; such a reply cannot
    # form an edge between two users.
    reply_edges = reply_edges.dropna(subset=['source', 'target'])

    # One [source, target] entry per reply (repeated pairs are kept)
    edgeList = reply_edges[['source', 'target']].values.tolist()

    # Collapse repeated pairs into a single edge weighted by the number of replies
    edges_df = (
        reply_edges.groupby(['source', 'target'], sort=False)
                   .size()
                   .reset_index(name='weight')
    )         # needed for creating the network
    weightedEdgeList = list(edges_df.itertuples(index=False, name=None))

    # Store the results for the current channel
    edge_lists[channel] = edgeList
    weighted_edge_lists[channel] = weightedEdgeList
    edges_dfs[channel] = edges_df



In [None]:
# Create and store a directed, weighted graph for each edges_df
graphs = {}

# Node attributes keyed by user id. The row label of df_attributes is NOT the
# user id, so the attributes are re-indexed by 'ID_CommunityIdentity'; if a user
# appears with several attribute combinations, the first one is used.
node_attributes = (
    df_attributes.drop_duplicates(subset='ID_CommunityIdentity')
                 .set_index('ID_CommunityIdentity')[['UserGender', 'AccountAge']]
                 .to_dict(orient='index')
)

for channel_name, df in edges_dfs.items():
    G = nx.from_pandas_edgelist(df, 'source', 'target', edge_attr='weight', create_using=nx.DiGraph())

    # Attach gender and account age to the users that actually appear in this
    # network, without adding any extra nodes
    nx.set_node_attributes(
        G,
        {node: node_attributes[node] for node in G.nodes if node in node_attributes},
    )

    graphs[channel_name] = G


In [None]:
# For every channel network, compute the attribute assortativity coefficient
# for the gender and account-age node attributes and print both values
for channel, G in graphs.items():
    assortativity_gender, assortativity_age = (
        nx.attribute_assortativity_coefficient(G, attribute=attribute_name)
        for attribute_name in ('UserGender', 'AccountAge')
    )

    # One report per channel, closed by a separator line
    print(
        f"Channel: {channel}",
        f"Assortative Mixing by Gender: {assortativity_gender:.4f}",
        f"Assortative Mixing by Age: {assortativity_age:.4f}",
        '------------------------------------------------------',
        sep='\n',
    )

---------------------------------------------------------------------------------------
STILL TO DO:
- see how missing values are handled
- try to plot something
- votes network