# Social Network Analysis - Project sna_topic_11

In [2]:
import numpy as np
np.random.seed(42)
import random
random.seed(42)

import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
pd.reset_option("^display")

In [4]:
pd.set_option('display.float_format', '{:20,.4f}'.format)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 2000)

## The Data

The data set is provided by Der Standard, one of the top Austrian newspapers.
In the online Standard people can post comments below articles and up/down vote comments.

1. There are two files containing all **postings** to articles in May 2019 (due to high amount of data, the postings were split into two files). The respective file also contains additionally meta-data of the postings and articles and some details about the users who composed the postings.

2. There are two files containing all **votes** for the postings in point 1 (due to high amount of data, the votes were split into two files). The respective file also contains information whether the vote was negative or positive and some details about the user who did the voting.

3. There is one file containing **following and ignoring relationships** among all the users who posted (see point 1) or voted (see point 2) to articles published in May 2019. A following relationship (i.e., the user with the `ID_CommunityIdentity` given in column 1 follows the user with the `ID_CommunityIdentityConnectedTo` given in column 2) is indicated by a “1” in column the `“ID_CommunityConnectionType”`, a ignoring relationship by a “2” in that column (i.e., the user with the `ID_CommunityIdentity` given in column 1 ignores the user with the `ID_CommunityIdentityConnectedTo` given in column 2).

There are different entities in the data set: 
* **Users** - identified by *ID_CommunityIdentity* (or *UserCommunityName*)
* **Postings** - identified by *ID_Posting*
* **Articles** - identified by *ID_Article*

In [5]:
# define path to datasets:
file_postings_1 = 'data/Postings_01052019_15052019.csv'
file_postings_2 = 'data/Postings_16052019_31052019.csv'
file_votes_1 = 'data/Votes_01052019_15052019.csv'
file_votes_2 = 'data/Votes_16052019_31052019.csv'
file_following_ignoring = 'data/Following_Ignoring_Relationships_01052019_31052019.csv'

output_dir = 'output/'

In [6]:
df_postings_1 = pd.read_csv(file_postings_1, sep=';')
display(df_postings_1.head(2))
df_postings_1.info()

Unnamed: 0,ID_Posting,ID_Posting_Parent,ID_CommunityIdentity,PostingHeadline,PostingComment,PostingCreatedAt,ID_Article,ArticlePublishingDate,ArticleTitle,ArticleChannel,ArticleRessortName,UserCommunityName,UserGender,UserCreatedAt
0,1041073586,1041073234.0,671476,Das hat gestern bereits der Voggenhuber angeführt!,schieder hatte dem inhaltlich nichts entgegenzusetzen. https://www.youtube.com/watch?v=yiJ-sdjn2Zg,2019-05-01 18:21:15.127,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Ravenspower,,2018-04-14 13:42:28.470
1,1041073839,1041072504.0,566938,,...und meinen Bezirk bekommst du als Erbe mit.,2019-05-01 18:28:22.040,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,AlphaRomeo,m,2015-08-28 17:07:41.110


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343160 entries, 0 to 343159
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ID_Posting             343160 non-null  int64  
 1   ID_Posting_Parent      237112 non-null  float64
 2   ID_CommunityIdentity   343160 non-null  int64  
 3   PostingHeadline        93344 non-null   object 
 4   PostingComment         313870 non-null  object 
 5   PostingCreatedAt       343160 non-null  object 
 6   ID_Article             343160 non-null  int64  
 7   ArticlePublishingDate  343160 non-null  object 
 8   ArticleTitle           343160 non-null  object 
 9   ArticleChannel         343160 non-null  object 
 10  ArticleRessortName     343160 non-null  object 
 11  UserCommunityName      343159 non-null  object 
 12  UserGender             256591 non-null  object 
 13  UserCreatedAt          343160 non-null  object 
dtypes: float64(1), int64(3), object(10)


In [7]:
df_postings_2 = pd.read_csv(file_postings_2, sep=';')
display(df_postings_2.head(2))
df_postings_2.info()

Unnamed: 0,ID_Posting,ID_Posting_Parent,ID_CommunityIdentity,PostingHeadline,PostingComment,PostingCreatedAt,ID_Article,ArticlePublishingDate,ArticleTitle,ArticleChannel,ArticleRessortName,UserCommunityName,UserGender,UserCreatedAt
0,1041515171,,182351,da hat er aber recht ...auch wenn hier nun einige noch immer Naive...,"denn Österreich ist von lauter sicheren (EU) Ländern umgeben...die Migranten kommen vorher schon über x sichere Drittländer(=GFK -nur im 1.sicheren Flüchtlingsstatus)... und detto sein Argument zu Griechenland (bei Italien reduziert sich das durch Salvinis Konsequenz gegen sg. ""Retter"" d. kurz vor d. Libyschen Küste wiedermal Schleppern d. Arbeit abnhemen und illegal in d. EU transportieren wollen UND es gab eine EU-Gipfelbschluß im Juni 2018 das auszutrocknen so what? Ö hat nebenbei pro Kopf d. höchsten Zahlen an sg. ""Flüchtlingen"" & auch sogar Anerennungen (+ was vergessen wird: kumulativ!) - auch hier schon mal geschrieben https://derstandard.at/2000082091102/Was-aus-liberaler-Sicht-fuer-eine-Festung-Europa-spricht Es wird also Zeit",2019-05-16 11:25:39.287,2000103241947,2019-05-16 10:57:22.00,Innenminister Kickl will überhaupt keine Asylanträge mehr,Inland,Integrationspolitik,nadaschauichaber,m,2012-11-25 15:09:03.087
1,1041515292,1041514595.0,182351,,außer von den Naiven die aus 2015 nichts gelernt haben und am Rechtsruck in Europa damit schuld sind - denn keiner will mehr weitere allzuheftiges enstastand inzwischen schon https://www.sueddeutsche.de/news/panorama/kriminalitaet---duesseldorf-die-ehre-der-familie-lagebild-sieht-104-kriminelle-clans-dpa.urn-newsml-dpa-com-20090101-190514-99-218262 bzw https://www.deutschlandfunk.de/erstes-lagebild-clankriminalitaet-im-kampf-gegen.720.de.html?dram:article_id=448878 wollen wir das auch?,2019-05-16 11:28:44.703,2000103241947,2019-05-16 10:57:22.00,Innenminister Kickl will überhaupt keine Asylanträge mehr,Inland,Integrationspolitik,nadaschauichaber,m,2012-11-25 15:09:03.087


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395934 entries, 0 to 395933
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ID_Posting             395934 non-null  int64  
 1   ID_Posting_Parent      263201 non-null  float64
 2   ID_CommunityIdentity   395934 non-null  int64  
 3   PostingHeadline        113953 non-null  object 
 4   PostingComment         363437 non-null  object 
 5   PostingCreatedAt       395934 non-null  object 
 6   ID_Article             395934 non-null  int64  
 7   ArticlePublishingDate  395934 non-null  object 
 8   ArticleTitle           395934 non-null  object 
 9   ArticleChannel         395934 non-null  object 
 10  ArticleRessortName     395934 non-null  object 
 11  UserCommunityName      395934 non-null  object 
 12  UserGender             293078 non-null  object 
 13  UserCreatedAt          395934 non-null  object 
dtypes: float64(1), int64(3), object(10)


In [8]:
df_votes_1 = pd.read_csv(file_votes_1, sep=';')
display(df_votes_1.head(2))
df_votes_1.info()

Unnamed: 0,ID_CommunityIdentity,ID_Posting,VoteNegative,VotePositive,VoteCreatedAt,UserCommunityName,UserGender,UserCreatedAt
0,675862,1041076570,1,0,2019-05-06 16:47:46.883,Heckscheibenwischer,m,2018-06-26 06:04:30.513
1,689023,1041076570,1,0,2019-05-01 22:19:06.240,Heinz Fettleber,,2019-03-08 21:23:11.463


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1570737 entries, 0 to 1570736
Data columns (total 8 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   ID_CommunityIdentity  1570737 non-null  int64 
 1   ID_Posting            1570737 non-null  int64 
 2   VoteNegative          1570737 non-null  int64 
 3   VotePositive          1570737 non-null  int64 
 4   VoteCreatedAt         1570737 non-null  object
 5   UserCommunityName     1570731 non-null  object
 6   UserGender            1212591 non-null  object
 7   UserCreatedAt         1570737 non-null  object
dtypes: int64(4), object(4)
memory usage: 95.9+ MB


In [9]:
df_votes_2 = pd.read_csv(file_votes_2, sep=';')
display(df_votes_2.head(2))
df_votes_2.info()

Unnamed: 0,ID_CommunityIdentity,ID_Posting,VoteNegative,VotePositive,VoteCreatedAt,UserCommunityName,UserGender,UserCreatedAt
0,571503,1041620947,0,1,2019-05-18 16:34:10.213,vonWeitem,m,2015-11-04 15:45:11.493
1,178936,1041622392,0,1,2019-05-18 15:57:07.637,phischi,m,2008-09-19 02:02:59.060


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2254247 entries, 0 to 2254246
Data columns (total 8 columns):
 #   Column                Dtype 
---  ------                ----- 
 0   ID_CommunityIdentity  int64 
 1   ID_Posting            int64 
 2   VoteNegative          int64 
 3   VotePositive          int64 
 4   VoteCreatedAt         object
 5   UserCommunityName     object
 6   UserGender            object
 7   UserCreatedAt         object
dtypes: int64(4), object(4)
memory usage: 137.6+ MB


In [10]:
df_following_ignoring = pd.read_csv(file_following_ignoring, sep=';')
display(df_following_ignoring.head(2))
df_following_ignoring.info()

Unnamed: 0,ID_CommunityIdentity,ID_CommunityIdentityConnectedTo,ID_CommunityConnectionType
0,1778,246490,1
1,5872,5872,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86776 entries, 0 to 86775
Data columns (total 3 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   ID_CommunityIdentity             86776 non-null  int64
 1   ID_CommunityIdentityConnectedTo  86776 non-null  int64
 2   ID_CommunityConnectionType       86776 non-null  int64
dtypes: int64(3)
memory usage: 2.0 MB


## Pre-processing Concat Dataset

In [11]:
# concat postings1 and postings2 in order! 
df_postings = pd.concat([df_postings_1, df_postings_2], ignore_index=True)

df_postings[['PostingCreatedAt', 'ArticlePublishingDate', 'UserCreatedAt']] = df_postings[['PostingCreatedAt', 'ArticlePublishingDate', 'UserCreatedAt']].astype('datetime64')

df_postings.info()
df_postings.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 739094 entries, 0 to 739093
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   ID_Posting             739094 non-null  int64         
 1   ID_Posting_Parent      500313 non-null  float64       
 2   ID_CommunityIdentity   739094 non-null  int64         
 3   PostingHeadline        207297 non-null  object        
 4   PostingComment         677307 non-null  object        
 5   PostingCreatedAt       739094 non-null  datetime64[ns]
 6   ID_Article             739094 non-null  int64         
 7   ArticlePublishingDate  739094 non-null  datetime64[ns]
 8   ArticleTitle           739094 non-null  object        
 9   ArticleChannel         739094 non-null  object        
 10  ArticleRessortName     739094 non-null  object        
 11  UserCommunityName      739093 non-null  object        
 12  UserGender             549669 non-null  obje

Unnamed: 0,ID_Posting,ID_Posting_Parent,ID_CommunityIdentity,PostingHeadline,PostingComment,PostingCreatedAt,ID_Article,ArticlePublishingDate,ArticleTitle,ArticleChannel,ArticleRessortName,UserCommunityName,UserGender,UserCreatedAt
0,1041073586,1041073234.0,671476,Das hat gestern bereits der Voggenhuber angeführt!,schieder hatte dem inhaltlich nichts entgegenzusetzen. https://www.youtube.com/watch?v=yiJ-sdjn2Zg,2019-05-01 18:21:15.127,2000102330973,2019-05-01 10:28:57.490,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Ravenspower,,2018-04-14 13:42:28.470
1,1041073839,1041072504.0,566938,,...und meinen Bezirk bekommst du als Erbe mit.,2019-05-01 18:28:22.040,2000102330973,2019-05-01 10:28:57.490,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,AlphaRomeo,m,2015-08-28 17:07:41.110


In [11]:
# TODO is it necessary to subset the data because it is too large?

#df_postings['PostingCreatedAt'].dt.date.head()
# df_postings = df_postings[pd.to_datetime(df_postings['PostingCreatedAt'].dt.date) == '2019-05-01']
df_postings.shape

(739094, 14)

## Play with some NLP extractions of text and one hot encoding

In [12]:
# download spacy german language model - used for tokenization, lemmatization of article text (vector data)
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.2.0/de_core_news_sm-3.2.0-py3-none-any.whl (19.1 MB)
[K     |████████████████████████████████| 19.1 MB 9.8 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [13]:
from sklearn.preprocessing import OneHotEncoder

# encode article channel and article ressort name with one hot encoding
 
# ArticleChannel	ArticleRessortName
one_hot_encoder_article_channel = OneHotEncoder(handle_unknown='ignore').fit(df_postings[['ArticleChannel', 'ArticleRessortName']])

print(one_hot_encoder_article_channel.categories_)

transformed_channel_resort = one_hot_encoder_article_channel.transform(df_postings[['ArticleChannel', 'ArticleRessortName']])
print(transformed_channel_resort.shape)
display(transformed_channel_resort.toarray())

[array(['AutoMobil', 'Bildung', 'Diverses', 'Etat', 'Familie',
       'Gesundheit', 'Immobilien', 'Inland', 'International', 'Karriere',
       'Kultur', 'Lifestyle', 'Meinung', 'Panorama', 'Reisen', 'Sport',
       'User', 'Web', 'Wirtschaft', 'Wissenschaft', 'Zukunft',
       'dieStandard'], dtype=object), array(['#MeToo und die Folgen', '1. FC Köln', '1., Innere Stadt',
       '10., Favoriten', '11., Simmering', '2., Leopoldstadt',
       '20 Jahre Etat', '21., Floridsdorf', '22., Donaustadt',
       '6., Mariahilf', '7., Neubau', 'Abtreibung', 'Adipositas',
       'Afghanistan', 'Afrika', 'Aktuelles Buch', 'Albanien', 'Albertina',
       'Albumkritiken', 'Albumkritiken 2019', 'Algerien', 'Alkohol',
       'Allergien', 'Alltag', 'Alternativmedizin', 'Alzheimer & Demenz',
       'Amerika', 'Analoge Games ', 'Android', 'András Szigetvari',
       'Antisemitismus und Rechtsextremismus', 'Antonio Fian: Dramolette',
       'Apple', 'Apps', 'Arbeit & Gesundheit', 'Arbeitsmarkt',
       'A

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

The following step for creating 'output/combinedPostingText.spacy' is usually not necessary. It takes around 64 minutes and you can just use my file:

In [134]:
import spacy
from spacy.tokens import DocBin
nlp = spacy.load("de_core_news_sm")

df_postings['CombinedArticlePostingText'] = df_postings['PostingHeadline'].fillna('') + ' ' + df_postings['PostingComment'].fillna('') + ' ' + df_postings['ArticleTitle'].fillna('')

doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)

for doc in nlp.pipe(df_postings['CombinedArticlePostingText'].str.lower()):
    #print(repr(doc))
    doc_bin.add(doc)

#bytes_data = doc_bin.to_bytes()
#doc_bin = DocBin().from_bytes(bytes_data)

file_name_spacy = 'output/combinedPostingText.spacy'
doc_bin.to_disk(file_name_spacy)

# save tokenized spacy Doc objects to file
# the file contains the combined text of posting headline, posting comment and article title in a tokenized form
# and also meta data about a token, like its lemma form, whether a token is a punctuation, or bracket etc.
# processing took around 64 minutes on the whole df_postings for me.

Continue here:

In [135]:
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import spacy
from spacy.tokens import DocBin
nlp = spacy.load("de_core_news_sm")

doc_bin = DocBin().from_disk(file_name_spacy)

docs = list(doc_bin.get_docs(nlp.vocab))
#print(docs)
print(len(docs))
tokenized_lemmatized_texts = [[token.lemma_ for token in doc 
                               if not token.is_stop and not token.is_punct and not token.is_space 
                               and not token.like_url and not token.like_email and not token.is_quote 
                               and not token.is_currency and not token.is_bracket and not token.is_quote
                               and not token.is_left_punct and not token.is_right_punct and not token.is_bracket]
                               for doc in docs]
print(len(tokenized_lemmatized_texts))
print(tokenized_lemmatized_texts[0])
print(tokenized_lemmatized_texts[100])


vectorizer = TfidfVectorizer(ngram_range=(1, 1), lowercase=False, tokenizer=lambda x: x, max_features=3000)
vectorizer = vectorizer.fit(tokenized_lemmatized_texts)
#print(list(vectorizer.get_feature_names_out()))

# TODO train test split!
# only fit on train data and not on test data!

# text_vectorized = vectorizer.transform(tokenized_lemmatized_texts)
#print(text_vectorized.toarray().shape)
#display(text_vectorized.toarray()[0])

739094
739094
['gestern', 'voggenhuber', 'anführen', 'schieder', 'inhaltlich', 'entgegensetzen', '1.', 'mai', 'wien', 'spö', 'fordern', 'strache', 'rücktritt']
['sofortig', 'stopp', 'subventionen', 'städter', 'steuergeld', 'wundern', 'landbewohner', 'fordern', 'leben', 'stadt', 'mein', 'land', 'aussehen', 'ruhig', 'schlafen', 'garantieren', 'köstinger', 'verfehlen', 'klimaziele', 'kosten', 'steuerentlastung']




In [138]:
# just a check to see which tf-idf features were extracted:
with open('output/feature_names.txt', 'w') as f:
    for item in vectorizer.get_feature_names_out():
        f.write("%s\n" % item)

## Relation 1: User_A commented/posted to post of User_B

### Extracting Edge relation

Instead of iterating over the dataframe rows to create a edge list, I use a join, which is way faster on the whole dataset. <br>
But, by default `merge` changes the sort order, so you have to sort by the original dataframe index!<br>
The following code creates a dataframe for posting source user and target posting user. It has the same shape as the original dataframe `df_postings`.
So, target posting user may be `NaN`, due to left join and preserving all observations in the posting dataframe.
One could filter the Target_User column by not `NaN` to get only relations between users.

In [12]:
postings_user_commented_relation_df = df_postings.reset_index()[['index', 'ID_Posting', 'ID_CommunityIdentity', 'PostingCreatedAt', 'ID_Posting_Parent']].merge(
    df_postings[['ID_Posting', 'ID_CommunityIdentity']], 
    left_on='ID_Posting_Parent', right_on='ID_Posting',
    suffixes=('', '_parent'), how='left', sort=False
)[['index', 'ID_Posting', 'PostingCreatedAt', 'ID_CommunityIdentity', 'ID_CommunityIdentity_parent']].sort_values(by='index')

postings_user_commented_relation_df.set_index('index', inplace=True) # index may be useful for selection?
# postings_user_commented_relation_df.set_index(['ID_Posting', 'PostingCreatedAt'], inplace=True) # index may be useful for selection?
postings_user_commented_relation_df.rename(columns={'ID_CommunityIdentity': 'Source_User', 'ID_CommunityIdentity_parent': 'Target_User'}, inplace=True)

# postings_user_commented_relation_df.drop('index', axis='columns', inplace=True) # index is ascending as original df_postings, index column not needed anymore

# SAME number of rows and SAME order as original df_postings:
print(df_postings.shape)
print(postings_user_commented_relation_df.shape)

display(postings_user_commented_relation_df.head(5))


(739094, 14)
(739094, 4)


Unnamed: 0_level_0,ID_Posting,PostingCreatedAt,Source_User,Target_User
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1041073586,2019-05-01 18:21:15.127,671476,233191.0
1,1041073839,2019-05-01 18:28:22.040,566938,640123.0
2,1041073872,2019-05-01 18:29:05.533,669286,680772.0
3,1041080734,2019-05-01 22:37:56.010,671476,51817.0
4,1041080828,2019-05-01 22:42:06.310,671476,


In [70]:
# Writing to a file takes longer than just obtaining the edge list.
postings_user_commented_relation_df.to_csv("output/postings_user_commented_relation.csv")

In [13]:
postings_user_commented_relation_nonan_df = postings_user_commented_relation_df.dropna()
# check number of rows after dropping NAN
print(postings_user_commented_relation_df.shape)
print(postings_user_commented_relation_nonan_df.shape)

(739094, 4)
(500312, 4)


### Undirected Graph measures

see for example:
* https://networkx.org/documentation/stable/reference/algorithms/link_prediction.html


In [73]:
# Create undirected graph:
G_postings_commented_undirected = nx.from_pandas_edgelist(postings_user_commented_relation_nonan_df,
                                                          source='Source_User',
                                                          target='Target_User',
                                                          create_using=nx.Graph())

In [75]:
print(nx.info(G_postings_commented_undirected))

Graph with 21761 nodes and 337257 edges


#### Jaccard Index

In [131]:
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_prediction.jaccard_coefficient.html#networkx.algorithms.link_prediction.jaccard_coefficient
G_postings_jaccard_coefs_iter = nx.jaccard_coefficient(G_postings_commented_undirected)
#for u, v, p in G_postings_jaccard_coefs:
    #print(f"({u}, {v}) -> {p:.8f}")

# returns iterator
# G_postings_jaccard_coefs = list(G_postings_jaccard_coefs_iter)
# print(len(G_postings_jaccard_coefs))
# display(G_postings_jaccard_coefs[0:5])

In [None]:
G_postings_jaccard_coefs = list(filter(lambda r: r[2] != 0, nx.jaccard_coefficient(G_postings_commented_undirected))) # filter out those coefs with value zero, can be derived

In [143]:
with open('output/postings_user_commented_jaccard_coefs.txt', 'w') as f:
    for u, v, p in filter(lambda r: r[2] != 0, nx.jaccard_coefficient(G_postings_commented_undirected)):
        f.write(f"{u},{v},{p:.10f}\n")

#### Adamic-Adar

In [None]:
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_prediction.adamic_adar_index.html#networkx.algorithms.link_prediction.adamic_adar_index

G_postings_adamic_adar_iter = nx.adamic_adar_index(G_postings_commented_undirected)
#for u, v, p in G_postings_adamic_adar_iter:
    #print(f"({u}, {v}) -> {p:.8f}")
# returns iterator

In [None]:
G_postings_adamic_adar = list(filter(lambda r: r[2] != 0, nx.adamic_adar_index(G_postings_commented_undirected)))   # filter out those coefs with value zero, can be derived

In [None]:
with open('output/postings_user_commented_adamic_adar.txt', 'w') as f:
    for u, v, p in filter(lambda r: r[2] != 0, nx.adamic_adar_index(G_postings_commented_undirected)):
        f.write(f"{u},{v},{p:.10f}\n")

#### Preferential attachment score

In [None]:
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_prediction.preferential_attachment.html#networkx.algorithms.link_prediction.preferential_attachment

G_postings_preferential_attachment_iter = nx.preferential_attachment(G_postings_commented_undirected)
#for u, v, p in G_postings_preferential_attachment_iter:
    #print(f"({u}, {v}) -> {p:.8f}")
# returns iterator

In [None]:
G_postings_preferential_attachment = list(filter(lambda r: r[2] != 0, nx.preferential_attachment(G_postings_commented_undirected)))   # filter out those coefs with value zero, can be derived

In [None]:
with open('output/postings_user_commented_preferential_attachment.txt', 'w') as f:
    for u, v, p in filter(lambda r: r[2] != 0, nx.preferential_attachment(G_postings_commented_undirected)):
        f.write(f"{u},{v},{p:.10f}\n")

### Directed Graph (DiGraph)

In [37]:
# Create multi directed graph:


postings_user_commented_relation_nonan_count_df = postings_user_commented_relation_nonan_df[['ID_Posting', 'Source_User', 'Target_User']].groupby(['Source_User', 'Target_User'])['ID_Posting'].count().reset_index(name="weight")

G_postings_commented_directed = nx.from_pandas_edgelist(postings_user_commented_relation_nonan_count_df,
                                                          source='Source_User',
                                                          target='Target_User',
                                                          edge_attr='weight',
                                                          create_using=nx.DiGraph())
display(postings_user_commented_relation_nonan_count_df.sort_values('weight'))
print(nx.info(G_postings_commented_directed))

Unnamed: 0,Source_User,Target_User,weight
0,38,30911.0000,1
259918,577332,83284.0000,1
259916,577332,76585.0000,1
259914,577332,60845.0000,1
259912,577332,58849.0000,1
...,...,...,...
212099,538958,668321.0000,131
32176,51140,635206.0000,133
312795,635206,51140.0000,136
269117,582531,553628.0000,209


DiGraph with 21761 nodes and 404234 edges


In [38]:
# check if edge data is named 'weight' - is default edge data key to use as weight:
print(G_postings_commented_directed.get_edge_data(635206, 51140))
print(G_postings_commented_directed.get_edge_data(51140, 635206))

{'weight': 136}
{'weight': 133}


#### PageRank

Returns the PageRank of the nodes in the graph.
PageRank computes a ranking of the nodes in the graph G based on the structure of the incoming links. It was originally designed as an algorithm to rank web pages.

Edge data attribute 'weight' is default edge data key to use as weight.


In [39]:
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank
# Edge data key to use as weight is by default 'weight'.

G_postings_pagerank = nx.pagerank(G_postings_commented_directed)
# Returns the PageRank of the nodes in the graph.

In [40]:
# expecting length of dictionary to be number of nodes in graph
print(len(G_postings_pagerank))
G_postings_pagerank

21761


{38: 1.2249058598721818e-05,
 30911.0: 8.877108790930713e-05,
 170664.0: 0.00015987779443988565,
 234591.0: 5.721715974589273e-05,
 508346.0: 0.0001572258382390197,
 553371.0: 0.00013233537522584917,
 556513.0: 0.0007781561477493687,
 693786.0: 0.00013440752351819992,
 50: 1.690836347518488e-05,
 7652.0: 0.00022781664259237166,
 516173.0: 0.0010945546130256528,
 667739.0: 0.00033775748439483604,
 77: 1.4509751866557404e-05,
 135856.0: 7.031503080400138e-05,
 237529.0: 2.5964967545656608e-05,
 519189.0: 0.0005257408553813981,
 519657.0: 0.0021997460905035897,
 653998.0: 0.0004792194223743271,
 676028.0: 0.0005842582615026205,
 689506.0: 0.00010442486036455212,
 148: 1.076161239393451e-05,
 174116.0: 0.00038136456626167425,
 233: 1.672295621319266e-05,
 188128.0: 1.4892135799873878e-05,
 233356.0: 0.00029530061129730446,
 304: 2.542696247458096e-05,
 101845.0: 8.392640897063384e-05,
 136338.0: 0.00012396847744744133,
 178915.0: 0.0003027501883274946,
 190950.0: 0.00019195709767888614,
 2

#### HITS hubs and authorities values for nodes

In [None]:
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_analysis.hits_alg.hits.html#networkx.algorithms.link_analysis.hits_alg.hits

G_postings_hits = nx.hits(G_postings_commented_directed)


#### Katz weighted


In [41]:
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.katz_centrality.html#networkx.algorithms.centrality.katz_centrality

G_postings_katz_weighted = nx.katz_centrality(G_postings_commented_directed)