# Social Network Analysis - Project sna_topic_11

In [1]:
import numpy as np
np.random.seed(42)
import random
random.seed(42)

import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress
import pickle
from itertools import compress, product
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Reset every pandas option whose name matches the regex "^display" back to its default value.
pd.reset_option("^display")

In [3]:
# Notebook-wide pandas display settings (option name -> value), applied in one pass.
_display_options = {
    'display.float_format': '{:20,.4f}'.format,  # fixed width, thousands separator, 4 decimals
    'display.max_colwidth': None,                # never truncate cell contents
    'display.max_rows': 100,
    'display.max_columns': 100,
    'display.width': 2000,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

## The Data

The data set is provided by Der Standard, one of the top Austrian newspapers.
In the online Standard people can post comments below articles and up/down vote comments.

1. There are two files containing all **postings** to articles in May 2019 (due to high amount of data, the postings were split into two files). The respective file also contains additionally meta-data of the postings and articles and some details about the users who composed the postings.

2. There are two files containing all **votes** for the postings in point 1 (due to high amount of data, the votes were split into two files). The respective file also contains information whether the vote was negative or positive and some details about the user who did the voting.

3. There is one file containing **following and ignoring relationships** among all the users who posted (see point 1) or voted (see point 2) on articles published in May 2019. A following relationship (i.e., the user with the `ID_CommunityIdentity` given in column 1 follows the user with the `ID_CommunityIdentityConnectedTo` given in column 2) is indicated by a “1” in the column `ID_CommunityConnectionType`, an ignoring relationship by a “2” in that column (i.e., the user with the `ID_CommunityIdentity` given in column 1 ignores the user with the `ID_CommunityIdentityConnectedTo` given in column 2).

There are different entities in the data set: 
* **Users** - identified by *ID_CommunityIdentity* (or *UserCommunityName*)
* **Postings** - identified by *ID_Posting*
* **Articles** - identified by *ID_Article*

In [4]:
# Relative paths to the input CSV files.
# Postings and votes are each split into two files (1-15 May and 16-31 May 2019).
file_postings_1 = 'data/Postings_01052019_15052019.csv'
file_postings_2 = 'data/Postings_16052019_31052019.csv'
file_votes_1 = 'data/Votes_01052019_15052019.csv'
file_votes_2 = 'data/Votes_16052019_31052019.csv'
file_following_ignoring = 'data/Following_Ignoring_Relationships_01052019_31052019.csv'

# Output directory.
# NOTE(review): the cells visible in this notebook hard-code 'output/...' paths instead of using this variable.
output_dir = 'output/'

In [5]:
# Load the first postings file (semicolon-separated), then preview two rows and the column summary.
df_postings_1 = pd.read_csv(file_postings_1, sep=';')
display(df_postings_1.head(2))
df_postings_1.info()

Unnamed: 0,ID_Posting,ID_Posting_Parent,ID_CommunityIdentity,PostingHeadline,PostingComment,PostingCreatedAt,ID_Article,ArticlePublishingDate,ArticleTitle,ArticleChannel,ArticleRessortName,UserCommunityName,UserGender,UserCreatedAt
0,1041073586,1041073234.0,671476,Das hat gestern bereits der Voggenhuber angeführt!,schieder hatte dem inhaltlich nichts entgegenzusetzen. https://www.youtube.com/watch?v=yiJ-sdjn2Zg,2019-05-01 18:21:15.127,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Ravenspower,,2018-04-14 13:42:28.470
1,1041073839,1041072504.0,566938,,...und meinen Bezirk bekommst du als Erbe mit.,2019-05-01 18:28:22.040,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,AlphaRomeo,m,2015-08-28 17:07:41.110


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343160 entries, 0 to 343159
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ID_Posting             343160 non-null  int64  
 1   ID_Posting_Parent      237112 non-null  float64
 2   ID_CommunityIdentity   343160 non-null  int64  
 3   PostingHeadline        93344 non-null   object 
 4   PostingComment         313870 non-null  object 
 5   PostingCreatedAt       343160 non-null  object 
 6   ID_Article             343160 non-null  int64  
 7   ArticlePublishingDate  343160 non-null  object 
 8   ArticleTitle           343160 non-null  object 
 9   ArticleChannel         343160 non-null  object 
 10  ArticleRessortName     343160 non-null  object 
 11  UserCommunityName      343159 non-null  object 
 12  UserGender             256591 non-null  object 
 13  UserCreatedAt          343160 non-null  object 
dtypes: float64(1), int64(3), object(10)


In [6]:
# Load the second postings file (semicolon-separated), then preview two rows and the column summary.
df_postings_2 = pd.read_csv(file_postings_2, sep=';')
display(df_postings_2.head(2))
df_postings_2.info()

Unnamed: 0,ID_Posting,ID_Posting_Parent,ID_CommunityIdentity,PostingHeadline,PostingComment,PostingCreatedAt,ID_Article,ArticlePublishingDate,ArticleTitle,ArticleChannel,ArticleRessortName,UserCommunityName,UserGender,UserCreatedAt
0,1041515171,,182351,da hat er aber recht ...auch wenn hier nun einige noch immer Naive...,"denn Österreich ist von lauter sicheren (EU) Ländern umgeben...die Migranten kommen vorher schon über x sichere Drittländer(=GFK -nur im 1.sicheren Flüchtlingsstatus)... und detto sein Argument zu Griechenland (bei Italien reduziert sich das durch Salvinis Konsequenz gegen sg. ""Retter"" d. kurz vor d. Libyschen Küste wiedermal Schleppern d. Arbeit abnhemen und illegal in d. EU transportieren wollen UND es gab eine EU-Gipfelbschluß im Juni 2018 das auszutrocknen so what? Ö hat nebenbei pro Kopf d. höchsten Zahlen an sg. ""Flüchtlingen"" & auch sogar Anerennungen (+ was vergessen wird: kumulativ!) - auch hier schon mal geschrieben https://derstandard.at/2000082091102/Was-aus-liberaler-Sicht-fuer-eine-Festung-Europa-spricht Es wird also Zeit",2019-05-16 11:25:39.287,2000103241947,2019-05-16 10:57:22.00,Innenminister Kickl will überhaupt keine Asylanträge mehr,Inland,Integrationspolitik,nadaschauichaber,m,2012-11-25 15:09:03.087
1,1041515292,1041514595.0,182351,,außer von den Naiven die aus 2015 nichts gelernt haben und am Rechtsruck in Europa damit schuld sind - denn keiner will mehr weitere allzuheftiges enstastand inzwischen schon https://www.sueddeutsche.de/news/panorama/kriminalitaet---duesseldorf-die-ehre-der-familie-lagebild-sieht-104-kriminelle-clans-dpa.urn-newsml-dpa-com-20090101-190514-99-218262 bzw https://www.deutschlandfunk.de/erstes-lagebild-clankriminalitaet-im-kampf-gegen.720.de.html?dram:article_id=448878 wollen wir das auch?,2019-05-16 11:28:44.703,2000103241947,2019-05-16 10:57:22.00,Innenminister Kickl will überhaupt keine Asylanträge mehr,Inland,Integrationspolitik,nadaschauichaber,m,2012-11-25 15:09:03.087


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395934 entries, 0 to 395933
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ID_Posting             395934 non-null  int64  
 1   ID_Posting_Parent      263201 non-null  float64
 2   ID_CommunityIdentity   395934 non-null  int64  
 3   PostingHeadline        113953 non-null  object 
 4   PostingComment         363437 non-null  object 
 5   PostingCreatedAt       395934 non-null  object 
 6   ID_Article             395934 non-null  int64  
 7   ArticlePublishingDate  395934 non-null  object 
 8   ArticleTitle           395934 non-null  object 
 9   ArticleChannel         395934 non-null  object 
 10  ArticleRessortName     395934 non-null  object 
 11  UserCommunityName      395934 non-null  object 
 12  UserGender             293078 non-null  object 
 13  UserCreatedAt          395934 non-null  object 
dtypes: float64(1), int64(3), object(10)


## Pre-processing Concat Dataset

In [7]:
# Concatenate both posting files in file order (first half of May, then second half).
# The resulting row order matters: later cells align other per-posting data with df_postings by position.
df_postings = pd.concat([df_postings_1, df_postings_2], ignore_index=True)

# Parse the timestamp columns (read as strings from the CSV).
# An explicit unit is required: pandas >= 2.0 rejects the unit-less 'datetime64' dtype in astype().
datetime_columns = ['PostingCreatedAt', 'ArticlePublishingDate', 'UserCreatedAt']
df_postings[datetime_columns] = df_postings[datetime_columns].astype('datetime64[ns]')

df_postings.info()
df_postings.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 739094 entries, 0 to 739093
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   ID_Posting             739094 non-null  int64         
 1   ID_Posting_Parent      500313 non-null  float64       
 2   ID_CommunityIdentity   739094 non-null  int64         
 3   PostingHeadline        207297 non-null  object        
 4   PostingComment         677307 non-null  object        
 5   PostingCreatedAt       739094 non-null  datetime64[ns]
 6   ID_Article             739094 non-null  int64         
 7   ArticlePublishingDate  739094 non-null  datetime64[ns]
 8   ArticleTitle           739094 non-null  object        
 9   ArticleChannel         739094 non-null  object        
 10  ArticleRessortName     739094 non-null  object        
 11  UserCommunityName      739093 non-null  object        
 12  UserGender             549669 non-null  obje

Unnamed: 0,ID_Posting,ID_Posting_Parent,ID_CommunityIdentity,PostingHeadline,PostingComment,PostingCreatedAt,ID_Article,ArticlePublishingDate,ArticleTitle,ArticleChannel,ArticleRessortName,UserCommunityName,UserGender,UserCreatedAt
0,1041073586,1041073234.0,671476,Das hat gestern bereits der Voggenhuber angeführt!,schieder hatte dem inhaltlich nichts entgegenzusetzen. https://www.youtube.com/watch?v=yiJ-sdjn2Zg,2019-05-01 18:21:15.127,2000102330973,2019-05-01 10:28:57.490,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Ravenspower,,2018-04-14 13:42:28.470
1,1041073839,1041072504.0,566938,,...und meinen Bezirk bekommst du als Erbe mit.,2019-05-01 18:28:22.040,2000102330973,2019-05-01 10:28:57.490,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,AlphaRomeo,m,2015-08-28 17:07:41.110


In [8]:
# TODO: decide whether the data has to be subset (e.g. to a single day) because it is too large.
# Sketch for restricting the postings to 2019-05-01 (currently disabled):
#df_postings['PostingCreatedAt'].dt.date.head()
# df_postings = df_postings[pd.to_datetime(df_postings['PostingCreatedAt'].dt.date) == '2019-05-01']
df_postings.shape

(739094, 14)

## Relation 1: User_A commented/posted to post of User_B

### Extracting Edge relation

Instead of iterating over the dataframe rows to create an edge list, I use a join, which is much faster on the whole dataset. <br>
But, by default `merge` changes the sort order, so you have to sort by the original dataframe index!<br>
The following code creates a dataframe for posting source user and target posting user. It has the same shape as the original dataframe `df_postings`.
So, target posting user may be `NaN`, due to left join and preserving all observations in the posting dataframe.
One could filter the Target_User column by not `NaN` to get only relations between users.

In [9]:
# Look-up table used to resolve the author of a parent posting: posting id -> author id.
posting_authors = df_postings[['ID_Posting', 'ID_CommunityIdentity']]

# Left-join every posting with its parent posting to obtain (replying user, replied-to user).
# The original row position is carried along as the 'index' column so that the original
# df_postings order can be restored by sorting after the merge.
# Postings without a (known) parent keep NaN as Target_User.
postings_user_commented_relation_df = (
    df_postings
    .reset_index()
    .loc[:, ['index', 'ID_Posting', 'ID_CommunityIdentity', 'PostingCreatedAt', 'ID_Posting_Parent']]
    .merge(posting_authors,
           left_on='ID_Posting_Parent', right_on='ID_Posting',
           suffixes=('', '_parent'), how='left', sort=False)
    .loc[:, ['index', 'ID_Posting', 'PostingCreatedAt', 'ID_CommunityIdentity', 'ID_CommunityIdentity_parent']]
    .sort_values(by='index')
    .set_index('index')
    .rename(columns={'ID_CommunityIdentity': 'Source_User', 'ID_CommunityIdentity_parent': 'Target_User'})
)

# SAME number of rows and SAME order as original df_postings:
print(df_postings.shape)
print(postings_user_commented_relation_df.shape)
assert df_postings.shape[0] == postings_user_commented_relation_df.shape[0]

# Add column PostingCreatedAtDay for easy access to the day of the month.
postings_user_commented_relation_df['PostingCreatedAtDay'] = postings_user_commented_relation_df['PostingCreatedAt'].dt.day

display(postings_user_commented_relation_df.head(5))
# DataFrame.info() prints its report itself and returns None; wrapping it in display()
# only rendered an additional stray "None", so it is called directly.
postings_user_commented_relation_df.info()

(739094, 14)
(739094, 4)


Unnamed: 0_level_0,ID_Posting,PostingCreatedAt,Source_User,Target_User,PostingCreatedAtDay
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1041073586,2019-05-01 18:21:15.127,671476,233191.0,1
1,1041073839,2019-05-01 18:28:22.040,566938,640123.0,1
2,1041073872,2019-05-01 18:29:05.533,669286,680772.0,1
3,1041080734,2019-05-01 22:37:56.010,671476,51817.0,1
4,1041080828,2019-05-01 22:42:06.310,671476,,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 739094 entries, 0 to 739093
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ID_Posting           739094 non-null  int64         
 1   PostingCreatedAt     739094 non-null  datetime64[ns]
 2   Source_User          739094 non-null  int64         
 3   Target_User          500312 non-null  float64       
 4   PostingCreatedAtDay  739094 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 33.8 MB


None

In [10]:
def get_mask_created_at(series, day_start, day_end):
    """
    Build a boolean mask marking the entries of `series` whose value lies in the
    closed day range day_start..day_end (both bounds inclusive).
    """
    wanted_days = range(day_start, day_end + 1)
    return series.isin(wanted_days)

In [11]:
# Example: select the postings created on days 2 to 4 (both inclusive).
mask_posting_created_at = get_mask_created_at(
    postings_user_commented_relation_df['PostingCreatedAtDay'], day_start=2, day_end=4)
_postings_days_2to4 = postings_user_commented_relation_df.loc[mask_posting_created_at]

display(_postings_days_2to4.sample(10))
# Distinct days present in the selection.
list(_postings_days_2to4['PostingCreatedAtDay'].unique())

Unnamed: 0_level_0,ID_Posting,PostingCreatedAt,Source_User,Target_User,PostingCreatedAtDay
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
261921,1041122420,2019-05-03 13:15:27.730,652259,572922.0,3
274585,1041124660,2019-05-03 14:23:14.000,634552,500307.0,3
214053,1041136645,2019-05-03 21:31:47.440,584635,,3
40877,1041106463,2019-05-02 20:57:40.253,690687,,2
63312,1041132144,2019-05-03 18:34:18.970,35950,,3
289961,1041124479,2019-05-03 14:16:08.487,666841,,3
213338,1041121051,2019-05-03 12:37:15.593,173358,222059.0,3
12658,1041127145,2019-05-03 15:42:57.790,548063,635120.0,3
23537,1041105562,2019-05-02 20:23:52.823,499569,2968.0,2
50516,1041100737,2019-05-02 17:04:43.847,558714,,2


[3, 2, 4]

Be careful when using the postings relation dataframe without `NaN` values below. Some operations (the NLP text extraction) require the original row order of `df_postings`!

In [12]:
# Drop every row that contains a NaN in any column.
postings_user_commented_relation_nonan_df = postings_user_commented_relation_df.dropna(axis='index', how='any')
# Compare the number of rows before and after dropping the NaN rows.
for _frame in (postings_user_commented_relation_df, postings_user_commented_relation_nonan_df):
    print(_frame.shape)

(739094, 5)
(500312, 5)


### Compute graph topology features

The following function can compute graph features based on posting relation data frame and start and end day:

In [13]:
def _timestamp():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS' (used in progress messages)."""
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def _pair_score_frame(scored_pairs, score_column, expected_rows):
    """Turn an iterable of (u, v, score) triples into a Source_User/Target_User/<score_column> frame."""
    frame = pd.DataFrame(list(scored_pairs), columns=['Source_User', 'Target_User', score_column])
    assert frame.shape[0] == expected_rows
    return frame


def _add_node_score_for_both_users(pair_df, node_scores, score_name, expected_nodes):
    """
    Left-join a per-node score (dict node -> value) onto a user-pair frame.

    Adds the columns '<score_name>_Source_User' and '<score_name>_Target_User' (in that order).
    """
    node_df = (pd.DataFrame.from_dict(node_scores, orient='index', columns=[score_name])
               .reset_index()
               .rename(columns={'index': 'User'}))
    assert node_df.shape[0] == expected_nodes
    for role in ('Source_User', 'Target_User'):
        pair_df = pair_df.merge(
            node_df.rename(columns={'User': role, score_name: '{}_{}'.format(score_name, role)}),
            on=role, how='left')
    return pair_df


def computation_input_df_combined_postings(postings_relation_df, day_start, day_end):
    """
    Compute graph topology features for all ordered user pairs of a day window and pickle them.

    The postings of the days day_start..day_end (both inclusive) that have a Target_User are
    aggregated into a reply graph (edge weight = number of postings from Source_User to
    Target_User). For every ordered pair of distinct nodes the following features are computed:
    Jaccard coefficient, Adamic-Adar index and preferential attachment (on the undirected graph),
    plus clustering coefficient and PageRank of both users (on the directed graph).

    Parameters
    ----------
    postings_relation_df : pandas.DataFrame
        Needs the columns 'ID_Posting', 'Source_User', 'Target_User' and 'PostingCreatedAtDay'.
        The frame is not modified.
    day_start, day_end : int
        Day-of-month window, both inclusive, with 0 < day_start <= day_end <= 31.

    Returns
    -------
    pandas.DataFrame
        One row per ordered pair of distinct users, i.e. n * (n - 1) rows for n users.
        Be aware that this grows quadratically with the number of users in the window.

    Raises
    ------
    AssertionError
        If the day window is invalid or an internal consistency check fails.
    ValueError
        If the day window contains no posting with a Target_User.

    Side effects
    ------------
    Writes the result to 'output/G_features_postings_days_<day_start>-<day_end>.pkl'
    (the directory is created if it is missing) and prints progress messages.
    """
    import os  # local import: only needed to create the output directory

    print('computation_input_df_combined_postings: Days {}-{}'.format(str(day_start), str(day_end)))
    assert 0 < day_start <= day_end <= 31

    # Postings created inside the day window. Boolean indexing returns a new frame,
    # so the caller's frame is never modified and no up-front copy is needed.
    mask_posting_created_at = get_mask_created_at(postings_relation_df['PostingCreatedAtDay'],
                                                  day_start=day_start, day_end=day_end)
    postings_relation_df_day_subset = postings_relation_df.loc[mask_posting_created_at]
    assert postings_relation_df_day_subset['PostingCreatedAtDay'].isin(range(day_start, day_end + 1)).all()

    # Rows without Target_User describe no relation. Only that column decides,
    # so NaN values in unrelated columns do not discard valid relations.
    relations = postings_relation_df_day_subset.dropna(subset=['Target_User']).copy()
    relations['Target_User'] = relations['Target_User'].astype('int64')
    if relations.empty:
        raise ValueError('No postings with a Target_User found for days {}-{}.'.format(day_start, day_end))

    # Interaction count per (Source_User, Target_User) is used as edge weight.
    edge_weights = (relations
                    .groupby(['Source_User', 'Target_User'])['ID_Posting']
                    .count()
                    .reset_index(name="weight"))

    # Directed, weighted graph for the day range.
    G_directed = nx.from_pandas_edgelist(edge_weights,
                                         source='Source_User',
                                         target='Target_User',
                                         edge_attr='weight',
                                         create_using=nx.DiGraph())
    # nx.info() was removed in NetworkX 3.0, so the summary is built by hand.
    print('Created a DiGraph with {} nodes and {} edges'.format(
        G_directed.number_of_nodes(), G_directed.number_of_edges()))
    # The graph must carry the positive interaction count as 'weight' edge attribute.
    first_edge = edge_weights.iloc[0]
    assert G_directed.get_edge_data(first_edge['Source_User'], first_edge['Target_User'])['weight'] > 0

    # Undirected, unweighted graph: the neighbourhood-based link prediction indices are computed on it.
    G_undirected = nx.from_pandas_edgelist(edge_weights,
                                           source='Source_User',
                                           target='Target_User',
                                           create_using=nx.Graph())
    print('Created a Graph with {} nodes and {} edges'.format(
        G_undirected.number_of_nodes(), G_undirected.number_of_edges()))

    # All ordered pairs of distinct nodes: n * (n - 1) pairs.
    nodes = list(G_directed.nodes)
    n_pairs = len(nodes) * (len(nodes) - 1)
    ebunch = [(i, j) for i, j in product(nodes, nodes) if i != j]
    assert len(ebunch) == n_pairs

    print(f'{_timestamp()} Starting computation')

    pair_keys = ['Source_User', 'Target_User']

    # Jaccard coefficient
    result_df = _pair_score_frame(nx.jaccard_coefficient(G_undirected, ebunch), 'jaccard_coef', n_pairs)
    print(f'{_timestamp()} Jaccard: ok!')

    # Adamic and Adar index
    ad_ad_df = _pair_score_frame(nx.adamic_adar_index(G_undirected, ebunch), 'adamic_adar_index', n_pairs)
    print(f'{_timestamp()} Adamic and Adar: ok!')
    result_df = result_df.merge(ad_ad_df, on=pair_keys)

    # Preferential attachment
    pref_att_df = _pair_score_frame(nx.preferential_attachment(G_undirected, ebunch),
                                    'preferential_attachment_index', n_pairs)
    print(f'{_timestamp()} Preferential Attachment: ok!')
    result_df = result_df.merge(pref_att_df, on=pair_keys)

    # Clustering coefficient (on directed graph), attached for source and target user.
    result_df = _add_node_score_for_both_users(result_df, nx.clustering(G_directed),
                                               'clustering_coefficient_score', len(nodes))
    print(f'{_timestamp()} Clustering coefficient: ok!')

    # PageRank (on directed graph), attached for source and target user.
    result_df = _add_node_score_for_both_users(result_df, nx.pagerank(G_directed),
                                               'pagerank', len(nodes))
    print(f'{_timestamp()} Pagerank: ok!')

    print('Resulting result_df has shape', result_df.shape)
    assert result_df.shape[0] == n_pairs
    display(result_df.head(5))

    # Write the features for all Source_User/Target_User combinations to a pickle file.
    pickle_out_file_path_features = 'output/G_features_postings_days_{}-{}.pkl'.format(str(day_start), str(day_end))
    os.makedirs(os.path.dirname(pickle_out_file_path_features), exist_ok=True)
    with open(pickle_out_file_path_features, 'wb') as f:
        pickle.dump(result_df, f)
    print(f'{_timestamp()} Successfully written result_df to file "{pickle_out_file_path_features}".')

    # Callers assign the result of this function, so hand the features back instead of None.
    return result_df


#### Compute graph features

Calculate for days and save to file:

In [None]:
# Day windows (day_start, day_end), both inclusive, for which graph features are computed and
# written to 'output/': the combined windows 1-4 and 6-9 plus each of their single days.
# Remove entries from the list to compute only some of the windows.
feature_day_windows = [
    (1, 4), (1, 1), (2, 2), (3, 3), (4, 4),
    (6, 9), (6, 6), (7, 7), (8, 8), (9, 9),
]

for _day_start, _day_end in feature_day_windows:
    # Only the result of the most recent window is kept in memory; every window is persisted to disk.
    postings_graph_features = computation_input_df_combined_postings(
        postings_user_commented_relation_df, day_start=_day_start, day_end=_day_end)

## Play with some NLP extractions of text and one hot encoding

In [None]:
# Download the German spaCy language model "de_core_news_sm"; it is loaded further down via spacy.load().
# NOTE(review): `!python` runs whichever interpreter is first on PATH, which is not necessarily the kernel's environment — confirm.
!python -m spacy download de_core_news_sm

### One-Hot encoding (NLP)

One Hot encoding of `'ArticleChannel'`, `'ArticleRessortName'` for ML pipeline:

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the article channel and the article ressort name.

# TODO: decide whether to fit on all data (as done here) or only on the training split.
# Assuming all article channels and ressort names of derStandard are known a priori, fitting on all data is acceptable.
# If the number of features is too high for the ML pipeline, fit on a subset of df_postings (e.g. certain days).

# TODO: be aware of the row order when adding the encoded columns to other features.

categorical_article_columns = ['ArticleChannel', 'ArticleRessortName']
one_hot_encoder_article_channel = OneHotEncoder(handle_unknown='ignore').fit(df_postings[categorical_article_columns])

print(one_hot_encoder_article_channel.categories_)

# The transformed result is a sparse matrix with one row per posting.
transformed_channel_resort = one_hot_encoder_article_channel.transform(df_postings[categorical_article_columns])
print(transformed_channel_resort.shape)
# Densify only a small preview: calling .toarray() on the full matrix allocates
# (number of postings x number of categories) dense values just to display them.
display(transformed_channel_resort[:5].toarray())

### TF-IDF feature extraction (NLP)

TF-IDF feature extraction of combined `PostingHeadline`, `PostingComment` and `ArticleTitle` next:

In [None]:
# Path of the serialized spaCy DocBin file (written with DocBin.to_disk and read back with DocBin.from_disk).
file_name_spacy = 'output/combinedPostingText.spacy'

The following step for creating `'output/combinedPostingText.spacy'` is usually not necessary. It takes around 64 minutes and you can just use my file: <br>
Run the next chunk, if you want to wait long for computing the `"combinedPostingText.spacy"` file.<br>
`doc_bin` holds document information in order of `df_postings`. Existing `combinedPostingText.spacy` stores nlp information for all `df_postings` rows, not just a subset of days. Take care of the order of `df_postings`!

In [None]:
import spacy
from spacy.tokens import DocBin
nlp = spacy.load("de_core_news_sm")

# Combine posting headline, posting comment and article title into one text per posting;
# missing parts are replaced by ''. Note: this adds a new column to df_postings.
df_postings['CombinedArticlePostingText'] = df_postings['PostingHeadline'].fillna('') + ' ' + df_postings['PostingComment'].fillna('') + ' ' + df_postings['ArticleTitle'].fillna('')

# Container that collects the processed documents for serialization.
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)

# Process the lower-cased combined texts in df_postings row order and collect the Doc objects.
for doc in nlp.pipe(df_postings['CombinedArticlePostingText'].str.lower()):
    #print(repr(doc))
    doc_bin.add(doc)

#bytes_data = doc_bin.to_bytes()
#doc_bin = DocBin().from_bytes(bytes_data)

# Save the processed spaCy Doc objects to file.
doc_bin.to_disk(file_name_spacy)

# The file contains the combined text of posting headline, posting comment and article title in tokenized form,
# together with the token attributes listed in `attrs` above.
# Processing took around 64 minutes on the whole df_postings for me.

The next step takes around 5 minutes for me. You can also skip it and just use the pickle file `tokenized_lemmatized_texts.pkl`.

In [None]:
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import spacy
from spacy.tokens import DocBin
nlp = spacy.load("de_core_news_sm")


def _is_content_token(token):
    """True for tokens that are kept: everything except stop words, punctuation, whitespace,
    URLs, e-mail addresses, quotes, currency symbols and brackets."""
    return not (
        token.is_stop or token.is_punct or token.is_space
        or token.like_url or token.like_email or token.is_quote
        or token.is_currency or token.is_bracket
        or token.is_left_punct or token.is_right_punct
    )


# Restore the processed documents from disk.
doc_bin = DocBin().from_disk(file_name_spacy)
docs = list(doc_bin.get_docs(nlp.vocab))
print(len(docs))

# One list of lemmas per document, keeping only the content tokens.
tokenized_lemmatized_texts = []
for doc in docs:
    tokenized_lemmatized_texts.append([token.lemma_ for token in doc if _is_content_token(token)])

print(len(tokenized_lemmatized_texts))
print(tokenized_lemmatized_texts[0])
print(tokenized_lemmatized_texts[100])

# tokenized_lemmatized_texts contains nlp information for all documents in df_postings in order!

# Save as pickle file:
with open('output/tokenized_lemmatized_texts.pkl', 'wb') as f:
    pickle.dump(tokenized_lemmatized_texts, f)

Continue here: Load the pickled file `tokenized_lemmatized_texts.pkl`:

In [None]:
# Load the cached list of token lists.
# NOTE: pickle.load can execute arbitrary code while loading — only load files from a trusted source.
with open('output/tokenized_lemmatized_texts.pkl', 'rb') as f:
    tokenized_lemmatized_texts = pickle.load(f)

In [None]:
# tokenized_lemmatized_texts contains the NLP information for all documents in df_postings, in the same order.
# It is a list of token lists: one list of tokens per posting entry in df_postings,
# built from the combined posting/article text of that entry.
print(tokenized_lemmatized_texts[0:2])
print(len(tokenized_lemmatized_texts))
print(len(df_postings))

# The position-based alignment with df_postings requires both to have the same length.
assert len(tokenized_lemmatized_texts) == len(df_postings)

I will show an example of how to use the NLP tf-idf features by adding them to the matrix of graph features.<br>
The example uses days 1–4 combined. We may use these graph features for training and testing!

In [None]:
# Training set: the graph-based features are taken from days 1-4 combined,
# the vector-based NLP features from the postings of days 1-4.

# Mask: posting was created on days 1-4.
mask_posting_created_at_days_1to4 = get_mask_created_at(postings_user_commented_relation_df['PostingCreatedAtDay'],
                                                        day_start=1, day_end=4)
# Mask: posting has a Target_User (i.e. Target_User is not NaN).
mask_target_user_non_nan = postings_user_commented_relation_df['Target_User'].notna()

mask_posting_days_1to4_target_user_non_nan = (mask_posting_created_at_days_1to4 & mask_target_user_non_nan)

# tokenized_lemmatized_texts is aligned with the dataframe rows by position,
# so the boolean mask selects exactly the texts of the selected postings.
tokenized_lemmatized_texts_at_days_1to4 = list(compress(tokenized_lemmatized_texts,
                                                        mask_posting_days_1to4_target_user_non_nan.tolist()))
assert len(tokenized_lemmatized_texts_at_days_1to4) == mask_posting_days_1to4_target_user_non_nan.sum()
display(tokenized_lemmatized_texts_at_days_1to4[0:1])
print('There are', len(tokenized_lemmatized_texts_at_days_1to4),'tokenized-lemmatized texts for these days where Target_User is not NaN')

# Sanity checks on the selected rows.
_selected_relations_days_1to4 = postings_user_commented_relation_df.loc[mask_posting_days_1to4_target_user_non_nan]
assert not _selected_relations_days_1to4.isna().any().any()
assert _selected_relations_days_1to4['PostingCreatedAtDay'].isin(range(1, 4+1)).all()
# No selected posting may come from days 5-31. The previous `assert ~(...).all()` did not check
# this: `~` on a Python bool is a bitwise NOT that is always truthy, and `.all()` tested the wrong condition.
assert not _selected_relations_days_1to4['PostingCreatedAtDay'].isin(range(5, 31+1)).any()
assert len(tokenized_lemmatized_texts_at_days_1to4) == _selected_relations_days_1to4.shape[0]

In [None]:
# Upper bound for the number of TF-IDF terms (passed as max_features to TfidfVectorizer).
max_tf_idf_features = 500

In [None]:
# TODO: train/test split!
# Fit only on the training data, never on the test data.

# The texts are already lists of lemmas, so the tokenizer simply passes them through
# and no lower-casing is applied.
tf_idf_vectorizer = TfidfVectorizer(
    max_features=max_tf_idf_features,
    tokenizer=lambda x: x,
    lowercase=False,
    ngram_range=(1, 1),
)
# fit() returns the fitted vectorizer itself, so `vectorizer` refers to the same object.
tf_idf_vectorizer.fit(tokenized_lemmatized_texts_at_days_1to4)  # training data
vectorizer = tf_idf_vectorizer
print(list(tf_idf_vectorizer.get_feature_names_out()))

In [None]:
# TF-IDF features of the training texts as a dense array (use the same call for test data later).
text_tf_idf_features_train = tf_idf_vectorizer.transform(tokenized_lemmatized_texts_at_days_1to4).toarray()
print('TF-IDF vectorized text features have shape:', text_tf_idf_features_train.shape)

# One row per text and one column per TF-IDF term.
_n_rows, _n_columns = text_tf_idf_features_train.shape
assert _n_rows == len(tokenized_lemmatized_texts_at_days_1to4)
assert _n_columns == max_tf_idf_features

In [None]:
# Append the tf-idf features to the right of the selected postings_user_commented_relation_df rows.
# BE CAREFUL OF THE ORDER: both parts are aligned purely by row position!
_relations_days_1to4 = (postings_user_commented_relation_df
                        .loc[mask_posting_days_1to4_target_user_non_nan]
                        .reset_index(drop=True))
_tf_idf_features_train_df = pd.DataFrame(text_tf_idf_features_train)

display(_relations_days_1to4.head())
print(_relations_days_1to4.shape)

display(_tf_idf_features_train_df.head())
print(_tf_idf_features_train_df.shape)

# Column-wise concatenation of relation columns and tf-idf columns.
postings_user_commented_relation_df_with_tfidf_features = pd.concat(
    [_relations_days_1to4, _tf_idf_features_train_df], axis=1, sort=False)

postings_user_commented_relation_df_with_tfidf_features['Target_User'] = (
    postings_user_commented_relation_df_with_tfidf_features['Target_User'].astype('int64'))

display(postings_user_commented_relation_df_with_tfidf_features)
print(postings_user_commented_relation_df_with_tfidf_features.shape)

# Row count equals the number of texts; column count is the sum of both parts.
_n_result_rows, _n_result_columns = postings_user_commented_relation_df_with_tfidf_features.shape
assert _n_result_rows == text_tf_idf_features_train.shape[0]
assert _n_result_columns == (text_tf_idf_features_train.shape[1] + _relations_days_1to4.shape[1])

In [None]:
# Make sure to copy the pickle file from our shared Google Drive folder.
# NOTE: pickle.load can execute arbitrary code while loading — only load files from a trusted source.
with open('output/G_features_postings_days_1-4.pkl', 'rb') as f:
    G_features_postings_days_1to4 = pickle.load(f)
# Dataframe of graph features for Source_User and Target_User
G_features_postings_days_1to4.head()

In [None]:
# Show the loaded graph feature dataframe.
G_features_postings_days_1to4

In [None]:
# Attach the graph features of each (Source_User, Target_User) pair to the posting rows;
# the left join keeps every posting row.
postings_tfidf_and_graph_features_only_positive = postings_user_commented_relation_df_with_tfidf_features.merge(
    G_features_postings_days_1to4,
    how='left',
    on=['Source_User', 'Target_User'],
)

In [None]:
# Spot check: rows where user 671476 (Source_User) replied to user 233191 (Target_User).
postings_user_commented_relation_df_with_tfidf_features[(postings_user_commented_relation_df_with_tfidf_features['Source_User'] == 671476) & ((postings_user_commented_relation_df_with_tfidf_features['Target_User'] == 233191))]

In [None]:
# Spot check: all graph feature rows where user 233191 is the Source_User.
# NOTE(review): 233191 is the Target_User in the tf-idf spot check of this notebook — confirm that the reverse direction is intended here.
G_features_postings_days_1to4[(G_features_postings_days_1to4['Source_User'] == 233191)]

In [None]:
print(postings_tfidf_and_graph_features_only_positive.shape)
# A bare expression in the middle of a cell is not rendered, so the preview is displayed explicitly.
display(postings_tfidf_and_graph_features_only_positive.head())
# `null_counts` was deprecated in pandas 1.2 and removed in pandas 2.0; `show_counts` is its replacement.
postings_tfidf_and_graph_features_only_positive.info(verbose=True, show_counts=True)

In [None]:
# Preview the original posting rows selected by the mask (the mask shares its index with df_postings).
df_postings.loc[mask_posting_created_at_days_1to4].head()

In [None]:
# Filter tokenized_lemmatized_texts to all postings created on days 1-5 (with or without Target_User).
# A dedicated mask name is used: the mask previously reused the name
# `mask_posting_created_at_days_1to4` and thereby silently replaced the days 1-4 mask with a days 1-5 mask.
mask_posting_created_at_days_1to5 = get_mask_created_at(postings_user_commented_relation_df['PostingCreatedAtDay'],
                                                        day_start=1, day_end=5)
tokenized_lemmatized_texts_at_days_1_to_5 = list(compress(tokenized_lemmatized_texts, list(mask_posting_created_at_days_1to5)))

assert len(tokenized_lemmatized_texts_at_days_1_to_5) == mask_posting_created_at_days_1to5.sum()

In [None]:
# TODO: train/test split!
# Fit only on the training data, never on the test data.
# NOTE: this cell currently fits on ALL texts (tokenized_lemmatized_texts).
max_tf_idf_features = 300

# The texts are already lists of lemmas: pass them through unchanged, without lower-casing.
vectorizer = TfidfVectorizer(
    max_features=max_tf_idf_features,
    tokenizer=lambda x: x,
    lowercase=False,
    ngram_range=(1, 1),
).fit(tokenized_lemmatized_texts)
#print(list(vectorizer.get_feature_names_out()))

# text_vectorized = vectorizer.transform(tokenized_lemmatized_texts)
#print(text_vectorized.toarray().shape)
#display(text_vectorized.toarray()[0])

In [None]:
# Just a check to see which tf-idf features were extracted: one feature name per line.
# The encoding is set explicitly because the tokens are German text (umlauts etc.);
# the platform default encoding is not guaranteed to be able to represent them.
with open('output/feature_names.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % item for item in vectorizer.get_feature_names_out())

In [None]:
# Make sure to copy the pickle file from our shared Google Drive folder.
# NOTE: pickle.load can execute arbitrary code while loading — only load files from a trusted source.
with open('output/G_features_postings_days_1-1.pkl', 'rb') as f:
    G_features_postings_days_1 = pickle.load(f)
# Dataframe of graph features for Source_User and Target_User
G_features_postings_days_1.head()

In [None]:
# Spot check: graph features of the pair Source_User 671476 -> Target_User 233191 on day 1.
G_features_postings_days_1[(G_features_postings_days_1['Source_User'] == 671476) & ((G_features_postings_days_1['Target_User'] == 233191))]

In [None]:
# TODO