# Social Network Analysis - Project sna_topic_11

In [None]:
import numpy as np
np.random.seed(42)
import random
random.seed(42)

import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Reset every pandas option whose name matches the regex "^display" back to its
# default, so the display settings applied afterwards start from a clean state.
pd.reset_option("^display")

In [None]:
# Notebook-wide pandas display configuration.
# Floats: width 20, thousands separator, four decimal places.
pd.set_option('display.float_format', '{:20,.4f}'.format)
# Never truncate cell contents (posting texts are long). This option was
# previously set twice with the same value; one call is enough.
pd.set_option('display.max_colwidth', None)
# Show up to 100 rows / 100 columns before pandas abbreviates the output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
# Allow wide frames to be rendered without wrapping.
pd.set_option('display.width', 2000)

## The Data

The data set is provided by Der Standard, one of the top Austrian newspapers.
In the online edition of Der Standard, readers can post comments below articles and up- or down-vote comments.

1. There are two files containing all **postings** to articles in May 2019 (due to the large amount of data, the postings were split into two files). Each file additionally contains metadata about the postings and articles, and some details about the users who composed the postings.

2. There are two files containing all **votes** for the postings in point 1 (due to the large amount of data, the votes were split into two files). Each file also indicates whether a vote was negative or positive and contains some details about the user who cast the vote.

3. There is one file containing **following and ignoring relationships** among all the users who posted (see point 1) or voted (see point 2) on articles published in May 2019. A following relationship (i.e., the user with the `ID_CommunityIdentity` given in column 1 follows the user with the `ID_CommunityIdentityConnectedTo` given in column 2) is indicated by a “1” in the column `ID_CommunityConnectionType`, an ignoring relationship by a “2” in that column (i.e., the user with the `ID_CommunityIdentity` given in column 1 ignores the user with the `ID_CommunityIdentityConnectedTo` given in column 2).

There are different entities in the data set: 
* **Users** - identified by *ID_CommunityIdentity* (or *UserCommunityName*)
* **Postings** - identified by *ID_Posting*
* **Articles** - identified by *ID_Article*

In [None]:
# Locations of the input CSV files (relative to the notebook's working directory).
# Postings and votes are each split into two half-month files.
file_postings_1, file_postings_2 = (
    "data/Postings_01052019_15052019.csv",
    "data/Postings_16052019_31052019.csv",
)
file_votes_1, file_votes_2 = (
    "data/Votes_01052019_15052019.csv",
    "data/Votes_16052019_31052019.csv",
)
file_following_ignoring = "data/Following_Ignoring_Relationships_01052019_31052019.csv"

# Directory prefix for generated output.
output_dir = "output/"

In [None]:
# Load the first postings file (semicolon-separated), then preview two rows
# and print the column/dtype summary.
df_postings_1 = pd.read_csv(file_postings_1, delimiter=";")
display(df_postings_1.head(n=2))
df_postings_1.info()

In [None]:
# Load the second postings file (semicolon-separated), then preview two rows
# and print the column/dtype summary.
df_postings_2 = pd.read_csv(file_postings_2, delimiter=";")
display(df_postings_2.head(n=2))
df_postings_2.info()

In [None]:
# Load the first votes file (semicolon-separated), then preview two rows
# and print the column/dtype summary.
df_votes_1 = pd.read_csv(file_votes_1, delimiter=";")
display(df_votes_1.head(n=2))
df_votes_1.info()

In [None]:
# Load the second votes file (semicolon-separated), then preview two rows
# and print the column/dtype summary.
df_votes_2 = pd.read_csv(file_votes_2, delimiter=";")
display(df_votes_2.head(n=2))
df_votes_2.info()

In [None]:
# Load the following/ignoring relationship file (semicolon-separated), then
# preview two rows and print the column/dtype summary.
df_following_ignoring = pd.read_csv(file_following_ignoring, delimiter=";")
display(df_following_ignoring.head(n=2))
df_following_ignoring.info()

# Relations

## Relation 1: User_A commented/posted to post of User_B

In [None]:
# Stack both posting files into one frame; ignore_index gives a fresh 0..n-1 index.
df_postings = pd.concat([df_postings_1, df_postings_2], ignore_index=True)

# Parse the timestamp columns. The unit must be spelled out: pandas >= 2.0
# rejects the unit-less 'datetime64' dtype in astype() with a TypeError,
# while 'datetime64[ns]' works on both older and newer pandas versions.
datetime_columns = ['PostingCreatedAt', 'ArticlePublishingDate', 'UserCreatedAt']
df_postings[datetime_columns] = df_postings[datetime_columns].astype('datetime64[ns]')

df_postings.info()
df_postings.head(2)

In [None]:
# Subset the dataset because it is too large: keep only postings created on 2019-05-01.
# dt.normalize() truncates each timestamp to midnight, so the comparison matches
# the whole day without building an intermediate column of Python date objects.
# .copy() turns the filtered result into an independent frame; without it, the
# later assignment of new columns to df_postings triggers SettingWithCopyWarning.
df_postings = df_postings[df_postings['PostingCreatedAt'].dt.normalize() == '2019-05-01'].copy()
df_postings.shape

## Play with some NLP extractions of text and one hot encoding

In [None]:
# IPython shell escape: download the spaCy pipeline "de_core_news_sm",
# which is loaded further down via spacy.load("de_core_news_sm").
!python -m spacy download de_core_news_sm

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical article columns
# (ArticleChannel and ArticleRessortName).
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# at transform time instead of raising.
one_hot_encoder_article_channel = OneHotEncoder(handle_unknown='ignore')
one_hot_encoder_article_channel.fit(df_postings[['ArticleChannel', 'ArticleRessortName']])

# Categories learned per input column.
print(one_hot_encoder_article_channel.categories_)

# Sparse indicator matrix: one row per posting, one column per learned category.
transformed_channel_resort = one_hot_encoder_article_channel.transform(
    df_postings[['ArticleChannel', 'ArticleRessortName']]
)
print(transformed_channel_resort.shape)
display(transformed_channel_resort.toarray())

In [None]:
# TODO decide on a vectorizer to use:
# vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000, lowercase=False)

import spacy
# Load the spaCy pipeline once at cell level; tokenize_lemmatize() below reads
# this global `nlp` object on every call.
nlp = spacy.load("de_core_news_sm")

def tokenize_lemmatize(doc):
    """
    Convert a text into a list of lemmas using the global spaCy pipeline `nlp`.

    Tokens flagged as stop words, punctuation or whitespace are skipped; every
    remaining token contributes its `lemma_` (base form), in document order.
    """
    lemmas = []
    for token in nlp(doc):
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemmas.append(token.lemma_)
    return lemmas

def join_text_parts(ph, pc, at):
    """
    Join the three text parts with single spaces, skipping any part that is
    falsy (e.g. an empty string or None).
    """
    non_empty_parts = (part for part in (ph, pc, at) if part)
    return ' '.join(non_empty_parts)

# Source columns that are merged into one text per posting.
text_columns = ['PostingHeadline', 'PostingComment', 'ArticleTitle']
#df_postings['CombinedArticlePostingText'] = [join_text_parts(ph,pc,at) for (ph,pc,at) in 
#                                             df_postings[['PostingHeadline', 'PostingComment', 'ArticleTitle']].fillna('').to_numpy()]

# Vectorized concatenation: missing values become empty strings, parts are
# separated by a single space (empty parts therefore leave extra spaces).
df_postings['CombinedArticlePostingText'] = (
    df_postings['PostingHeadline'].fillna('')
    + ' ' + df_postings['PostingComment'].fillna('')
    + ' ' + df_postings['ArticleTitle'].fillna('')
)

display(df_postings[['CombinedArticlePostingText'] + text_columns].head(4))

# TODO which vectorizer to use?
#vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=3000, 
                            #lowercase=True, tokenizer=tokenize_lemmatize).fit(df_postings['CombinedArticlePostingText'])
# TF-IDF over unigrams, limited to 3000 features, tokenized by the
# spaCy-based tokenize_lemmatize(). token_pattern=None states explicitly that
# the default regex tokenizer is not used; otherwise scikit-learn warns that
# 'token_pattern' is ignored because a custom tokenizer is supplied.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    max_features=3000,
    lowercase=True,
    tokenizer=tokenize_lemmatize,
    token_pattern=None,
).fit(df_postings['CombinedArticlePostingText'])

print(vectorizer.get_feature_names_out())

res = vectorizer.transform(df_postings['CombinedArticlePostingText'])
print(res.shape)

# TODO train test split!
display(res.toarray())
