# Exploring StackOverflow!

In [12]:
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime

## Populating the Graphs

Each graph is build independantly from the provided `.txt` files of temporal network of interactions. \
Users are represented as nodes and answers\comments as edges.

The design choices of the following method are:
- Using undirected graphs for robustness and efficacy of the model. 
- *Simple* graphs, there are no loops in the graphs. Users who answer to themselves are discarded cases.
- Only one attributes is assigned to the edges: weight. The weights are:
    - `1` for answers to questions
    - `2/3` for comments on questions
    - `1/2` for comments to answers
- Time resolution is one day.
- The graphs are build given a time input to avoid such attribute for the sake of simplicity and robustess.
---

In [13]:
def from_time_int_to_dates(time_interval):
    # Convert the time interval in start and end dates 
    time_interval = tuple(map(datetime.fromisoformat, time_interval)) # converting time interval into datetime format
    time_interval = tuple(map(datetime.timestamp, time_interval)) # converting time interval into POSIX timestamp 
    start_d = int(time_interval[0]) #converting to string to compare with the txt
    end_d = int(time_interval[1])
    return start_d, end_d

In [14]:
def get_graph(time_interval, file = 3):
    
    # Initialize the graph
    G = nx.Graph()
    
    # Create mapping of files and mapping of weights
    map_files = {1: "sx-stackoverflow-a2q.txt", 2:"sx-stackoverflow-c2q.txt", 3:"sx-stackoverflow-c2a.txt"}
    map_weights = {1: 1.0, 2: 2/3, 3: 1/2}
    
    # Get the start and end dates 
    start, end = from_time_int_to_dates(time_interval)
    
    # Select the file chosen, open it and read the lines
    with open(map_files[file], "r", encoding="UTF-8") as f:
        for line in tqdm(f.readlines()):
            
            # Parse the line
            elems = line.split(' ')
            
            # Add to the graph if it is in the time interval
            if start <= int(elems[2]) <= end:
                # If the edge already exists --> increment the weight, else simply add the new edge
                if (elems[0], elems[1]) in G.edges():
                    G[elems[0]][elems[1]]['weight'] += float(map_weights[file])
                else:
                    G.add_edge(elems[0], elems[1], weight = float(map_weights[file]))
                    
    return G

In [23]:
interval = ("2008-11-01","2008-11-02")

In [15]:
graph1 = get_graph(time_interval=interval, file=1)

100%|██████████| 17823525/17823525 [00:11<00:00, 1540013.76it/s]


In [16]:
graph2 = get_graph(time_interval=interval, file=2)

100%|██████████| 20268151/20268151 [00:12<00:00, 1629114.19it/s]


In [17]:
graph3 = get_graph(time_interval=interval, file=3)

100%|██████████| 25405374/25405374 [00:15<00:00, 1635052.26it/s]


In [22]:
print(graph1)
print(graph2)
print(graph3)

Graph with 1044 nodes and 1398 edges
Graph with 135 nodes and 100 edges
Graph with 534 nodes and 557 edges


### Merging the graphs

In [18]:
def merged_graph(graph_1, graph_2):
    
    # Iterate over the edges from the second graph
    for edge_2 in graph_2.edges(data = True):
        # If the edge of graph 2 is also in graph 1, only sum weights
        if (edge_2[0],edge_2[1]) in graph_1.edges():
            graph_1[edge_2[0]][edge_2[1]]['weight'] += float(edge_2[2]['weight'])
        # Else add the edge of graph 2 also in graph 1
        else:
            graph_1.add_edge(edge_2[0], edge_2[1], weight = float(edge_2[2]['weight']))
            
    return graph_1

In [19]:
merged = merged_graph(graph1, graph2)
merged = merged_graph(merged, graph3)

In [20]:
print(merged)

Graph with 1044 nodes and 1398 edges


## Implementation of the backend

### Functionality 1 - Get the overall features of the graph

Graph density/sprcity is computed as defined in *"Introduction to Algorithms" by Cormern, Leiserson, Rivest, and Stein*: \
A graph $G = (E, V)$, with $E$ denoting the edges and $V$ denoting the vertices,  is sparse if $|E| << |V|^2$ and dense if $|E|$ is close to $|V|$. \
And so if $|E|$ differs an order of magnitude from $|V|$ the graph is considered sparse, otherwise it is dense.
The density degree expression is:

\begin{equation}
2\frac{|V|}{|E|(|E| - 1)} \approx 2\frac{|V|}{|E|^2}
\end{equation}

In [7]:
def F1_OverallFeatures(Graph):
    """
    Input: One of the 3 graphs

    Output:
      Whether the graph is directed or not
      Number of users
      Number of answers/comments
      Average number of links per user
      Density degree of the graph
      Whether the graph is sparse or dense
    """

    Directed = Graph.is_directed() # quering if the graph is directed
    if Directed: # using a variable to print the output
        IsDirectedPrint = 'directed'
    else:
        IsDirectedPrint = 'undirected'

    NofUsers = Graph.number_of_nodes() # computing the number of users
    NofInteractions = Graph.number_of_edges() # computing the number of interactions

    DegreeDict = dict(a2qGraph.degree()) # a dictionary with keys -> nodes, values -> the number of edges incident 
                                        # to the node a.k.a degree
    AvgUserLinks = sum(DegreeDict.values()) / len(DegreeDict) # computing the avarege number of links per user

    DensityDegree = 2 * NofInteractions / NofUsers**2 # computing density degree

    if NofUsers / NofInteractions**2 < 10: # evaluating sparsity/density
        SparseDense = 'sparse'
    else:
        SparseDense = 'dense'

    print(f"The input graph is {IsDirectedPrint}\n\
            Number of users: {NofUsers}\n\
            Number of answers/comments: {NofInteractions}\n\
            Average number of links per user: {AvgUserLinks:.2}\n\
            Density degree: {DensityDegree: .2}\n\
            The graph is {SparseDense}")

In [8]:
F1_OverallFeatures(a2qGraph)

The input graph is undirected
Number of users: 714167
Number of answers/comments: 1429455
Average number of links per user: 4.0
Density degree:  5.6e-06
The graph is sparse


### Functionality 2 - Find the best users!

#### Degree centrality
As defined in *Introduction to Graph Concepts for Data Science of Aris Anagnostopoulos*, the normalized degree centrality of node $v$ is:

\begin{equation}
\frac{d_v}{|V|-1} \approx \frac{d_v}{|V|}
\end{equation}

with $d_v$ the degree of the node $v$, i.e. the number of edges incident to the node.

In [32]:
def DegreeCentrality(Graph, Node):
    Output = Graph.degree(Node) / Graph.number_of_edges()
    print(f"{Output:.2}")

In [33]:
DegreeCentrality(a2qGraph, 3841803)

0.00012


#### Closeness centrality
As defined in *Introduction to Graph Concepts for Data Science of Aris Anagnostopoulos*, the normalized closeness centrality of node $v$ is:


\begin{equation}
\frac{|V|-1}{\sum_{u \epsilon V} d(v,u)} \approx \frac{|V|}{\sum_{u \epsilon V} d(v,u)}
\end{equation}

with $d(v,u)$ the distance between nodes $v$ and $u$, that is the length of a shortest path between $v$ and $u$.\
As a design choice $d(v,u)$ is taken as the inverse of the weight of the edge of $(v,u)$, this leads to an inverse relationship between interaction and distances: the more a user posts, the closer he gets to the comunity.  

In [183]:
def ClosenessCentrality(Graph, Node):
   #WIP
   SummedDistances = Graph[Node]
   
   return Graph.number_of_nodes() / SummedDistances

#### Page rank

In [181]:
def PageRank(Graph, Node):
    pass

#### Betweeness

As defined in *Introduction to Graph Concepts for Data Science of Aris Anagnostopoulos*, the Betweeness centrality of node $v$ is:

\begin{equation}
\sum_{u, w \epsilon V \backslash \{v\}} \frac{g_{u w}^v}{g_{uw}} \frac{2}{|V|^2 - 3|V| + 2} 
\end{equation}


with $g_{uw}$ the shortest path that connects nodes $u$ and $w$ and $g_{u w}^v$ the set of those shortest paths between u and w that contain node $v$.

In [195]:
def Betweeness(Graph, Node):
   

In [201]:
def F2_BestUser(Node, TimeInterval, Metric, Graph = MergedGraphs):
    """
    Input:
    A user/node
    A tuple of two dates in ISO format representing an interval of time, e.g. (2015-12-04, 2016-12-04) 
    An integer corresponding to the following metrics: 
      1 -> Betweeness 
      2 -> PageRank
      3 -> ClosenessCentrality 
      4 -> DegreeCentrality
    The graph on which to perform the analysis.

    Output:
    The value of the given metric applied over the complete graph for the given interval of time.
    """

    TimeInterval = tuple(map(dt.fromisoformat, TimeInterval)) # converting time interval into datetime format
    IntToMetric = {1: Betweeness, 2: PageRank, 3: ClosenessCentrality, 4: DegreeCentrality} # dictionary associating the
                                                                                           # input integer to the
                                                                                           # corresponding metric function

    TimeIntervalGraph = Graph.copy() # creating a copy on which to perform the time filtering
    TimeIntervalGraph.remove_edges_from([(n1, n2) for n1, n2, time in TimeIntervalGraph.edges(data="time") 
                                        if (time >= TimeInterval[0] & time <= TimeInterval[1])]) # removing the edges that not belong
                                                                                                 # to the input time interval  

    return IntToMetric[Metric](TimeIntervalGraph, Node)