### 3.1 

In [22]:
import pandas as pd
import networkx as nx
import random

In [6]:
df1 = pd.read_csv('twitter-small.tsv', sep='\t', header=None,names=['timestamp', 'user', 'content'])
df2 = pd.read_csv('twitter-larger.tsv', sep='\t', header=None,names=['timestamp', 'user', 'content'])
print(df1)

                 timestamp             user  \
0      2009-06-11 17:03:30  hatespeechradio   
1      2009-06-11 17:13:11  champagnemanoir   
2      2009-06-11 17:14:01     emperoranton   
3      2009-06-11 17:15:20   seattlelawgirl   
4      2009-06-11 17:23:56        fridayluv   
...                    ...              ...   
39935  2009-06-30 13:35:58      looney_mesh   
39936  2009-06-30 13:37:13        kasyyoung   
39937  2009-06-30 13:49:31   kimberanna_com   
39938  2009-06-30 13:49:49   kimberanna_com   
39939  2009-06-30 13:54:27    reneebarronmn   

                                                 content  
0      #followfriday (because I like to be a day earl...  
1      RT @Ruth_Z don't tweet blindly... check out ho...  
2      RT @Ohdoctah @micah @sarahrobinson Twitter Fol...  
3      BTW, it's not #FollowFriday but @go2girlevents...  
4      On Fridays don&#8217;t &#8216;pollute your str...  
...                                                  ...  
39935  Vote @Studmisil

In [11]:
def makeList(df, G, name):
    for index, row in df.iterrows():
        user = row['user']
        content = row['content']

        words = content.split()
        mentioned_users = [word[1:] for word in words if word.startswith('@')]
        
        for mentioned_user in mentioned_users:
            if G.has_edge(user, mentioned_user):
                #add weight when user has mentioned before
                G[user][mentioned_user]['weight'] += 1
            else:
                #else, create edge
                G.add_edge(user, mentioned_user, weight=1)

    edge_list = nx.to_pandas_edgelist(G)
    edge_list.to_csv(f'{name}.csv', index=False)
    return G

In [13]:
G1 = nx.DiGraph()
G2 = nx.DiGraph()

G1 = makeList(df1, G1, 'small')
G2 = makeList(df2, G2, 'larger')

### 3.2

In [14]:
def showNodesAndEdges(G, name):
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    
    print(f'{name} nodes:', num_nodes)
    print(f'{name} edges:', num_edges)
    
showNodesAndEdges(G1, 'G1')
showNodesAndEdges(G2, 'G2')

G1 nodes: 102999
G1 edges: 157025
G2 nodes: 620352
G2 edges: 1360894


In [16]:
def showWeakAndStrong(G, name):
    strong_components = [len(c) for c in nx.strongly_connected_components(G)]
    num_strong_components = len(strong_components)
    largest_strong_component = max(strong_components) if strong_components else 0

    weak_components = [len(c) for c in nx.weakly_connected_components(G)]
    num_weak_components = len(weak_components)
    largest_weak_component = max(weak_components) if weak_components else 0

    print(f'{name} strongly connected components:', num_strong_components)
    print(f'{name} size of largest strongly connected component:', largest_strong_component)
    print(f'{name} weakly connected components:', num_weak_components)
    print(f'{name} size of largest weakly connected component:', largest_weak_component)

showWeakAndStrong(G1, 'G1')
showWeakAndStrong(G2, 'G2')

G1 strongly connected components: 101612
G1 size of largest strongly connected component: 1110
G1 weakly connected components: 4419
G1 size of largest weakly connected component: 85860
G2 strongly connected components: 598527
G2 size of largest strongly connected component: 20068
G2 weakly connected components: 17865
G2 size of largest weakly connected component: 564050


In [17]:
def showDensity(G, name):
    density = nx.density(G)
    print(f'{name} density:', density)

showDensity(G1, 'G1')
showDensity(G2, 'G2')

G1 density: 1.4801543373296188e-05
G2 density: 3.536296131767815e-06


In [20]:
#average clustering coefficient
def showACC(G, name):
    ACC = nx.average_clustering(G)
    print(f'{name} average clustering coefficient:', ACC)

showACC(G1, 'G1')
showACC(G2, 'G2')

G1 average clustering coefficient: 0.019285324303747912
G2 average clustering coefficient: 0.030628227982516916


In [24]:
def showAverageDistance(G, name, sample_size=100):
    G_undirected = G.to_undirected()
    largest_component = max(nx.connected_components(G_undirected), key=len)
    giant_component = G_undirected.subgraph(largest_component)
    nodes = list(giant_component.nodes)
    sampled_nodes = random.sample(nodes, min(sample_size, len(nodes)))
    
    total_distance = 0
    num_pairs = 0

    for i in range(len(sampled_nodes)):
        for j in range(i + 1, len(sampled_nodes)):
            try:
                distance = nx.shortest_path_length(giant_component, source=sampled_nodes[i], target=sampled_nodes[j])
                total_distance += distance
                num_pairs += 1
            except nx.NetworkXNoPath:
                continue  

    average_distance = total_distance / num_pairs if num_pairs > 0 else float('inf')
    print(f'{name} average distance:', average_distance)

showAverageDistance(G1, 'G1', 100)
showAverageDistance(G2, 'G2', 100)

UnboundLocalError: local variable 'total_distance' referenced before assignment