<h1> Analysis 1 (Co-occurrence networks) </h1>


Takes the Time corpus data and tf-idf scores, excludes unwanted words, filters nodes and edges, and creates a Gephi file of the co-occurrence network.

In [12]:
#imports

#functions
import math
import os
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd



# Seed words about time, grouped by perceived tempo:
#   ntp_words - first seed list (general time vocabulary)
#   ftp_words - second seed list (fast / quick vocabulary)
#   stp_words - third seed list (slow / long vocabulary)
# Duplicate entries are kept as they are; they do not affect membership tests.
ntp_words = [
    'time', 'period', 'periods', 'duration', 'clock', 'temporal',
    'spacetime', 'timespan', 'timespans', 'timeline', 'timelines',
    'elapse', 'elapsed', 'length', 'timewise', 'velocity', 'pace',
    'rate', 'tempo', 'pass', 'passing', 'passed',
]
ftp_words = [
    'quick', 'quicker', 'quickly', 'quickest',
    'fast', 'faster', 'fastest', 'fastened',
    'rapid', 'rapidly',
    'short', 'shorter', 'shortly', 'shortest',
    'speedy', 'speedy', 'speeded', 'speedier',
    'hurry', 'hurried',
    'swift', 'swifter', 'swiftly',
    'haste', 'hasty', 'brisk', 'turbo',
    'accelerate', 'acceleration', 'accelerated', 'accelerating',
]
stp_words = [
    'slow', 'slower', 'slowly', 'slows', 'slowed', 'slowest', 'slowing',
    'slowdown',
    'long', 'looong', 'longer', 'longer', 'longest',
    'steady',
    'deceleration', 'decelerate', 'decelerating', 'decelerated',
    'dilatory', 'dilation', 'infinity', 'eternity', 'lengthy',
    'prolonged', 'protracted', 'extended', 'unending', 'endless',
]

# All seed words in one alphabetically sorted list.
time_words = sorted([*ntp_words, *ftp_words, *stp_words])


<h4> Function: Aggregate for class </h4>

Remember, Time corpus is in this format: 

|substance| classes | seed   | source    | target   | weight | colourbias | 
|----------------| ------------------------- | ------ | --------- | -------- | ------ | ---------- |
|LSD| Serotonergic psychedelics | slowly | periphery | abstract | 1      | -0.1       |


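This function removes all rows that do not belong to the selected class (or substance). It then aggregates the weight and colourbias of duplicate word pairs (e.g. if the pair with source node 'slowly' and target node 'periphery' occurs twice, the two rows are merged into one row with their weights and colourbiases summed).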



In [95]:
def aggregate_for_class(df2, class_): #"class_" can refer both class or substance with many reports
    """Select the rows of one class/substance and merge duplicate word pairs.

    Every input row counts as one co-occurrence (weight 1). Rows sharing the
    same (source, target) pair are collapsed into a single row whose
    ``weight`` and ``colourbias`` are the sums over the duplicates.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Time corpus with at least the columns ``substance``, ``classes``,
        ``source``, ``target`` and ``colourbias``. The frame is NOT modified,
        so the function can be called repeatedly on the same corpus.
    class_ : str
        ``"all"``, one of the drug classes, or one of the substances listed
        below. Substance names are compared lower-cased against the
        ``substance`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target``, ``weight``, ``colourbias`` (preceded
        by ``classes`` when ``class_ == "all"``, because pairs are then
        aggregated per class), sorted by ``weight`` descending with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If ``class_`` is none of the known selections.
    """
    drug_classes = ["Serotonergic psychedelics", "Dissociative psychedelics", "Entactogens", "Deliriants", "Depressant / sedatives", "Stimulants", "Antidepressants / antipsychotics"]
    substances = ["LSD", "Psilocybin mushrooms", "DMT", "MDMA", "Cannabis spp.", "Salvia divinorum"]

    if class_ == "all":
        # keep every row; word pairs are aggregated separately for each class
        selected = df2
        group_keys = ["classes", "source", "target"]
    elif class_ in drug_classes:
        # remove non class rows
        selected = df2[df2['classes'] == class_]
        group_keys = ["source", "target"]
    elif class_ in substances:
        # remove non substance rows
        selected = df2[df2['substance'] == class_.lower()]
        group_keys = ["source", "target"]
    else:
        raise ValueError(
            f"Unknown class_ {class_!r}; expected 'all' or one of {drug_classes + substances}"
        )

    # Each row is a single co-occurrence. assign() returns a new frame, so the
    # caller's corpus is left untouched (the previous in-place insert made a
    # second call on the same frame fail with "already exists").
    if "weight" not in selected.columns:
        selected = selected.assign(weight=1)

    aggregated = (
        selected
        .groupby(group_keys, as_index=False)[["weight", "colourbias"]]
        .agg({'weight': 'sum', 'colourbias': 'sum'})
    )

    #formatting
    return (
        aggregated
        .sort_values(by=["weight"], ascending=False)
        .reset_index(drop=True)
    )

<h4> Function: Exclude words </h4>

Exclude undesired words. In my case, self-loops, the seed time words, and frequent non-seed time words about the 'time of the day' (e.g. 'minutes', 'hours') are removed.

In [96]:
def exclude_words(df2):
    """Drop self-loops and every edge that touches an excluded time word.

    An edge is removed when its source equals its target, or when either
    endpoint is one of the seed ``time_words`` (module-level list) or one of
    the frequent non-seed time words listed below. The surviving rows are
    returned with their original index labels.
    """

    # self-loops: source and target are the same word
    is_real_edge = df2['source'] != df2['target']
    df2 = df2[is_real_edge]

    # non-seed words about the time of the day, removed together with the seed time words
    # - FOR NEXT RUN COULD INCLUDE 'frequent_non_seed_time_words' AGAIN
    frequent_non_seed_time_words = [
        "second", "seconds", "minute", "minutes", "hour", "hours",
        "day", "days", "week", "weeks", "weekend", "weekends",
        "month", "months", "year", "years",
        "times", "spend", "spent", "spending", "timestamp", "timestamps",
    ]
    remove_list = frequent_non_seed_time_words + time_words

    # an edge goes as soon as one of its two endpoints is on the list
    source_excluded = df2['source'].isin(remove_list)
    target_excluded = df2['target'].isin(remove_list)
    df2 = df2[~(source_excluded | target_excluded)]

    #Not implemeneted. Could be used to colour nodes on network with 1-5 score of concreteness vs abstraction. Dataset for this available at http://crr.ugent.be/archives/1330
    """
    #concretenesss
    #get concreteness score as dictionary from Concreteness_ratings_Brysbaert_et_al_BRM
    #exclude words not in Concreteness_ratings corpus
    df_temp = pd.read_excel("Concreteness_ratings_Brysbaert_et_al_BRM.xlsx")
    concreteness_dict = pd.Series(df_temp["Conc.M"].values,index=df_temp.Word).to_dict()
    df2 = df2[df2["source"].isin(list(concreteness_dict)) & df2["target"].isin(list(concreteness_dict))] 
    """

    return df2

<h4> Function: Filter by tf-idf </h4>

Only keep rows where both the source and the target node are in the top tf-idf list. How many words the top tf-idf list contains should depend on which class you are creating the graph for. The parameter 'gamma' sets this number (larger dataset -> larger gamma).

In [97]:
def filter_tfidf(tfidf_df, class_, df2, gamma):  
    """Keep only edges whose two words are both among the top-``gamma`` tf-idf words.

    Parameters
    ----------
    tfidf_df : pandas.DataFrame
        Table with a ``word`` column and one score column per class/substance.
        It is NOT modified (previously it was re-sorted and re-indexed in
        place as a side effect).
    class_ : str
        Name of the score column to rank by. The six substance names are
        lower-cased first to match their column names.
    df2 : pandas.DataFrame
        Edge list with ``source``, ``target`` and ``weight`` columns.
    gamma : int
        Number of highest-scoring words to keep.

    Returns
    -------
    pandas.DataFrame
        The surviving edges, sorted by ``weight`` descending with a fresh
        0..n-1 index.
    """

    #lowercase align
    if class_ in ["LSD", "Psilocybin mushrooms", "DMT", "MDMA", "Cannabis spp.", "Salvia divinorum"]:
        class_ = class_.lower()

    #rank words from highest to lowest tfidf score for the class (on a sorted copy)
    ranked_words = tfidf_df.sort_values(by=class_, ascending=False)["word"]

    #select top gamma tfidf words
    top_tfidf = ranked_words.to_list()[:gamma]

    # '&' means both nodes in row have to be in top_tfidf
    both_in_top = df2['source'].isin(top_tfidf) & df2['target'].isin(top_tfidf)

    #formatting - chained (non-inplace) calls avoid SettingWithCopyWarning on the filtered slice
    return (
        df2[both_in_top]
        .sort_values(by=["weight"], ascending=False)
        .reset_index(drop=True)
    )

<h4> Function: Filter by relative weight size </h4>

Filter all edges that do "not carry a disproportionate fraction of a node's strength". 

$$p_{ij} = (1 - \frac{w_{ij}}{s_{i}})^{k_{i} - 1}$$

$w_{ij}$ is the weight of an edge. $k_{i}$ and $s_{i}$ are the degree and strength of node $i$. The strength is a weighted version of the degree: the sum of the weights of all edges to/from that node. $p_{ij}$ is calculated for both endpoints of an edge; if it is at or above a set threshold $\delta$ for either endpoint, the edge is excluded.

Steps: 
- Create temporary graph with network
- Calculate  $p_{ij}$
- Exclude all rows where $p_{ij} \geq \delta$ for the source or the target node

In [16]:
def filterby_rw(df2, delta):
    """Keep only edges that carry a disproportionate share of both endpoints' strength.

    For every row the null-model probability
    ``p = (1 - weight / strength) ** (degree - 1)`` is computed once for the
    source node and once for the target node. The row is kept only when both
    probabilities are strictly below ``delta``.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Edge list with ``source``, ``target`` and ``weight`` columns. Any
        index is accepted and the frame is NOT modified (previously helper
        columns were added to it and rows were addressed by position, which
        was only correct for a 0..n-1 index).
    delta : float
        Threshold for the null-model probability.

    Returns
    -------
    pandas.DataFrame
        Surviving rows with the same columns as the input, sorted by
        ``weight`` descending with a fresh 0..n-1 index.
    """

    #create temporary graph G1 (NetworkX) to calculate strength and degree for each node
    # NOTE(review): the graph is undirected, so if both 'A-B' and 'B-A' rows are
    # present only the last row's weight enters the strength - confirm that the
    # input is meant to hold one row per word pair.
    G1 = nx.from_pandas_edgelist(df2, edge_attr="weight")
    strength_dict = dict(G1.degree(weight="weight"))
    degree_dict = dict(G1.degree())

    def null_model_prob(nodes):
        # vectorised p_ij for one endpoint column; aligned with df2 by index label
        strength = nodes.map(strength_dict)
        degree = nodes.map(degree_dict)
        return (1 - df2["weight"] / strength) ** (degree - 1)

    #delta threshold for null model -> filter all edges where the prob of either node for null model is not below delta
    significant = (null_model_prob(df2["source"]) < delta) & (null_model_prob(df2["target"]) < delta)

    #formating
    return (
        df2[significant]
        .sort_values(by=["weight"], ascending=False)
        .reset_index(drop=True)
    )

<h4> Function: Create GEXF file for Gephi </h4>

Gephi is a software application in which you can plot the co-occurrence network. 

For nodes and edges, create a 'normal' and a 'relative' colourbias dictionary. For the latter, include a 'ceiling'. The motivation for creating a relative colourbias (divided by weight) with a ceiling is explained in this [YouTube video](https://youtu.be/U1zzyvW_WjM?t=146).

Prepare
- Create folders for each class.

In [17]:
def _symmetric_ceiling(values):
    """Return the magnitude at which ``values`` are clipped so both colours share one scale.

    When the values span both signs, the smaller of the two extremes (largest
    positive value vs. largest negative magnitude) is the ceiling. When all
    values lie on one side of zero there is no opposite colour to balance
    against, so the largest magnitude is returned and nothing gets clipped.
    An empty input gives 0.0.
    """
    if len(values) == 0:
        return 0.0
    most_positive = max(values.max(), 0)
    most_negative = max(-values.min(), 0)
    if most_positive > 0 and most_negative > 0:
        return min(most_positive, most_negative)
    return max(most_positive, most_negative)


def get_GEFX_with_attributes(df2, class_, gamma, delta):
    """Build the co-occurrence graph with colourbias attributes and write it as a GEXF file.

    Steps
    -----
    1. 'A-B' and 'B-A' rows are merged into one undirected edge: weights and
       colourbiases are summed, and the result holds one row per direction
       (both rows carry the same totals).
    2. Edge attribute ``relative colourbias with ceiling`` = total colourbias
       / total weight of the merged edge, clipped to +/- the edge ceiling.
    3. Node attribute ``absolute colour_bias`` = sum of the colourbias of all
       edges of the node. Node attribute ``relative colour_bias with ceiling``
       = that sum divided by the node's total edge weight, clipped to +/- the
       node ceiling.
    4. The graph is written to
       ``Gephi/<class>/<class>_gamma=<gamma>_delta=<delta>-RANDOM.gexf``; the
       folder is created when it does not exist yet.

    Differences from the previous implementation (bug fixes):
    - the relative edge bias is computed after merging both directions
      (before, the two per-direction ratios were added up);
    - a node's weight is counted once (before, it was counted once per
      direction, which halved every relative node bias);
    - the ceiling no longer collapses or sign-flips values when all biases
      have the same sign;
    - an empty edge list produces an empty graph instead of raising;
    - the input frame is not modified.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Edge list with ``source``, ``target``, ``weight`` and ``colourbias``.
    class_ : str
        Class or substance name; used for the output folder and file name.
    gamma, delta
        Only used in the output file name.

    Returns
    -------
    tuple
        ``(edges, graph)``: the symmetric edge table that was turned into the
        graph, and the NetworkX graph that was written to disk.
    """
    rel_col = 'relative colourbias with ceiling'

    #nx_from_pandas_edgelist only works for 'A-B', or 'B-A' edges but does not sum them as one edge.
    #Stacking the edge list on top of its reversed copy and grouping treats 'A-B' and 'B-A' as the same edge.
    directed = df2[['source', 'target', 'weight', 'colourbias']]
    mirrored = directed.rename(columns={'source': 'target', 'target': 'source'})
    df2 = (
        pd.concat([directed, mirrored])
        .groupby(['source', 'target'], as_index=False)[['weight', 'colourbias']]
        .sum()
    )

    #create relative colourbias by dividing through weight (after merging, so it is total bias / total weight)
    df2[rel_col] = df2['colourbias'] / df2['weight']

    #colour with smaller absolute max value, sets ceiling for other colour - important for visualisation 
    ceiling_value_edges = _symmetric_ceiling(df2[rel_col])
    print(ceiling_value_edges)

    #make sure that the most positive and most negative edge colourbias are the same value (at the ceiling)
    df2[rel_col] = df2[rel_col].clip(lower=-ceiling_value_edges, upper=ceiling_value_edges)

    # Create a co-occurrence network from the DataFrame
    G2 = nx.from_pandas_edgelist(df2, 'source', 'target', edge_attr=['weight', 'colourbias', rel_col])

    # Per-node totals. The edge table is symmetric, so every node shows up in
    # 'source' exactly once per neighbour and grouping on 'source' covers each
    # of its edges once.
    node_totals = df2.groupby('source')[['weight', 'colourbias']].sum()
    absolute_bias = node_totals['colourbias']

    #create relative colourbias by dividing through word frequency (total edge weight of the node)
    relative_bias = absolute_bias / node_totals['weight']

    #make sure that the most positive and most negative node colourbias are the same value (at the ceiling)
    ceiling_value_nodes = _symmetric_ceiling(relative_bias)
    relative_bias = relative_bias.clip(lower=-ceiling_value_nodes, upper=ceiling_value_nodes)

    #set node colour attributes
    nx.set_node_attributes(G2, absolute_bias.to_dict(), name="absolute colour_bias")
    nx.set_node_attributes(G2, relative_bias.to_dict(), name="relative colour_bias with ceiling")

    #Not implemeneted. Could be used to colour nodes on network with 1-5 score of concreteness vs abstraction.
    #    concreteness_attr_dict = {item: concreteness_dict[item] for item in np.unique(df2[['source', 'target']].values)}
    #    nx.set_node_attributes(G2, concreteness_attr_dict, name="concreteness 1-5")

    # "/" is not possible in folder or file names, and a trailing "." is dropped
    folder_names = {
        "Antidepressants / antipsychotics": "Antidepressants antipsychotics",
        "Depressant / sedatives": "Depressant sedatives",
        "Cannabis spp.": "Cannabis spp",
    }
    class_ = folder_names.get(class_, class_)

    # make sure the per-class output folder exists before writing
    os.makedirs(os.path.join("Gephi", class_), exist_ok=True)
    nx.write_gexf(G2, f"Gephi/{class_}/{class_}_gamma={gamma}_delta={delta}-RANDOM.gexf")

    return df2, G2

<h4> Run functions </h4>

Using the Time corpus (df2) and the tf-idf scores, run all the functions above. 

Here you can select: 
- class_
- gamma (Threshold for tf-idf filtering)
- delta (Significance threshold for relative weight filtering)


Available classes:
- 'all', 'Serotonergic psychedelics', 'Dissociative psychedelics', 'Entactogens', 'Deliriants', 'Depressant / sedatives', 'Stimulants', 'Antidepressants / antipsychotics', 'LSD', 'Psilocybin mushrooms', 'DMT', 'MDMA', 'Cannabis spp.', 'Salvia divinorum'


In [None]:
# Load the Time corpus and the two tf-idf tables, then join the tables on 'word'
# so one frame holds every score column.
# NOTE: read_pickle can execute arbitrary code - only load pickle files you created yourself.
df2 = pd.read_pickle("timecorpus_C=4.pkl")
tfidf_df, tfidf_df_SUB = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("tfidf_df_C=4.pkl", "tfidf_df_C=4_SUBSTANCES.pkl")
)
tfidf_df_merged = tfidf_df.merge(tfidf_df_SUB, on='word')



def run_class(df2, class_, tfidf_df, gamma, delta):
    """Run the full pipeline for one class/substance and write its Gephi file.

    The corpus is passed through ``aggregate_for_class``, ``exclude_words``,
    ``filter_tfidf`` and ``filterby_rw``, and the remaining edges are handed
    to ``get_GEFX_with_attributes``.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Time corpus.
    class_ : str
        Class or substance to build the network for.
    tfidf_df : pandas.DataFrame
        tf-idf table containing a score column for ``class_``.
    gamma : int
        Number of top tf-idf words kept by ``filter_tfidf``.
    delta : float
        Threshold used by ``filterby_rw``.

    Returns
    -------
    tuple
        The two items returned by ``get_GEFX_with_attributes``: the final
        edge table first, then that function's second return value.
    """
    df2 = aggregate_for_class(df2, class_)
    df2 = exclude_words(df2)
    df2 = filter_tfidf(tfidf_df, class_, df2, gamma)
    df2 = filterby_rw(df2, delta)
    # get_GEFX_with_attributes returns a 2-tuple; unpack it here so G2 is
    # actually defined (it used to be an undefined name -> NameError)
    df2, G2 = get_GEFX_with_attributes(df2, class_, gamma, delta)
    return df2, G2




# Build the network for the selected class; change class_, gamma and delta here.
df2, G2 = run_class(
    df2=df2,
    class_="Serotonergic psychedelics",
    tfidf_df=tfidf_df_merged,
    gamma=5000,
    delta=0.03,
)


<h4> Next steps </h4>

Install [Gephi](https://gephi.org/) and replicate my workflow from this [YouTube video](https://youtu.be/U1zzyvW_WjM?t=146).