<h1> Get Time corpus 1 (Co-occurrence networks only) </h1>

In [2]:
#imports
import pandas as pd
import pickle
import numpy as np
from collections import Counter, OrderedDict
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import random

import matplotlib.pyplot as plt
import math

<h4> Data after Pre-Processing 1 and 2 </h4>

Variables: 
- df = Dataframe with information about title, substance, class, url, text of each report
- corpus_list = List of all the report texts concatenated 

In [3]:
# Load the pre-processed reports (output of Pre-Processing 1 and 2).
df = pd.read_pickle("processed_data_3.pkl")

# Tokens that are removed from the corpus.
# NOTE(review): these look like entity/placeholder tags inserted during
# pre-processing — confirm against the pre-processing notebook.
excluded_tokens = {"PERSON", "ORG", "GPE", "LOC", "DATE", "PLACEHOLDER"}

#corpus list: every token of every report, in report order, minus the excluded tokens
corpus_list = [
    token
    for text in df.text
    for token in text
    if token not in excluded_tokens
]

# word -> number of occurrences in corpus_list
counter_ = Counter(corpus_list)
corpus_dict = dict(counter_)

<h4> Seed time perception words </h4>

- These words act as 'seeds' in the Erowid corpus. Words surrounding the seed might describe information about time perception. 

In [4]:
# Seed words that receive colourbias 0 in the co-occurrence network.
ntp_words = [
    'time', 'period', 'periods', 'duration', 'clock', 'temporal',
    'spacetime', 'timespan', 'timespans', 'timeline', 'timelines',
    'elapse', 'elapsed', 'length', 'timewise',
    'velocity', 'pace', 'rate', 'tempo',
    'pass', 'passing', 'passed',
]

# Seed words that receive colourbias 1 in the co-occurrence network.
# NOTE(review): 'speedy' is listed twice. It is kept as-is because the seed
# count (81) and the frequency statistics of the null model depend on it.
ftp_words = [
    'quick', 'quicker', 'quickly', 'quickest',
    'fast', 'faster', 'fastest', 'fastened',
    'rapid', 'rapidly',
    'short', 'shorter', 'shortly', 'shortest',
    'speedy', 'speedy', 'speeded', 'speedier',
    'hurry', 'hurried',
    'swift', 'swifter', 'swiftly',
    'haste', 'hasty', 'brisk', 'turbo',
    'accelerate', 'acceleration', 'accelerated', 'accelerating',
]

# Seed words that receive colourbias -1 in the co-occurrence network.
# NOTE(review): 'longer' is listed twice (see the note on 'speedy' above).
stp_words = [
    'slow', 'slower', 'slowly', 'slows', 'slowed', 'slowest', 'slowing', 'slowdown',
    'long', 'looong', 'longer', 'longer', 'longest',
    'steady',
    'deceleration', 'decelerate', 'decelerating', 'decelerated',
    'dilatory', 'dilation',
    'infinity', 'eternity',
    'lengthy', 'prolonged', 'protracted', 'extended',
    'unending', 'endless',
]

# All seed words in alphabetical order (duplicates included).
time_words = sorted([*ntp_words, *ftp_words, *stp_words])


<h4> Get context window function </h4>

- Used to iterate through a list and yield C words before and after each seed word.

In [5]:
# Get context window function for time_words
def get_windows(words, C, seeds=None):
    """Yield the context window around every seed word found in ``words``.

    Parameters
    ----------
    words : list
        Tokens of one report. It must support slicing and ``+``
        concatenation (a list of tokens in this notebook).
    C : int
        Number of tokens taken on each side of the seed word.
    seeds : iterable of str, optional
        Words that act as seeds. Defaults to the notebook-level
        ``time_words``; pass another collection (for example a random word
        list) to reuse the function for a different seed set.

    Yields
    ------
    (context_words, center_word)
        ``context_words`` holds the C tokens before the seed followed by the
        C tokens after it (the seed itself is left out); ``center_word`` is
        the seed.

    Notes
    -----
    Only positions ``C <= i < len(words) - C`` are inspected, so a seed
    closer than C tokens to either end of the report is skipped and every
    yielded window has exactly 2*C tokens.
    """
    # A set makes the per-token membership test O(1) instead of scanning the
    # whole seed list for every token of every report.
    seed_set = frozenset(time_words if seeds is None else seeds)
    for i in range(C, len(words) - C):
        center_word = words[i]
        if center_word in seed_set:
            yield words[i - C:i] + words[i + 1:i + C + 1], center_word

<h4> Context window examples </h4>

- Get context window and report url for specific time seed words. 

- The condition marked `# line 7` in the cell below can be adjusted to get the context windows for specific seed or context words. This is very useful for understanding nodes in the co-occurrence network graph (later).

In [None]:
#co-occurrences in context window examples

listx = []


def context_words_edges_vertices(C, listx, required_words=("symphony", "silence")):
    """Print every context window that contains all of ``required_words``.

    Parameters
    ----------
    C : int
        Context window size passed on to ``get_windows``.
    listx : list
        Extended in place with the context words of every matching window.
    required_words : iterable of str, optional
        Words that must all occur among the context words for a window to be
        reported. Defaults to the pair the notebook originally searched for.

    For each match the substance, the C words before the seed, the seed, the
    words after the seed and the report url are printed.
    """
    # Walk the columns in parallel instead of looking rows up with
    # df.loc[i, ...]: an enumerate counter only equals the index label when
    # df carries a default 0..n-1 index.
    for text, url, substance in zip(df["text"], df["url"], df["substance"]):
        for context_words, center_word in get_windows(text, C):
            #can look for specific words
            if all(word in context_words for word in required_words): # line 7
                print(substance)
                print(context_words[0:C])
                print(center_word)
                print(context_words[C:])
                print(url)
                print("\n")
                listx += context_words


context_words_edges_vertices(C=5, listx=listx)

<h4> Get all co-occurrences in Time corpus  </h4>

- Time corpus = All context windows seeded by all time words.

- Get all pairs of co-occurring words in each context window in the Time corpus. 

- Pairs stored in dataframe (df2) in the following format:

    |substance| classes | seed   | source    | target   | weight | colourbias | 
    |----------------| ------------------------- | ------ | --------- | -------- | ------ | ---------- |
    |LSD| Serotonergic psychedelics | slowly | periphery | abstract | 1      | -0.1       |

<br>

- The colourbias encodes which seed list the seed word belongs to: 0 for ntp_words, 1 for ftp_words and -1 for stp_words. This score is used for visualization later. 

In [None]:
%%time
#co-occurrence network - edges
TIME_CORPUS_COLUMNS = ["substance", "classes", "seed", "source", "target", "colourbias"]


def context_words_edges_vertices(C, chunk_size=100, out_prefix="timecorpus"):
    """Write all word pairs of every time-seeded context window to chunk files.

    The reports in ``df`` are processed ``chunk_size`` at a time. For every
    context window returned by ``get_windows`` each unordered pair of context
    words becomes one row (substance, classes, seed, source, target,
    colourbias). Each chunk is pickled to ``<out_prefix><k>.pkl``, which keeps
    the frame held in memory small.

    Parameters
    ----------
    C : int
        Context window size passed on to ``get_windows``.
    chunk_size : int, optional
        Number of reports per chunk file.
    out_prefix : str, optional
        Path prefix of the chunk files.

    Returns
    -------
    int
        Number of chunk files written. It is derived from ``len(df)``, so no
        report is dropped when the corpus size changes.
    """
    # colourbias per seed: 0 for ntp, 1 for ftp, -1 for stp. The lists are
    # applied in reverse priority so that ntp wins over ftp, and ftp over stp,
    # should a word ever appear in more than one list.
    colourbias_by_seed = {}
    for seed_words, bias in ((stp_words, -1), (ftp_words, 1), (ntp_words, 0)):
        colourbias_by_seed.update(dict.fromkeys(seed_words, bias))

    n_chunks = 0
    for k, start in enumerate(range(0, len(df), chunk_size)):
        print(k)
        # Positional slicing keeps text, substance and classes aligned even
        # when df does not carry a default 0..n-1 index.
        chunk = df.iloc[start:start + chunk_size]
        # Rows are collected in a plain list and turned into a frame once;
        # appending to a DataFrame row by row gets slower with every row.
        rows = []
        for text, substance, class_ in zip(chunk["text"], chunk["substance"], chunk["classes"]):
            for context_words, center_word in get_windows(text, C):
                bias = colourbias_by_seed.get(center_word)
                if bias is None:
                    # seed is in none of the three lists: nothing to record
                    continue
                for idx, a in enumerate(context_words):
                    for b in context_words[idx + 1:]:
                        rows.append((substance, class_, center_word, a, b, bias))
        pd.DataFrame(rows, columns=TIME_CORPUS_COLUMNS).to_pickle(out_prefix + str(k) + ".pkl")
        n_chunks = k + 1
    return n_chunks


#adjust context window C
n_chunks = context_words_edges_vertices(C=4)


#concatenate dataframe parts
# Read every chunk and concatenate once. Empty chunks are skipped so that they
# cannot influence the column dtypes of the result.
parts = [pd.read_pickle("timecorpus" + str(k) + ".pkl") for k in range(n_chunks)]
parts = [part for part in parts if not part.empty]
if parts:
    df2 = pd.concat(parts, axis=0, ignore_index=True)
else:
    df2 = pd.DataFrame(columns=TIME_CORPUS_COLUMNS)


#Format and save file
#keep "timecorpus_C=4.pkl"; the chunk files timecorpus<k>.pkl can be deleted afterwards
df2.to_pickle("timecorpus_C=4.pkl")

<h4> EXTRA: Get all co-occurrences from random seed words (null model) </h4>

 This is separate from the pipeline that generates the Time corpus. It computes the co-occurrences for networks built from random seed words. 
 
 Steps:

 - Calculate distribution characteristics of time perception seed words.
 - Create random list of unique words (randomly selected from Erowid corpus) that match the distribution characteristics of time perception seed words. 
 - For each random list, compute the co-occurrences for the set context window and save them as a file.
 - For me, each random network takes ~1h of compute time, so for a high number of random networks this cell may run for a couple of days. 


<br>

Beforehand: 
- Create local folders 'Temp Random corpus' and 'Random corpus'

In [None]:
%%time

#frequency of timewords in Erowid corpus
# corpus_dict already stores how often each word occurs in corpus_list, so one
# dictionary lookup per seed replaces a full scan of the corpus per seed.
# Words that never occur count as 0, exactly as corpus_list.count() reports.
time_words_count_list = [corpus_dict.get(word, 0) for word in time_words]

# Calculate the mean
time_mean = sum(time_words_count_list) / len(time_words_count_list)

# Calculate the (population) variance
time_variance = sum((x - time_mean) ** 2 for x in time_words_count_list) / len(time_words_count_list)

# Calculate the standard deviation
time_std_deviation = math.sqrt(time_variance)

# Cumulative frequency of all seed words
time_total = sum(time_words_count_list)


# Settings a reader might want to tune
N_NULL_NETWORKS = 30  # number of random (null) networks to build (compute time: ~1h each)
NULL_MODEL_C = 4  # context window size
NULL_MODEL_SEED = 0  # fixed so the random word lists are reproducible; None gives a fresh draw per run
RANDOM_CORPUS_COLUMNS = ["substance", "classes", "seed", "source", "target"]


def _within_5_percent(value, target):
    """Return True when ``value`` lies strictly within +-5% of ``target``."""
    return (0.95 * target) < value < (1.05 * target)


# Get context window function for random_words
def modified_get_windows(words, C, seeds=None):
    """Yield ``(context_words, center_word)`` for every seed found in ``words``.

    ``context_words`` holds the C tokens before and the C tokens after the
    seed. Seeds closer than C tokens to either end of ``words`` are skipped.
    ``seeds`` defaults to the notebook-level ``random_list``.
    """
    seed_set = frozenset(random_list if seeds is None else seeds)
    for i in range(C, len(words) - C):
        center_word = words[i]
        if center_word in seed_set:
            yield words[i - C:i] + words[i + 1:i + C + 1], center_word


#co-occurrence network - edges
def random_words_edges_vertices(C, seeds, chunk_size=100, out_dir="Temp Random corpus"):
    """Write all word pairs of every ``seeds``-seeded context window to chunk files.

    The reports in ``df`` are processed ``chunk_size`` at a time and every
    chunk is pickled to ``<out_dir>/timecorpus<k>.pkl``, which keeps the frame
    held in memory small. ``out_dir`` must already exist.

    This function has its own name so that it does not replace the
    time-corpus ``context_words_edges_vertices`` defined earlier in the
    notebook.

    Returns the number of chunk files written (derived from ``len(df)``).
    """
    seed_set = frozenset(seeds)
    n_chunks = 0
    for k, start in enumerate(range(0, len(df), chunk_size)):
        print(k)
        # Positional slicing keeps text, substance and classes aligned even
        # when df does not carry a default 0..n-1 index.
        chunk = df.iloc[start:start + chunk_size]
        # Collect rows in a list and build the frame once; appending to a
        # DataFrame row by row gets slower with every row.
        rows = []
        for text, substance, class_ in zip(chunk["text"], chunk["substance"], chunk["classes"]):
            for context_words, center_word in modified_get_windows(text, C, seed_set):
                for idx, a in enumerate(context_words):
                    for b in context_words[idx + 1:]:
                        rows.append((substance, class_, center_word, a, b))
        pd.DataFrame(rows, columns=RANDOM_CORPUS_COLUMNS).to_pickle(out_dir + "/timecorpus" + str(k) + ".pkl")
        n_chunks = k + 1
    return n_chunks


# Candidate words: every distinct word of the corpus, in first-occurrence
# order. Unlike list(set(...)) this order is the same on every run, which
# together with the seeded generator makes the drawn lists reproducible.
keys = list(corpus_dict)
n_seeds = len(time_words)  # random lists have as many words as there are time seeds (81)
rng = random.Random(NULL_MODEL_SEED)

for j in range(N_NULL_NETWORKS):
    print(j)
    # Rejection sampling: keep drawing until a list matches the time words.
    # rng.sample raises ValueError if the corpus has fewer than n_seeds
    # distinct words.
    while True:
        random_list = rng.sample(keys, k=n_seeds)
        random_count_list = [corpus_dict[key] for key in random_list]
        random_mean = sum(random_count_list) / len(random_count_list)
        random_variance = sum((x - random_mean) ** 2 for x in random_count_list) / len(random_count_list)
        random_std_deviation = math.sqrt(random_variance)
        #if a random word list shares the same number of words (81) and a +-5% mean, standard deviation and cumulative frequency with the time words, it's chosen
        if (
            _within_5_percent(sum(random_count_list), time_total)
            and _within_5_percent(random_mean, time_mean)
            and _within_5_percent(random_std_deviation, time_std_deviation)
        ):
            break
    print(random_list)

    #adjust context window via NULL_MODEL_C
    n_random_chunks = random_words_edges_vertices(C=NULL_MODEL_C, seeds=random_list)

    #concatenate dataframe parts
    # Read every chunk and concatenate once. Empty chunks are skipped so that
    # they cannot influence the column dtypes of the result.
    random_parts = [
        pd.read_pickle("Temp Random corpus/timecorpus" + str(k) + ".pkl")
        for k in range(n_random_chunks)
    ]
    random_parts = [part for part in random_parts if not part.empty]
    if random_parts:
        random_df2 = pd.concat(random_parts, axis=0, ignore_index=True)
    else:
        random_df2 = pd.DataFrame(columns=RANDOM_CORPUS_COLUMNS)

    #Format and save file
    # Stored under its own name so the time-corpus frame df2 is not overwritten.
    # The chunk files in "Temp Random corpus" are overwritten by the next network.
    random_df2.to_pickle(f"Random corpus/_randomcorpus_C={NULL_MODEL_C}_j={j}.pkl")