<h1> Get Tf-Idf (For co-occurrence networks only) </h1>

This is a $tf\text{-}idf$ implementation where we calculate the $tf$ (term frequency) from the time corpus of each substance class, i.e. the context windows around the seed time words. The $idf$ (inverse document frequency) is calculated by concatenating the whole Erowid corpus, removing the time corpus of the particular substance class, and splitting the remainder into docs of 50 words. 

In [6]:
#import functions
import pandas as pd
import pickle
from collections import Counter, OrderedDict, defaultdict
from math import log
import math

#import data
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
# Later cells read the "text", "classes" and "substance" columns of this frame;
# each "text" entry is iterated word by word (presumably a list of tokens -- TODO confirm).
df = pd.read_pickle("processed_data_3.pkl")

<h3> Seed words </h3>

In [7]:
# Seed words, grouped in three lists that are merged into `time_words` below.
# NOTE(review): the prefixes presumably stand for neutral / fast / slow time perception -- TODO confirm.
ntp_words = ['time', 'period', 'periods', 'duration', 'clock', 'temporal', 'spacetime', 'timespan', 'timespans', 'timeline', 'timelines', 'elapse', 'elapsed', 'length', 'timewise', 'velocity', 'pace', 'rate', 'tempo', 'pass', 'passing', 'passed']
# NOTE(review): 'speedy' is listed twice; harmless because the list is only used for membership tests.
ftp_words = ['quick','quicker', 'quickly', 'quickest', 'fast', 'faster', 'fastest', 'fastened', 'rapid','rapidly', 'short', 'shorter', 'shortly', 'shortest','speedy', 'speedy','speeded', 'speedier', 'hurry', 'hurried', 'swift', 'swifter', 'swiftly', 'haste', 'hasty', 'brisk', 'turbo', 'accelerate', 'acceleration', 'accelerated', 'accelerating']
# NOTE(review): 'longer' is listed twice; harmless for the same reason.
stp_words = ['slow', 'slower', 'slowly', 'slows', 'slowed', 'slowest', 'slowing', 'slowdown', 'long', 'looong', 'longer', 'longer', 'longest', 'steady', 'deceleration', 'decelerate', 'decelerating', 'decelerated', 'dilatory', 'dilation', 'infinity', 'eternity', 'lengthy', 'prolonged', 'protracted', 'extended', 'unending', 'endless']
# All seed words in one alphabetically sorted list (duplicates are kept); read by
# get_windows and get_time_indices to decide which tokens are window centres.
time_words = sorted(ntp_words + ftp_words + stp_words)


<h3> Functions </h3>

In [3]:
# Get context window function for time_words
def get_windows(words, C, seed_words=None):
    """Yield ``(context_words, center_word)`` for every seed word found in ``words``.

    Parameters
    ----------
    words : list
        Sequence of tokens to scan. Tokens must be hashable (e.g. strings).
    C : int
        Number of tokens taken on each side of a seed word.
    seed_words : iterable, optional
        Tokens that count as window centres. Defaults to the notebook-level
        ``time_words`` list, so existing two-argument calls behave as before.

    Yields
    ------
    tuple
        ``(context_words, center_word)`` where ``context_words`` is the ``C``
        tokens before plus the ``C`` tokens after the centre (centre excluded).

    Notes
    -----
    Only positions ``C <= i < len(words) - C`` are considered, so every window
    is complete; seed words closer than ``C`` tokens to either end are skipped.
    """
    if seed_words is None:
        seed_words = time_words
    # A set gives constant-time membership instead of scanning a list per token.
    seeds = frozenset(seed_words)
    for i in range(C, len(words) - C):
        center_word = words[i]
        if center_word in seeds:
            context_words = words[(i - C):i] + words[(i + 1):(i + C + 1)]
            yield context_words, center_word

# Get indices for each seed word in corpus list
def get_time_indices(words, C, seed_words=None):
    """Yield the index of every seed word in ``words`` that has a full context window.

    Parameters
    ----------
    words : list
        Sequence of tokens to scan. Tokens must be hashable (e.g. strings).
    C : int
        Context-window half-width; only positions ``C <= i < len(words) - C``
        are inspected.
    seed_words : iterable, optional
        Tokens to look for. Defaults to the notebook-level ``time_words`` list,
        so existing two-argument calls behave as before.

    Yields
    ------
    int
        Positions in ``words`` holding a seed word, in ascending order.
    """
    if seed_words is None:
        seed_words = time_words
    # A set gives constant-time membership instead of scanning a list per token.
    seeds = frozenset(seed_words)
    for i in range(C, len(words) - C):
        if words[i] in seeds:
            yield i

<h3> Erowid corpus </h3>

- class_sorted_df = df sorted by class alphabetically
- corpus_list = Erowid corpus concatenated into one word list, in class-sorted order

In [4]:
# Context-window half-width: number of words kept on each side of a seed word -- change C here
C = 4

# class_sorted_df: df ordered alphabetically by class, re-indexed 0..n-1 (df itself is left untouched)
class_sorted_df = (
    df.sort_values(by=["classes"], ascending=True)
      .reset_index(drop=True)
)

# corpus_list: every report flattened into one list, following the class-sorted row order
corpus_list = [token for report in class_sorted_df.text for token in report]

<h3> Tf-idf info dictionary for each class </h3> 

- classes_info_dict = stores tf-idf info for each class separately.

Format:

```ruby
{Stimulants:[[XXX, XXX], #'Stimulants corpus' indexed in corpus_list
              ["word 1", "word 2", "word 1"], #tf list
              ["try new stimulants was excited ...", "closed my eyes and ..."] #idf doc list (each doc 50 words)  
            ]}

```

In [None]:
#class info dict: 1st list: start and end index, 2nd list:tf list, 3rd list: idf doc list

IDF_DOC_LEN = 50  # number of words per pseudo-document used for the idf statistics

# Offsets of every report inside corpus_list, computed once for all classes.
token_counts = class_sorted_df["text"].map(len)
row_stops = token_counts.cumsum()        # exclusive end offset of each report
row_starts = row_stops - token_counts    # offset of each report's first word

# Spans are stored as [first_index, last_index], both inclusive.
# An empty span is encoded as [0, -1].
classes_info_dict = {"all": [[0, len(corpus_list) - 1], [], []]}
for class_ in class_sorted_df.classes.unique():
    classes_info_dict[class_] = [[0, -1], [], []]
    is_class = class_sorted_df["classes"] == class_
    if is_class.any():
        classes_info_dict[class_][0][0] = int(row_starts[is_class].min())
        classes_info_dict[class_][0][1] = int(row_stops[is_class].max()) - 1

# Seed-word positions in the whole corpus do not depend on the class, so scan only once.
time_indices = list(get_time_indices(corpus_list, C=C))

#add values to #class info dict
for class_ in classes_info_dict:
    class_index_start, class_index_last = classes_info_dict[class_][0]
    # Slices and ranges need an exclusive bound; using the inclusive last index directly
    # would silently drop the last word of every class.
    class_index_stop = class_index_last + 1

    #tf words list per class: all context words around seed words inside this class
    tf_words_list = []
    for context_words, center_word in get_windows(corpus_list[class_index_start:class_index_stop], C=C):
        tf_words_list += context_words

    #save tf words list in dict
    classes_info_dict[class_][1] += tf_words_list

    # Corpus positions covered by a context window (seed word +/- C) of this class.
    # A set is enough: nothing is deleted in place, so no ordering is required.
    to_delete_set = set()
    for i in time_indices:
        if class_index_start <= i < class_index_stop:
            to_delete_set.update(range(i - C, i + C + 1))

    # Only keep text in corpus list that does not fall inside one of those context windows
    idf_words_list = [word for i, word in enumerate(corpus_list) if i not in to_delete_set]

    #Convert idf words into list of string docs with IDF_DOC_LEN words each.
    # Every word is kept: the last doc may be shorter, and an empty word list gives no docs.
    idf_doc_list = [
        " ".join(idf_words_list[k:k + IDF_DOC_LEN])
        for k in range(0, len(idf_words_list), IDF_DOC_LEN)
    ]

    #save to dict
    classes_info_dict[class_][2] += idf_doc_list

<h3> Calculate tf-idf values for word in each class and collect in dataframe </h3>

In [None]:
IDF_DF_WEIGHT = 5    # multiplier applied to the document frequency in the idf denominator
TF_LOG_BASE = 1.01   # base of the logarithm applied to raw term counts

classes = ["all"] + list(class_sorted_df.classes.unique())

# One row per unique corpus word, one column per class, initialised to 0
tfidf_df = pd.DataFrame({"word": list(set(corpus_list))})
for class_ in classes:
    tfidf_df[class_] = 0.0


for class_ in classes_info_dict:
    print(class_)
    #get tf words list and idf doc list for every class
    tf_words_list = classes_info_dict[class_][1]
    idf_doc_list = classes_info_dict[class_][2]

    # Count the occurrences of each word in the tf list
    tf_words_counter = Counter(tf_words_list)

    # Document frequency: number of idf docs containing each word as a whole token.
    # Docs were built with " ".join(...), so splitting on " " recovers the tokens
    # (assumes tokens contain no spaces). Testing `word in doc` on the raw string
    # would also count substrings, e.g. "time" inside "sometimes".
    doc_freq = Counter()
    for doc in idf_doc_list:
        doc_freq.update(set(doc.split(" ")))

    n_docs = len(idf_doc_list)

    # Calculate TF-IDF values for each unique word in the tf list
    tfidf_dict = {}
    for word, count in tf_words_counter.items():
        tf = log(count, TF_LOG_BASE)
        idf = log(n_docs / (1 + doc_freq[word] * IDF_DF_WEIGHT))
        tfidf_dict[word] = round(tf * idf, 2)

    # Words without a tf-idf value for this class keep the value 0
    tfidf_df[class_] = tfidf_df["word"].map(tfidf_dict).fillna(0)

#save
tfidf_df.to_pickle(f"tfidf_df_C={C}.pkl")

<h3> Below is a similar implementation, but for substances with a large number of reports </h3>

In [4]:
# Context-window half-width: number of words kept on each side of a seed word -- change C here
C = 4
# Substances that are analysed individually in the following cells
substances = ["lsd", "psilocybin mushrooms", "dmt", "mdma", "cannabis spp.", "salvia divinorum"]

# substance_sorted_df: df ordered alphabetically by substance, re-indexed 0..n-1
substance_sorted_df = (
    df.sort_values(by=["substance"], ascending=True)
      .reset_index(drop=True)
)

# substance_corpus_list: every report flattened into one list, following the substance-sorted row order
substance_corpus_list = [token for report in substance_sorted_df.text for token in report]

In [5]:
#Substance info dict: 1st list: start and end index, 2nd list:tf list, 3rd list: idf doc list

IDF_DOC_LEN = 50  # number of words per pseudo-document used for the idf statistics

# Offsets of every report inside substance_corpus_list, computed once for all substances.
token_counts = substance_sorted_df["text"].map(len)
row_stops = token_counts.cumsum()        # exclusive end offset of each report
row_starts = row_stops - token_counts    # offset of each report's first word

# Spans are stored as [first_index, last_index], both inclusive.
# A substance with no report keeps the empty span [0, -1].
substance_info_dict = {}
for substance in substances:
    substance_info_dict[substance] = [[0, -1], [], []]
    is_substance = substance_sorted_df["substance"] == substance
    if is_substance.any():
        # Taking min/max over the matching rows also works when the substance is the
        # first or the last one in the sorted frame.
        substance_info_dict[substance][0][0] = int(row_starts[is_substance].min())
        substance_info_dict[substance][0][1] = int(row_stops[is_substance].max()) - 1

# Seed-word positions in the whole corpus do not depend on the substance, so scan only once.
time_indices = list(get_time_indices(substance_corpus_list, C=C))

#add values to #substance info dict
for substance in substance_info_dict:
    substance_index_start, substance_index_last = substance_info_dict[substance][0]
    # Slices and ranges need an exclusive bound
    substance_index_stop = substance_index_last + 1

    #tf words list per substance: all context words around seed words inside this substance
    tf_words_list = []
    for context_words, center_word in get_windows(substance_corpus_list[substance_index_start:substance_index_stop], C=C):
        tf_words_list += context_words

    #save tf words list in dict
    substance_info_dict[substance][1] += tf_words_list

    # Corpus positions covered by a context window (seed word +/- C) of this substance.
    # A set is enough: nothing is deleted in place, so no ordering is required.
    to_delete_set = set()
    for i in time_indices:
        if substance_index_start <= i < substance_index_stop:
            to_delete_set.update(range(i - C, i + C + 1))

    # Only keep text in corpus list that does not fall inside one of those context windows
    idf_words_list = [word for i, word in enumerate(substance_corpus_list) if i not in to_delete_set]

    #Convert idf words into list of string docs with IDF_DOC_LEN words each.
    # Every word is kept: the last doc may be shorter, and an empty word list gives no docs.
    idf_doc_list = [
        " ".join(idf_words_list[k:k + IDF_DOC_LEN])
        for k in range(0, len(idf_words_list), IDF_DOC_LEN)
    ]

    #save to dict
    substance_info_dict[substance][2] += idf_doc_list

In [None]:


IDF_DF_WEIGHT = 5    # multiplier applied to the document frequency in the idf denominator
TF_LOG_BASE = 1.01   # base of the logarithm applied to raw term counts

# One row per unique corpus word, one column per substance, initialised to 0
tfidf_df = pd.DataFrame({"word": list(set(substance_corpus_list))})
for substance in substances:
    tfidf_df[substance] = 0.0


for substance in substance_info_dict:
    print(substance)
    #get tf words list and idf doc list for every substance
    tf_words_list = substance_info_dict[substance][1]
    idf_doc_list = substance_info_dict[substance][2]

    # Count the occurrences of each word in the tf list
    tf_words_counter = Counter(tf_words_list)

    # Document frequency: number of idf docs containing each word as a whole token.
    # Docs were built with " ".join(...), so splitting on " " recovers the tokens
    # (assumes tokens contain no spaces). Testing `word in doc` on the raw string
    # would also count substrings, e.g. "time" inside "sometimes".
    doc_freq = Counter()
    for doc in idf_doc_list:
        doc_freq.update(set(doc.split(" ")))

    n_docs = len(idf_doc_list)

    # Calculate TF-IDF values for each unique word in the tf list
    tfidf_dict = {}
    for word, count in tf_words_counter.items():
        tf = log(count, TF_LOG_BASE)
        idf = log(n_docs / (1 + doc_freq[word] * IDF_DF_WEIGHT))
        tfidf_dict[word] = round(tf * idf, 2)

    # Words without a tf-idf value for this substance keep the value 0
    tfidf_df[substance] = tfidf_df["word"].map(tfidf_dict).fillna(0)

#save
tfidf_df.to_pickle(f"tfidf_df_C={C}_SUBSTANCES.pkl")