<h1> Analysis 2 (BERTopic) </h1>



This analysis involved using the (detailed) [BERTopic Documentation](https://maartengr.github.io/BERTopic/api/bertopic.html). Since it involved many manual steps that were specific to our data (e.g. removing certain topics by index), only the key steps are shown.

<h4> Import data </h4>

Remember your data is in this format:

|index| url | substance    | classes   | center_word | doc | 
|----------------| ------------------------- | ------ | --------- | -------- | ------ |
|5| https://erowid.org/experiences/... | 1p-lsd | Serotonergic psychedelics | time | I was talking around the ... |


In [None]:
#imports
import pickle
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.offline as pyo
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

bertopic versions
all C=15

NEW
1) bertopic_model_1: topic_model = BERTopic(seed_topic_list=seed_topic_list) - docs, n-gram, stop_words
2) bertopic_model_2: topic_model = BERTopic(seed_topic_list=seed_topic_list) - docs2, n-gram, stop_words, calculate_probabilities=True


For Antidepressants / antipsychotics, Deliriants and Psilocybin mushrooms (surprisingly, this was also necessary for the latter), the minimum topic size was reduced to 4 -> more topics appeared, also above count 4; these were then excluded above 4.

For each substance class, do removal of topics manually.

In [3]:
# Topic indices that were judged (by manual inspection) to be removed, keyed by
# substance class / substance. Only the real topic indices are listed here.
_REMOVE_PAD = 1000  # filler value; used as padding only, never listed as a real topic index
_remove_topics_by_group = {
    "Serotonergic psychedelics": [0, 3, 5, 6, 8, 10, 11, 13, 14, 15, 16, 17, 18, 25, 26, 30, 31, 34, 43, 44, 46, 50, 51, 58, 60, 61, 63, 65, 68, 70, 73, 81, 86, 90, 91, 93, 94, 97, 99, 105, 108, 111, 112, 116, 118, 120, 122, 126, 130, 142],
    "Dissociative psychedelics": [0, 3, 4, 6, 8, 12, 13, 19, 20, 21, 22, 26, 28, 29, 31, 36, 43, 44, 47],
    "Entactogens": [0, 1, 2, 4, 5, 7, 8, 9, 11, 14, 18, 20, 22, 24, 25, 26, 27, 36, 40, 44, 47, 50, 52, 53, 54, 55, 57, 61, 63, 64, 65],
    "Depressant sedatives": [0, 1, 3, 6, 9, 10, 15, 16, 17, 20, 21, 22, 23, 30, 31, 32, 33, 35, 38, 41, 45, 46, 47, 51, 52, 53, 56, 57, 61, 62, 65, 74, 76, 79, 81, 82, 89, 90, 96, 97],
    "Stimulants": [0, 1, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20, 21, 22, 23, 26, 27, 28, 30, 33, 37, 39, 45, 47, 50, 52, 54, 55, 56, 59],
    "Deliriants": [0, 2, 3, 4, 5, 7, 8, 10, 14, 15, 23, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51],
    "Antidepressants antipsychotics": [2, 3, 4, 5, 6, 8, 9, 10, 11, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
    "LSD": [1, 3, 4, 5, 7, 10, 11, 19, 20, 21, 22, 28, 38, 41, 42],
    "MDMA": [0, 1, 2, 5, 6, 7, 12, 15, 17, 19, 21, 22, 28, 30, 32, 34, 37, 38, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51, 52, 54, 59],
    "Cannabis spp": [0, 1, 2, 7, 14, 17, 19, 21, 29, 30, 31, 35, 38, 39],
    "Salvia divinorum": [0, 1, 3, 4, 6, 7, 9, 11, 15, 16, 20, 28, 34, 35, 37, 38, 40, 45, 46, 56],
    "DMT": [1, 3, 4, 6, 9, 12, 14, 15, 19, 20, 25, 31],
}

# A DataFrame needs equally long columns, so shorter lists are padded with the
# filler value up to the longest list (50 entries). Computing the padding avoids
# hand-counting fillers (a miscount raises ValueError on column assignment) and
# removes the stray `0` that previously ended the MDMA padding.
_remove_rows = max(len(ids) for ids in _remove_topics_by_group.values())
remove_df1 = pd.DataFrame({
    group: ids + [_REMOVE_PAD] * (_remove_rows - len(ids))
    for group, ids in _remove_topics_by_group.items()
})

psilocybin_mushrooms_remove1 = [0, 1, 3, 8, 12, 13, 14, 21, 22, 23, 24, 26, 31, 38, 52, 55, 57, 58, 66, 70, 72, 82, 85, 90, 95, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244]



depressant_sedatives_remove2 = [2, 4, 5, 8, 10, 19, 29, 31, 41, 45, 52, 55, 61, 66, 69, 73, 79, 82, 83, 87, 89, 91, 93, 94, 96, 101, 102, 104, 108, 110, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253]
deliriants_remove2 = [0, 1, 2, 9, 10, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
antidepressants_antipsychotic_remove2 = [1, 7, 8, 9, 10, 11, 12, 52]
lsd_remove2 = [1, 2, 5, 14, 30, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150]
dmnt_remove2 = [2, 20, 21, 27, 32, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73]

In [4]:
#Get docs and topic model for particular class or substance
substance_or_class = "Serotonergic psychedelics"

# Fitted model and the documents it was fitted on, both stored per class/substance.
topic_model = BERTopic.load(f"BERTopic files/BERTopic models/bertopic_model_1_{substance_or_class}")
docs = pd.read_pickle(f"BERTopic files/BERTopic docs/BERTopic_docs_{substance_or_class}.pkl")

# Topics to drop. "Psilocybin mushrooms" has no column in remove_df1 (its list is
# longer than the 50-row frame), so it is taken from its own list, exactly as the
# batch loop further down does. set() collapses the 1000 padding values.
if substance_or_class == "Psilocybin mushrooms":
    temp_remove_list1 = psilocybin_mushrooms_remove1
else:
    temp_remove_list1 = list(set(remove_df1[substance_or_class].tolist()))

document_info = topic_model.get_document_info(docs)

#get df for class/substance
class_df = pd.read_pickle(f"BERTopic files/BERTopic docs/BERTopic_df_{substance_or_class}.pkl")

#merge df and document info, so doc, ntp-ftp-stp class and topic in one df for color map
# NOTE(review): the join key is the document text; identical texts occurring more
# than once would multiply rows in the merge - confirm the docs are unique.
temp_df = pd.merge(class_df, document_info, left_on='docs', right_on='Document')

#update df based on the remove list: keep only rows whose topic is not removed
df2 = temp_df[~temp_df["Topic"].isin(temp_remove_list1)].reset_index(drop=True)


In [7]:
# Print the keywords of one topic, then each of its representative documents
# together with the metadata of the matching row in df.
topic = 87  # change topic here

list_of_3 = topic_model.get_representative_docs(topic=topic)
print(topic_model.get_topic(topic), end="\n\n\n")

divider = "-----------"
# (printed label, df column) pairs, in output order
metadata_fields = (
    ("Seed", "seed"),
    ("Substance", "substance"),
    ("Class", "classes"),
    ("Url", "url"),
)

for doc in list_of_3:
    print(doc)
    # index label of the first df row whose text equals this document
    i = df.index[df.docs == doc][0]
    print(divider)
    for label, column in metadata_fields:
        print(f"{label}: {df.loc[i, column]!s}")
    print(divider)
    print()

[('fractal', 0.15781796218083613), ('fractals', 0.09967148237011583), ('infinity', 0.03936794261871484), ('endless', 0.029808580435880267), ('arrangements', 0.029772939986120232), ('mathematical', 0.028383255619488658), ('universes', 0.025683584556699914), ('symbol', 0.0250403827875864), ('exploded', 0.023966655871418916), ('thousands', 0.02156385888031063)]


fractal . things , people , objects , all in a colorful light grid of eternity and infinity . this is not what i was expecting . not like this .
-----------
Seed: eternity
Substance: dmt
Class: Serotonergic psychedelics
Url: https://erowid.org/experiences/exp.php?ID=90880
-----------

i once again encountered the fractal mind but PLACEHOLDER PLACEHOLDER it would explode off into infinity but i was still able to comprehend all of it at once . i realized
-----------
Seed: infinity
Substance: lsd
Class: Serotonergic psychedelics
Url: https://erowid.org/experiences/exp.php?ID=18714
-----------

down into the fractal the PLACEHOLDER P

In [21]:
#find similar topics based on keyword(s)
# find_topics is unpacked into two values: the topic ids (printed below) and a
# second value, `similarity`, which is not used in this cell.
# "slow" is the search term; top_n=15 asks for 15 topics.
similar_topics, similarity = topic_model.find_topics("slow", top_n=15)
print(similar_topics)

[33, 12, -1, 193, 111, 119, 190, 172, 22, 99, 222, 211, 2, 62, 175]


<h3> Class topic modelling </h3>

In [None]:
import matplotlib.pyplot as plt

# For every topic, print the weight of each time-perception word type and the
# integer slow/fast ratio; the ratios are collected in x (one entry per topic).
x = []

for topic in df3['Topic'].unique():
    subset = df3[df3['Topic'] == topic]
    print(f"Topic: {topic}")

    # Sum the weights per type instead of int(<array>): that conversion only works
    # when exactly one row matches and fails for zero or several rows.
    weight_by_type = subset.groupby('type')['weight'].sum()
    normal_count = int(weight_by_type.get('Neutral time perception words', 0))
    slow_count = int(weight_by_type.get('Slow time perception words', 0))
    fast_count = int(weight_by_type.get('Fast time perception words', 0))
    print(f"Normal count: {normal_count}, Slow count: {slow_count}, Fast count: {fast_count}")

    if fast_count == 0:
        # The ratio is undefined; keep x aligned with the topics using NaN
        # rather than aborting the whole loop with ZeroDivisionError.
        print("x undefined (fast count is 0)")
        x.append(float("nan"))
        continue

    ratio = int(slow_count / fast_count)
    print(f"x {ratio}")
    x.append(ratio)





In [None]:
import matplotlib.pyplot as plt

# Print, for every topic, the share of fast / normal / slow time-perception weight.
# The previous version iterated `enumerate(df3.Topic)` without binding `i`, so it
# read a stale (or undefined) `i`, and it looked at one row at a time, which can
# only ever give 100% to a single type. The weights are now aggregated per topic.
for topic, topic_rows in df3.groupby("Topic", sort=False):
    weight_by_type = topic_rows.groupby("type")["weight"].sum()

    ftp = weight_by_type.get("Fast time perception words", 0)
    # NOTE(review): this cell used the label "Normal ..." while the ratio cell above
    # uses "Neutral ..."; both are accepted here - confirm which one the data contains.
    ntp = (weight_by_type.get("Normal time perception words", 0)
           + weight_by_type.get("Neutral time perception words", 0))
    stp = weight_by_type.get("Slow time perception words", 0)

    sum_ = ftp + ntp + stp
    print(f"Topic: {topic}")
    if sum_ == 0:
        # No weight at all for this topic: shares are undefined, skip the division.
        print("FTP: n/a,  NTP: n/a,  STP: n/a")
        continue
    print(f"FTP: {(ftp/sum_)*100}%,  NTP: {(ntp/sum_)*100}%,  STP: {(stp/sum_)*100}%")


In [None]:
#get substance class or type labels for each doc
#substance = df2.substance.to_list()
types = df2["type"].to_list()

#topics per class or type - https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html
#df3 = topic_model.topics_per_class(docs2, classes=substances)
df3 = topic_model.topics_per_class(df2.Document.to_list(), classes=types)

#get frequency of substance class/documents of that class
#freq_dict = dict(df2.substance.value_counts())
freq_dict = dict(df2["type"].value_counts())

#the series 'Class' is not about substance classes but holds the labels passed as `classes`
#(here: types). The term class comes from the BERTopic documentation.
# Turn the absolute topic frequency into a share of all documents carrying that label.
# Done as one vectorised division: the former row-by-row `.loc[i, ...]` writes relied
# on a 0..n-1 index and wrote floats into the count column one cell at a time.
df3["Frequency"] = df3["Frequency"] / df3["Class"].map(freq_dict)
    

<h3> Other visualisations </h3>

In [None]:


#hierarchy things
# NOTE(review): `docs2` is not assigned in this cell; it has to exist already
# (presumably the documents that remain after topic removal - confirm).
# NOTE(review): several visualize_* calls below are bare expressions; in a notebook
# cell presumably only the last one is rendered - assign the figures and show them
# explicitly if all are wanted.

#get hierarchical topics
hierarchical_topics = topic_model.hierarchical_topics(docs2)


# Visualize these representations
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)


#topic tree
print(topic_model.get_topic_tree(hierarchical_topics, max_distance=None, tight_layout=False))


#intertopic distance map
topic_model.visualize_topics()

#bar charts
# `XXX` is a placeholder, not a defined name: replace it with the list of topic ids to plot.
topic_model.visualize_barchart(topics=XXX)

#heat map
topic_model.visualize_heatmap(topics=[3, 28])

<h3> Create Erowid quotes </h3>

In [None]:
#create Erowid quotes (ignore)
# Map every class to the distinct substances listed under it, in first-seen order.
dictx = {}
for i, item in enumerate(df.classes):
    substance = df.loc[i, "substance"]
    members = dictx.setdefault(item, [])
    if substance not in members:
        members.append(substance)


# Custom names of all topics whose id is not in remove_list.
listx = [
    custom_name
    for row, custom_name in enumerate(topic_info.CustomName)
    if int(topic_info.loc[row, "Topic"]) not in remove_list
]


#<option> tags for substances
for substances_in_class in dictx.values():
    for substance_name in substances_in_class:
        print(f"<option value='{substance_name}'>{substance_name}</option>")

#<option> tags for classes
for class_name in dictx:
    print(f"<option value='{class_name}'>{class_name}</option>")

<h3> Visualize documents by topic colour and ntp-ftp-stp class colour </h3>

The graph generated here is a modified version of BERTopic's <code>visualize_documents</code> and <code>visualize_hierarchical_documents</code>, so that one can see how the modelled topics reflect normal, fast or slow time perception (ntp, ftp, stp). In colour mode 1 documents are coloured according to their topic, and in colour mode 2 documents are coloured white-red-blue according to their ntp-ftp-stp class (e.g. "slower" seed word -> stp).

To use this function, it has to be added to the BERTopic source code (it takes the fitted model as <code>self</code>). Also, you need to pass a dictionary <code>color_map2</code> with documents as keys and RGB values as values, and a list <code>keep_list3</code> that holds the topic of each document (in the same order as the documents) for all the topics you want to model.

<br>

All the modifications of the BERTopic source code have the comment "#CHANGED HERE". The most important line that was changed is this one:

 ```marker=dict(size=5, opacity=0.5, color = [color_map2[doc] if doc is not None else [255, 255, 255] for doc in selection.doc]) ```

In [None]:
def modified_visualize_documents(self,
                                    docs: List[str],
                                    topics: List[int] = None,
                                    embeddings: np.ndarray = None,
                                    reduced_embeddings: np.ndarray = None,
                                    sample: float = None,
                                    hide_annotations: bool = False,
                                    hide_document_hover: bool = False,
                                    custom_labels: bool = False,
                                    title: str = "<b>Documents and Topics</b>",
                                    width: int = 1200,
                                    height: int = 750,
                                    color_map2 = None, #CHANGED HERE
                                    keep_list3 = None): #CHANGED HERE
    """Visualize documents in 2D with a slider that switches between two colour modes.

    Colour mode 0 draws one trace per topic with plotly's default trace colours.
    Colour mode 1 draws the same points, coloured per document via `color_map2`.

    Arguments:
        self: A fitted BERTopic instance (the function is meant to be used as a
              method of the model, or called with the model as first argument).
        docs: The documents, in the same order as `keep_list3`.
        topics: A selection of topics to visualize. Defaults to all topics that
                occur in `keep_list3`. Topic -1 is never drawn.
        embeddings: The embeddings of all documents in `docs`.
        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.
        sample: Fraction (0-1) of documents kept per topic. Topics with fewer than
                100 documents are always kept completely. None or values above 1
                keep everything.
        hide_annotations: Hide the topic names drawn at the centre of each cluster.
        hide_document_hover: Hide the document text when hovering over a point.
        custom_labels: Whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
        title: Title of the plot.
        width: The width of the figure.
        height: The height of the figure.
        color_map2: Mapping from document text to the marker colour used in colour
                    mode 1. When None, only colour mode 0 is drawn.
        keep_list3: The topic of every document in `docs`. When None, the topics
                    stored on the model (`self.topics_`) are used.

    Returns:
        The plotly figure.

    Example:

    ```python
    fig = modified_visualize_documents(topic_model, docs,
                                       reduced_embeddings=reduced_embeddings,
                                       color_map2=color_map2, keep_list3=topic_per_doc)
    fig.write_html("path/to/file.html")
    ```
    """
    # CHANGED HERE: the topic of each document is supplied by the caller; fall back
    # to the fitted topics instead of failing on set(None).
    topic_per_doc = self.topics_ if keep_list3 is None else keep_list3
    topic_array = np.array(topic_per_doc)

    # Sample the data to optimize for visualization and dimensionality reduction
    if sample is None or sample > 1:
        sample = 1

    indices = []
    for topic in set(topic_per_doc):
        s = np.where(topic_array == topic)[0]
        size = len(s) if len(s) < 100 else int(len(s) * sample)
        indices.extend(np.random.choice(s, size=size, replace=False))
    indices = np.array(indices)

    df = pd.DataFrame({"topic": topic_array[indices]})
    df["doc"] = [docs[index] for index in indices]

    # 2D coordinates: use the given reduced embeddings, otherwise reduce the given
    # (or freshly extracted) embeddings. `sample` is never None at this point, so
    # all inputs are subset with `indices`.
    if reduced_embeddings is not None:
        embeddings_2d = reduced_embeddings[indices]
    else:
        if embeddings is not None:
            embeddings_to_reduce = embeddings[indices]
        else:
            embeddings_to_reduce = self._extract_embeddings(df.doc.to_list(), method="document")
        # NOTE(review): UMAP is not imported in this file; it must be in scope where
        # this function lives (presumably umap-learn's `UMAP`, as in BERTopic) - confirm.
        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings_to_reduce)
        embeddings_2d = umap_model.embedding_

    unique_topics = set(topic_per_doc)
    if topics is None:
        topics = unique_topics

    # Combine data
    df["x"] = embeddings_2d[:, 0]
    df["y"] = embeddings_2d[:, 1]

    # Prepare text and names
    if self.custom_labels_ is not None and custom_labels:
        names = [self.custom_labels_[topic + self._outliers] for topic in unique_topics]
    else:
        names = [f"{topic}_" + "_".join([word for word, value in self.get_topic(topic)][:3]) for topic in unique_topics]

    # One frame per selected topic, shared by both colour modes.
    selected = []
    for name, topic in zip(names, unique_topics):
        if topic in topics and topic != -1:
            # reset_index: the annotation row below is written at label len(selection);
            # with df's original labels that label can belong to a real document row,
            # which would then be overwritten instead of a new row being appended.
            selection = df.loc[df.topic == topic, :].reset_index(drop=True)
            selection["text"] = ""

            if not hide_annotations:
                # extra point at the cluster centre that carries the topic name
                selection.loc[len(selection), :] = [None, None, selection.x.mean(), selection.y.mean(), name]

            selected.append((name, selection))

    def _topic_trace(name, selection, marker):
        # One scatter trace for the documents of a single topic.
        return go.Scattergl(
            x=selection.x,
            y=selection.y,
            hovertext=selection.doc if not hide_document_hover else None,
            hoverinfo="text",
            text=selection.text,
            mode='markers+text',
            name=name,
            textfont=dict(size=12),
            marker=marker
        )

    #CHANGED FROM HERE
    # Colour mode 0: plotly's default colour per topic trace.
    all_traces = [[_topic_trace(name, selection, dict(size=5, opacity=0.5))
                   for name, selection in selected]]

    # Colour mode 1: colour per document; the annotation row (doc is None) is white.
    if color_map2 is not None:
        all_traces.append([
            _topic_trace(
                name,
                selection,
                dict(size=5, opacity=0.5,
                     color=[color_map2[doc] if doc is not None else [255, 255, 255] for doc in selection.doc])
            )
            for name, selection in selected
        ])
    #TO HERE

    # Track and count traces
    nr_traces_per_set = [len(traces) for traces in all_traces]
    trace_indices = [(0, nr_traces_per_set[0])]
    for index, nr_traces in enumerate(nr_traces_per_set[1:]):
        start = trace_indices[index][1]
        end = nr_traces + start
        trace_indices.append((start, end))

    # Visualization
    fig = go.Figure()
    for traces in all_traces:
        for trace in traces:
            fig.add_trace(trace)

    # Only the first colour mode is visible initially
    for index in range(len(fig.data)):
        if index >= nr_traces_per_set[0]:
            fig.data[index].visible = False

    # Create and add slider: one step per colour mode, showing only its traces
    steps = []
    for mode, (start, end) in enumerate(trace_indices):
        step = dict(
            method="update",
            label=str(mode),
            args=[{"visible": [False] * len(fig.data)}]
        )
        for trace_position in range(start, end):
            step["args"][0]["visible"][trace_position] = True
        steps.append(step)

    sliders = [dict(
        currentvalue={"prefix": "Colour mode: "},
        pad={"t": 20},
        steps=steps
    )]

    # Add grid in a 'plus' shape
    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))
    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))
    fig.add_shape(type="line",
                x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],
                line=dict(color="#CFD8DC", width=2))
    fig.add_shape(type="line",
                x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,
                line=dict(color="#9E9E9E", width=2))
    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)

    # Stylize layout
    fig.update_layout(
        sliders=sliders,
        template="simple_white",
        title={
            'text': f"{title}",
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width,
        height=height
    )

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    return fig

In [None]:
# Topic indices that were judged (by manual inspection) to be removed, keyed by
# substance class / substance. Only the real topic indices are listed here.
_REMOVE_PAD = 1000  # filler value; used as padding only, never listed as a real topic index
_remove_topics_by_group = {
    "Serotonergic psychedelics": [0, 3, 5, 6, 8, 10, 11, 13, 14, 15, 16, 17, 18, 25, 26, 30, 31, 34, 43, 44, 46, 50, 51, 58, 60, 61, 63, 65, 68, 70, 73, 81, 86, 90, 91, 93, 94, 97, 99, 105, 108, 111, 112, 116, 118, 120, 122, 126, 130, 142],
    "Dissociative psychedelics": [0, 3, 4, 6, 8, 12, 13, 19, 20, 21, 22, 26, 28, 29, 31, 36, 43, 44, 47],
    "Entactogens": [0, 1, 2, 4, 5, 7, 8, 9, 11, 14, 18, 20, 22, 24, 25, 26, 27, 36, 40, 44, 47, 50, 52, 53, 54, 55, 57, 61, 63, 64, 65],
    "Depressant sedatives": [0, 1, 3, 6, 9, 10, 15, 16, 17, 20, 21, 22, 23, 30, 31, 32, 33, 35, 38, 41, 45, 46, 47, 51, 52, 53, 56, 57, 61, 62, 65, 74, 76, 79, 81, 82, 89, 90, 96, 97],
    "Stimulants": [0, 1, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20, 21, 22, 23, 26, 27, 28, 30, 33, 37, 39, 45, 47, 50, 52, 54, 55, 56, 59],
    "Deliriants": [0, 2, 3, 4, 5, 7, 8, 10, 14, 15, 23, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51],
    "Antidepressants antipsychotics": [2, 3, 4, 5, 6, 8, 9, 10, 11, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
    "LSD": [1, 3, 4, 5, 7, 10, 11, 19, 20, 21, 22, 28, 38, 41, 42],
    "MDMA": [0, 1, 2, 5, 6, 7, 12, 15, 17, 19, 21, 22, 28, 30, 32, 34, 37, 38, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51, 52, 54, 59],
    "Cannabis spp": [0, 1, 2, 7, 14, 17, 19, 21, 29, 30, 31, 35, 38, 39],
    "Salvia divinorum": [0, 1, 3, 4, 6, 7, 9, 11, 15, 16, 20, 28, 34, 35, 37, 38, 40, 45, 46, 56],
    "DMT": [1, 3, 4, 6, 9, 12, 14, 15, 19, 20, 25, 31],
}

# A DataFrame needs equally long columns, so shorter lists are padded with the
# filler value up to the longest list (50 entries). Computing the padding avoids
# hand-counting fillers (a miscount raises ValueError on column assignment) and
# removes the stray `0` that previously ended the MDMA padding.
_remove_rows = max(len(ids) for ids in _remove_topics_by_group.values())
remove_df1 = pd.DataFrame({
    group: ids + [_REMOVE_PAD] * (_remove_rows - len(ids))
    for group, ids in _remove_topics_by_group.items()
})

#topic ids to remove for Psilocybin mushrooms: hand-picked ids below 100, then every id from 100 up to and including 244
psilocybin_mushrooms_remove1 = [
    0, 1, 3, 8, 12, 13, 14, 21, 22, 23, 24, 26, 31, 38, 52, 55, 57, 58, 66, 70, 72, 82, 85, 90, 95,
    *range(100, 245),
]



#second remove lists (one per class / substance): hand-picked ids followed by one unbroken run of ids up to the last one
depressant_sedatives_remove2 = [
    2, 4, 5, 8, 10, 19, 29, 31, 41, 45, 52, 55, 61, 66, 69, 73, 79, 82, 83, 87, 89, 91, 93, 94, 96,
    101, 102, 104, 108, 110,
    *range(113, 254),
]
deliriants_remove2 = [0, 1, 2, 9, 10, *range(14, 53)]
antidepressants_antipsychotic_remove2 = [1, *range(7, 13), 52]
lsd_remove2 = [1, 2, 5, 14, 30, *range(64, 151)]
#used for "DMT" further down (the variable name is spelled "dmnt")
dmnt_remove2 = [2, 20, 21, 27, 32, *range(35, 74)]




#classes / substances that are plotted with a single remove list
substances_classes_list2 = ["Serotonergic psychedelics", "Dissociative psychedelics", "Entactogens", "Stimulants", "Psilocybin mushrooms", "MDMA", "Cannabis spp", "Salvia divinorum"]

#color map: one 3-number color (presumably RGB) per seed-word type
#(spelled 'color' rather than 'colour' to stay consistent with the BERTopic API :D )
color_map1 = {'Neutral time perception words':[255, 255, 255], 'Fast time perception words':[195, 27, 52], "Slow time perception words":[48, 69, 186]}

#graph titles for the names whose slash was dropped so they could be used in file names
slash_titles = {"Depressant sedatives": "Depressant / sedatives", "Antidepressants antipsychotics": "Antidepressants / antipsychotics"}


for substance_or_class in substances_classes_list2:
    print(substance_or_class)

    #remove list: Psilocybin mushrooms has its own list, everything else uses its column of remove_df1
    if substance_or_class == "Psilocybin mushrooms":
        temp_remove_list1 = psilocybin_mushrooms_remove1
    else:
        temp_remove_list1 = list(set(remove_df1[substance_or_class].tolist()))
    #set copy of the remove list so the membership tests below do not rescan a list each time
    temp_remove_set1 = set(temp_remove_list1)

    #get topic model
    temp_topic_model = BERTopic.load(f"BERTopic files/BERTopic models/bertopic_model_1_{substance_or_class}")

    #labels (set before get_document_info is called)
    labels = temp_topic_model.generate_topic_labels(nr_words=3, topic_prefix=True, word_length=15, separator=" - ")
    temp_topic_model.set_topic_labels(labels)

    #get docs for document info
    temp_docs = pd.read_pickle(f"BERTopic files/BERTopic docs/BERTopic_docs_{substance_or_class}.pkl")

    #get df
    temp_df = pd.read_pickle(f"BERTopic files/BERTopic docs/BERTopic_df_{substance_or_class}.pkl")

    #get document info
    temp_document_info = temp_topic_model.get_document_info(temp_docs)

    #merge df and document info, so doc, ntp-ftp-stp class and topic in one df for color map
    temp_df = pd.merge(temp_df, temp_document_info, left_on='docs', right_on='Document')

    #drop the documents of removed topics; the index is renumbered 0..n-1 on a new frame
    #(no in-place change on a slice of temp_df)
    temp_df2 = temp_df[~temp_df["Topic"].isin(temp_remove_list1)].reset_index(drop=True)

    #save updated df2 for future reference
    temp_df2.to_pickle(f"BERTopic files/BERTopic docs/BERTopic_df2_{substance_or_class}.pkl")

    #get docs2 from this updated df
    temp_docs2 = temp_df2.Document.tolist()

    #keep list1 - all topics once (ids start at -1, hence the shifted range)
    temp_nr_topics = len(temp_topic_model.get_topic_info())
    temp_keep_list1 = [topic for topic in range(-1, temp_nr_topics - 1) if topic not in temp_remove_set1]
    #get keep list2 - all topics of docs remaining indexed by docs
    temp_keep_list2 = [topic for topic in temp_topic_model.topics_ if topic not in temp_remove_set1]

    #create color map to assign documents colour of their seed word
    #(the loop variable is not called `type`, so the builtin stays usable after this cell has run)
    temp_keep_set1 = set(temp_keep_list1)
    color_map2 = {}
    for doc, topic, seed_type in zip(temp_df2["Document"], temp_df2["Topic"], temp_df2["type"]):
        if topic in temp_keep_set1:
            color_map2[doc] = color_map1[seed_type]

    #put slash back for title of graph
    title = slash_titles.get(substance_or_class, substance_or_class)

    #the plot is drawn with the second saved model
    temp_topic_model2 = BERTopic.load(f"BERTopic files/BERTopic models/bertopic_model_2_{substance_or_class}") #CHANGED

    #run modified visualize documents function (see modification below)
    temp_fig = temp_topic_model2.modified_visualize_documents(docs=temp_docs2, topics=temp_keep_list1, custom_labels=True, title=title, color_map2=color_map2, keep_list3=temp_keep_list2)

    #save plotly file
    pyo.plot(temp_fig, filename=f'BERTopic files/BERTopic plots/BERTopic_plot_{substance_or_class}.html')




#DONE WITH MIN TOPIC SIZE 4 (AS WELL AS PSILOCYBIN MUSHROOMS ABOVE)
substances_classes_list = ["Deliriants", "Depressant sedatives", "Antidepressants antipsychotics", "LSD", "DMT"]

#second remove list per class / substance
remove_lists2 = {
    "Deliriants": deliriants_remove2,
    "Depressant sedatives": depressant_sedatives_remove2,
    "Antidepressants antipsychotics": antidepressants_antipsychotic_remove2,
    "LSD": lsd_remove2,
    "DMT": dmnt_remove2,
}

#color map: one 3-number color (presumably RGB) per seed-word type
#(spelled 'color' rather than 'colour' to stay consistent with the BERTopic API :D )
color_map1 = {'Neutral time perception words':[255, 255, 255], 'Fast time perception words':[195, 27, 52], "Slow time perception words":[48, 69, 186]}

#graph titles for the names whose slash was dropped so they could be used in file names
slash_titles = {"Depressant sedatives": "Depressant / sedatives", "Antidepressants antipsychotics": "Antidepressants / antipsychotics"}


for substance_or_class in substances_classes_list:
    print(substance_or_class)

    #remove lists: list1 is applied to the topics of model 1, list2 to the topics of model 2
    temp_remove_list1 = list(set(remove_df1[substance_or_class].tolist()))
    temp_remove_list2 = remove_lists2[substance_or_class]
    temp_remove_set2 = set(temp_remove_list2)

    #get docs for document info
    temp_docs = pd.read_pickle(f"BERTopic files/BERTopic docs/BERTopic_docs_{substance_or_class}.pkl")

    #get topic model 1 and the topic it assigned to every doc
    temp_topic_model0 = BERTopic.load(f"BERTopic files/BERTopic models/bertopic_model_1_{substance_or_class}")
    temp_document_info0 = temp_topic_model0.get_document_info(temp_docs)

    #update document info based on remove list1
    temp_document_info0 = temp_document_info0[~temp_document_info0["Topic"].isin(temp_remove_list1)].reset_index(drop=True)

    #docs that survive the first removal
    temp_docs2 = temp_document_info0.Document.tolist()

    #get topic model 2
    temp_topic_model = BERTopic.load(f"BERTopic files/BERTopic models/bertopic_model_2_{substance_or_class}")

    #labels - set BEFORE get_document_info (same order as the loop above) so the document info,
    #and with it the saved df2, carries the CustomName column that the quotes cell reads
    labels = temp_topic_model.generate_topic_labels(nr_words=3, topic_prefix=True, word_length=15, separator=" - ")
    temp_topic_model.set_topic_labels(labels)

    #get df
    temp_df2 = pd.read_pickle(f"BERTopic files/BERTopic docs/BERTopic_df_{substance_or_class}.pkl")
    #update df based on remove list1
    #NOTE(review): if the pickled df has no 'Topic' column yet, this filter matches nothing and keeps every row;
    #the (inner) merge with the document info of temp_docs2 below is then what drops the removed docs - confirm
    temp_df2 = temp_df2[~temp_df2.filter(items=['Topic']).isin(temp_remove_list1).any(axis=1)].reset_index(drop=True)

    #get document info
    temp_document_info = temp_topic_model.get_document_info(temp_docs2)

    #merge df and document info, so doc, ntp-ftp-stp class and topic in one df for color map
    temp_df2 = pd.merge(temp_df2, temp_document_info, left_on='docs', right_on='Document')

    #update df based on remove list2; index renumbered 0..n-1
    temp_df2 = temp_df2[~temp_df2["Topic"].isin(temp_remove_list2)].reset_index(drop=True)

    #save updated df2 for future reference
    temp_df2.to_pickle(f"BERTopic files/BERTopic docs/BERTopic_df2_{substance_or_class}.pkl")

    #get docs3 from this updated df
    temp_docs3 = temp_df2.Document.tolist()

    #keep list1 - all topics once (ids start at -1, hence the shifted range)
    temp_nr_topics = len(temp_topic_model.get_topic_info())
    temp_keep_list1 = [topic for topic in range(-1, temp_nr_topics - 1) if topic not in temp_remove_set2]
    #get keep list2 - all topics of docs remaining indexed by docs
    temp_keep_list2 = [topic for topic in temp_topic_model.topics_ if topic not in temp_remove_set2]

    #create color map to assign documents colour of their seed word
    #(the loop variable is not called `type`, so the builtin stays usable after this cell has run)
    temp_keep_set1 = set(temp_keep_list1)
    color_map2 = {}
    for doc, topic, seed_type in zip(temp_df2["Document"], temp_df2["Topic"], temp_df2["type"]):
        if topic in temp_keep_set1:
            color_map2[doc] = color_map1[seed_type]

    #put slash back for title of graph
    title = slash_titles.get(substance_or_class, substance_or_class)

    #plotting is switched off for these classes / substances in this cell; only the df2 pickles are written
    #run modified visualize documents function (see modification below)
    #temp_fig = temp_topic_model.modified_visualize_documents(docs=temp_docs3, topics=temp_keep_list1, custom_labels=True, title=title, color_map2=color_map2, keep_list3=temp_keep_list2)

    #save plotly file
    #pyo.plot(temp_fig, filename=f'BERTopic files/BERTopic plots/BERTopic_plot_{substance_or_class}.html')


Erowid Quotes

In [None]:
#CSV file for Erowid Quotes finder tool

#for urls with no placeholders - find quote on the Erowid page
def prepare_url(s, url):
    """Return *url* extended with a text-fragment directive that points at quote *s*.

    The quote is tidied first (no space before "." and ",", a lone " i " becomes
    " I "). The directive then has the form ``#:~:text=<first three words>,<last
    three words>``, i.e. a range from the start snippet to the end snippet.

    Every snippet is percent-encoded. Commas, dashes and ampersands inside a
    snippet would otherwise be read as part of the directive syntax (for example
    a comma after the first word would split the start snippet in two).

    Args:
        s: Quote text, words separated by whitespace.
        url: Address of the page that contains the quote.

    Returns:
        The url with the directive appended. A quote of fewer than six words is
        passed as one exact-match snippet, because its first and last three
        words would overlap. A quote without any words returns *url* unchanged.
    """
    from urllib.parse import quote

    def encode(snippet):
        # quote() leaves "-" untouched, but inside a text directive it marks a prefix/suffix
        return quote(snippet, safe="").replace("-", "%2D")

    s = s.replace(" .", ".").replace(" ,", ",").replace(" i ", " I ")
    words = s.split()  # split the string into a list of words
    if not words:
        return url
    if len(words) < 6:
        # start and end snippets would overlap, so match the whole quote instead
        return url + "#:~:text=" + encode(" ".join(words))
    first_three_words = " ".join(words[:3])
    last_three_words = " ".join(words[-3:])
    return url + "#:~:text=" + encode(first_three_words) + "," + encode(last_three_words)



from collections import Counter

#names are spelled exactly as in the file names of the saved BERTopic_df2 pickles, so the files are also found
#on a case-sensitive file system (the variable keeps its old name; substances are compared in lowercase below)
substances_classes_list_lowercase_subst = ["Serotonergic psychedelics", "Dissociative psychedelics", "Entactogens", "Deliriants", "Depressant sedatives", "Stimulants", "Antidepressants antipsychotics", "LSD", "Psilocybin mushrooms", "DMT", "MDMA", "Cannabis spp", "Salvia divinorum"]

#for substances "LSD", "Psilocybin mushrooms", "MDMA", "Cannabis spp", "Salvia divinorum", "DMT" use own topics (not class topics)
large_substances = ["lsd", "dmt", "psilocybin mushrooms", "mdma", "cannabis spp", "salvia divinorum"]

#at most this many quotes are kept per (topic, substance) combination
max_quotes_per_combo = 10

#columns of the quotes table, and the columns read from every saved df2
quote_columns = ["Topic Nr", "Topic", "Substance", "Class", "Document"]
needed_columns = ["Topic", "substance", "classes", "url", "Document", "Probability", "CustomName"]

#set custom labels
#NOTE(review): temp_topic_model is whatever model an earlier cell left behind; nothing below reads it
#(topic labels come from the CustomName column of the saved df2) - confirm whether this is still needed
labels = temp_topic_model.generate_topic_labels(nr_words=3, topic_prefix=True, word_length=15, separator=" - ")
temp_topic_model.set_topic_labels(labels)

#rows are collected in a list and turned into df4 once at the end
quote_rows = []

for substance_or_class in substances_classes_list_lowercase_subst:
    print(substance_or_class)

    #get temp df
    temp_df2 = pd.read_pickle(f"BERTopic files/BERTopic docs/BERTopic_df2_{substance_or_class}.pkl")

    #for a class: leave out the large substances, they are covered by their own df2
    if substance_or_class.lower() not in large_substances:
        temp_df2 = temp_df2[~temp_df2["substance"].isin(large_substances)]

    #sort df2 by highest probabilities
    temp_df2 = temp_df2.sort_values(by=["Probability"], ascending=False).reset_index(drop=True)

    #number of rows seen so far per (topic, substance); rows arrive sorted by probability,
    #so the first ones seen are the ones that are kept
    seen_per_combo = Counter()

    for topic_nr, substance, class_, url, doc, prob, custom_name in temp_df2[needed_columns].itertuples(index=False, name=None):
        #outlier rows (topic -1) are skipped
        if topic_nr == -1:
            continue
        combo = (topic_nr, substance)
        seen_per_combo[combo] += 1
        if seen_per_combo[combo] > max_quotes_per_combo:
            continue

        prob = round(prob, 3)
        #pad with leading zeros up to three digits (custom names start with the topic number)
        topic = "0" * max(0, 3 - len(str(topic_nr))) + custom_name

        #change url so it leads to the quote on the Erowid page
        if any(substring in doc for substring in ["miranda", "megan", "matt", "alexa"]): #names not removed by Spacy in pre-processing
            #NOTE(review): the row is still added below, with the bare doc text and no URL;
            #if such quotes should be left out completely this needs a `continue` - confirm
            pass
        elif any(substring in doc for substring in ["PERSON", "ORG", "GPE", "LOC"]):
            doc = "..." + doc + f"...- TBS:{prob}  (NO URL)"
        else:
            url = prepare_url(doc, url)
            doc = "..." + doc + f"... - TBS:{prob}  (<a href={url}>URL</a>)"

        quote_rows.append([topic_nr, topic, substance, class_, doc])

#create
df4 = pd.DataFrame(quote_rows, columns=quote_columns)

#save
df4.to_csv("Representative Quotes Per Topic-Substance.csv")

Prepare classes data

In [None]:
classes_list = ["Serotonergic psychedelics", "Dissociative psychedelics", "Entactogens", "Deliriants", "Depressant / sedatives", "Stimulants", "Oneirogens", "Antidepressants / antipsychotics", "Other"]

#class names that contain a slash get a slash-free spelling for use in file names
slash_free_names = {
    "Depressant / sedatives": "Depressant sedatives",
    "Antidepressants / antipsychotics": "Antidepressants antipsychotics",
}

#save df and docs for each class separately
for class_ in classes_list:
    #rows are selected with the original class name, files are named with the slash-free one
    temp_df = df[df.classes == class_]
    class_ = slash_free_names.get(class_, class_)
    temp_docs = list(temp_df.docs)
    temp_df.to_pickle(f"BERTopic files/BERTopic docs/BERTopic_df_{class_}.pkl")
    with open(f"BERTopic files/BERTopic docs/BERTopic_docs_{class_}.pkl", "wb") as f:
        pickle.dump(temp_docs, f)


#Capitalized for graph title aesthetics (rows are matched on the lowercase name)
#NOTE(review): "Cannabis spp." keeps its trailing dot in the file names written here, while other cells
#use "Cannabis spp" without the dot - confirm which spelling the data and the files use
substances_list = ["LSD", "Psilocybin mushrooms", "MDMA", "Cannabis spp.", "Salvia divinorum", "DMT"]

#save df and docs for each large substance separately
for substance in substances_list:
    temp_df = df[df.substance == substance.lower()]
    temp_docs = list(temp_df.docs)
    temp_df.to_pickle(f"BERTopic files/BERTopic docs/BERTopic_df_{substance}.pkl")
    with open(f"BERTopic files/BERTopic docs/BERTopic_docs_{substance}.pkl", "wb") as f:
        pickle.dump(temp_docs, f)




#create topic model for class / substance
vectorizer_model = CountVectorizer(stop_words="english")

#settings shared by every model fitted below
topic_model_settings = dict(
    seed_topic_list=seed_topic_list,
    n_gram_range=(1,3),
    vectorizer_model=vectorizer_model,
    calculate_probabilities=False,
)

#file-name spelling of the class names that contain a slash
slash_free_class_names = {
    "Depressant / sedatives": "Depressant sedatives",
    "Antidepressants / antipsychotics": "Antidepressants antipsychotics",
}

#classes: one model per class, fitted on the saved docs of that class and saved as bertopic_model_1_<class>
for class_ in classes_list:
    class_ = slash_free_class_names.get(class_, class_)

    temp_docs = pd.read_pickle(f"BERTopic files/BERTopic docs/BERTopic_docs_{class_}.pkl")
    temp_topic_model = BERTopic(**topic_model_settings)
    topics, probs = temp_topic_model.fit_transform(temp_docs)
    temp_topic_model.save(f"BERTopic files/BERTopic models/bertopic_model_1_{class_}")


#substances: same steps, one model per large substance
for substance in substances_list:
    temp_docs = pd.read_pickle(f"BERTopic files/BERTopic docs/BERTopic_docs_{substance}.pkl")
    temp_topic_model = BERTopic(**topic_model_settings)
    topics, probs = temp_topic_model.fit_transform(temp_docs)
    temp_topic_model.save(f"BERTopic files/BERTopic models/bertopic_model_1_{substance}")