In [2]:
import os
import re
import json
import joblib
import faiss
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from dotenv import find_dotenv, load_dotenv

# Locate a .env file and load its variables into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

### Preparing video metadata

In [4]:
def find_episode_name(s):
    """Extract the episode title embedded in a summary file name.

    The title is whatever sits between the literal ``Episode`` and the
    following ``, Segment`` marker, with surrounding whitespace removed.
    Returns ``None`` when ``s`` does not contain that pattern.
    """
    match = re.search(r'Episode(.*?),\s*Segment', s)
    return match.group(1).strip() if match else None
    
def find_segment_name(s):
    """Extract the segment title embedded in a summary file name.

    The title is the text between the literal ``Segment`` and the
    ``(HH-MM-SS HH-MM-SS)`` timestamp pair, with surrounding whitespace
    removed. Returns ``None`` when ``s`` does not contain that pattern.
    """
    match = re.search(r'Segment(.*?)\(\d{2}-\d{2}-\d{2}\s*\d{2}-\d{2}-\d{2}\)', s)
    return match.group(1).strip() if match else None
    
def get_start_timestamp_in_s(s):
    """Return the segment start time, in seconds, parsed from a file name.

    The file name is expected to contain ``(HH-MM-SS HH-MM-SS)``; only the
    first timestamp of the pair is read. Returns ``None`` when no such
    timestamp pair is present.
    """
    match = re.search(r'\((\d{2})-(\d{2})-(\d{2})\s*\d{2}-\d{2}-\d{2}\)', s)
    if match is None:
        return None
    # Only the first timestamp has capture groups, so groups() is (HH, MM, SS).
    hh, mm, ss = (int(part) for part in match.groups())
    return hh * 3600 + mm * 60 + ss
    
def get_yt_url(video_id, start_timestamp):
    """Build a YouTube watch URL for ``video_id``.

    Args:
        video_id: YouTube video identifier.
        start_timestamp: Offset in seconds at which playback should start, or
            ``None`` when no timestamp could be parsed.

    Returns:
        The watch URL. The ``t`` parameter is only appended when a timestamp
        is available, so a missing timestamp no longer yields ``&t=Nones``.
    """
    base_url = f"https://www.youtube.com/watch?v={video_id}"
    if start_timestamp is None:
        return base_url
    return f"{base_url}&t={start_timestamp}s"

def create_summary_docs_and_metadata(summary_directory, video_metadata_path=None):
    """Build a metadata table and LangChain documents from summary files.

    Every non-hidden entry of ``summary_directory`` is read as one segment
    summary. The episode name, segment name and start timestamp are parsed
    from the file name; the episode name is then matched against the ``title``
    column of the video metadata CSV (after replacing ``/`` with a space and
    ``:`` with ``-`` in the CSV titles) to look up the ``videoId``.

    Args:
        summary_directory: Directory holding one summary text file per segment.
        video_metadata_path: CSV with ``title`` and ``videoId`` columns.
            Defaults to ``data/video_metadata.csv``.

    Returns:
        ``(metadata, docs)`` where ``metadata`` is a DataFrame with columns
        ``episode_name``, ``segment_name``, ``summary`` and ``url``, and
        ``docs`` is the list of ``Document`` objects in the same row order.

    Raises:
        IndexError: If a file's episode name matches no title in the CSV
            (this includes file names the episode pattern cannot parse).
    """
    if video_metadata_path is None:
        video_metadata_path = os.path.join('data', 'video_metadata.csv')
    video_data = pd.read_csv(video_metadata_path)
    # Normalise titles the same way the summary file names were written.
    video_data['title'] = (video_data['title']
                           .str.replace("/", " ", regex=False)
                           .str.replace(":", "-", regex=False))

    data = []
    docs = []
    # os.listdir order is arbitrary; sorting keeps row order (and therefore the
    # alignment with embeddings computed from `docs`) reproducible across runs.
    for filename in sorted(os.listdir(summary_directory)):
        if filename.startswith('.'):
            continue

        episode_name = find_episode_name(filename)
        segment_name = find_segment_name(filename)

        matches = video_data.loc[video_data['title'] == episode_name, "videoId"]
        if matches.empty:
            # Same exception type as the former `[0]` lookup, but with context.
            raise IndexError(
                f"No title in {video_metadata_path!r} matches episode "
                f"{episode_name!r} (summary file {filename!r})"
            )
        video_id = matches.iloc[0]

        start_timestamp_s = get_start_timestamp_in_s(filename)
        url = get_yt_url(video_id, start_timestamp_s)

        full_path = os.path.join(summary_directory, filename)
        # Explicit encoding so the result does not depend on the platform default.
        with open(full_path, encoding="utf-8") as f:
            text = f.read()

        docs.append(Document(page_content=text, metadata={"episode_name": episode_name,
                                                          "segment_name": segment_name,
                                                          "url": url}))
        data.append([episode_name, segment_name, text, url])

    metadata = pd.DataFrame(data, columns=['episode_name', 'segment_name', 'summary', "url"])
    return metadata, docs

In [5]:
# Build the summary metadata table and the matching Document list from data/summaries.
summary_dir = os.path.join("data", "summaries")
df_metadata, docs = create_summary_docs_and_metadata(summary_dir)

In [None]:
# Persist the summary metadata under its own file name. Writing it to
# data/video_metadata.csv would overwrite the title/videoId table that
# create_summary_docs_and_metadata() reads, so the notebook could not be re-run.
df_metadata.to_csv(os.path.join('data', 'summary_metadata.csv'), index=False)

In [None]:
# Preview a few documents instead of rendering the entire list in the cell output.
docs[:5]

### OpenAI embeddings

In [None]:
# Embed every summary text with the OpenAI embeddings client.
# The client gets its own name so it is not confused with the SBERT
# `embeddings` array that a later cell assigns.
openai_embedder = OpenAIEmbeddings()
vectors = openai_embedder.embed_documents(texts=[doc.page_content for doc in docs])

In [None]:
# Cache the OpenAI embedding vectors on disk (data/embeddings must already exist).
joblib.dump(vectors, os.path.join("data", "embeddings", "summary_embeddings.joblib"))

In [None]:
# Convert the embedding vectors to a NumPy array; used below for KMeans and t-SNE.
vectors_array = np.array(vectors)

### SBERT embeddings

In [11]:
# Load the pretrained `all-MiniLM-L6-v2` sentence-transformer used below to embed the summaries.
# NOTE(review): this import lives mid-notebook; consider moving it to the top import cell.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the summaries table from a previously exported CSV.
# NOTE(review): this "_final" file is not written anywhere in this notebook (the last cell
# writes "summary_kmeans_with_chatgpt_and_keywords.csv"), so it must exist beforehand — confirm its origin.
df = pd.read_csv(os.path.join("data", "summary_kmeans_with_chatgpt_and_keywords_final.csv"))

In [8]:
# Pull the summary texts straight from the column; iterating rows with
# iterrows() builds a Series per row just to read one field.
summaries = df["summary"].tolist()

In [13]:
# Encode every summary with the sentence-transformer loaded above.
embeddings = model.encode(summaries)

In [15]:
# Cache the SBERT embeddings on disk (data/embeddings must already exist).
joblib.dump(embeddings, os.path.join("data", "embeddings", "summary_embeddings_sbert.joblib"))

['data/embeddings/summary_embeddings_sbert.joblib']

Add the SBERT embedding vectors to a FAISS index

In [17]:
# Sanity check: the second dimension is the vector size the FAISS index below must be built with.
embeddings.shape

(2831, 384)

In [19]:
# Build a flat inner-product FAISS index over the SBERT vectors and save it.
# The dimensionality is read from the array instead of being hard-coded, so the
# cell keeps working if the sentence-transformer model is swapped.
N_DIM = embeddings.shape[1]
index = faiss.IndexFlatIP(N_DIM)
# FAISS expects a C-contiguous float32 matrix; this is a no-op when the
# array already has that layout.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
faiss.write_index(index, os.path.join("data", "embeddings", "faiss_summary_index_sbert.faiss"))

### KMeans on embeddings

In [None]:
num_clusters = 30

# Cluster the summary embeddings held in `vectors_array` with a fixed seed.
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, random_state=42)
kmeans.fit(vectors_array)

# Project the same vectors down to two dimensions with t-SNE for plotting.
tsne = TSNE(n_components=2, random_state=42)
reduced_data_tsne = tsne.fit_transform(vectors_array)

In [None]:
# Scatter the 2-D t-SNE projection, one colour per KMeans cluster, and save the figure.
df_plot = pd.DataFrame(reduced_data_tsne)
df_plot['label'] = kmeans.labels_
# plt.cm.get_cmap was removed from recent Matplotlib; pyplot.get_cmap accepts the same arguments.
cmap = plt.get_cmap('gist_ncar', num_clusters)
colors = [cmap(i) for i in range(num_clusters)]

fig, ax = plt.subplots()
# The loop variable is deliberately not called `df`: that would rebind the
# DataFrame loaded in an earlier cell to the last cluster's rows.
for cluster_id, cluster_points in df_plot.groupby("label"):
    # `color=` (not `c=`) so a single RGBA tuple is never read as per-point values.
    ax.scatter(cluster_points.iloc[:, 0], cluster_points.iloc[:, 1],
               color=colors[cluster_id], label=cluster_id, alpha=0.7)

ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_title('Summary Embeddings Clustered')
fig.savefig(os.path.join("images", "summary_kmeans_tsne_results.png"))

In [None]:
# Attach each summary's KMeans cluster id to the metadata table.
# NOTE(review): relies on `df_metadata` rows being in the same order as `vectors_array`;
# both derive from the same pass over the summary files.
df_metadata['k_means_label'] = kmeans.labels_

In [None]:
# Order the summaries by cluster id and renumber the rows from zero.
df_sorted = df_metadata.sort_values("k_means_label").reset_index(drop=True)

In [None]:
# Save the cluster-sorted summaries table.
df_sorted.to_csv(os.path.join('data', 'summaries_kmeans.csv'), index=False)

### Sample segments for giving ontology to ChatGPT

In [None]:
# Keep only the rows whose segment name could be parsed from the file name.
df_na_dropped = df_sorted[df_sorted["segment_name"].notna()]

In [None]:
# Export one CSV of sampled segment names per bucket of five consecutive cluster ids.
SAMPLE_SIZE = 200
for i in range(0, num_clusters, 5):
    in_bucket = df_na_dropped["k_means_label"].isin(list(range(i, i + 5)))
    bucket = df_na_dropped.loc[in_bucket, ["k_means_label", "segment_name"]]
    # Seeded so the exported samples are reproducible; capped at the bucket size
    # because sampling more rows than exist raises ValueError.
    sampled = bucket.sample(n=min(SAMPLE_SIZE, len(bucket)), random_state=42)
    # Written under data/ like every other artefact in this notebook (was ../data/).
    sampled.to_csv(os.path.join("data", f"label_sample_{i}_{i+5}.csv"), index=False)

### Mapping from ChatGPT

In [None]:
# Category name -> KMeans cluster ids assigned to that category.
mapping = {
    "Mental Health and Emotional Resilience": [0, 2, 18, 20],
    "Physical Performance and Recovery": [1, 4, 8, 15, 22, 29],
    "Sleep, Circadian Rhythms and Light": [5, 7, 26],
    "Meditation, Focus, and Cognitive Training": [3, 11, 17, 23, 27],
    "Nutrition, Supplements, and Metabolic Health": [9, 16, 24, 25, 28],
    "Gut Health and Microbiome": [6],
    "Relationships, Social Dynamics, and Personal Development": [10, 21],
    "Neuroscience, Biohacking, and Health Monitoring": [12, 14, 19],
    "Taste, Smell, and Perception": [13],
}

# Invert it: cluster id -> category name.
reverse_mapping = dict(
    (cluster_id, category)
    for category, cluster_ids in mapping.items()
    for cluster_id in cluster_ids
)

# Translate every cluster id into its category name.
df_sorted['chatgpt_labels'] = df_sorted['k_means_label'].replace(reverse_mapping)

In [None]:
# Save the summaries table together with the category labels.
df_sorted.to_csv(os.path.join('data', 'summary_kmeans_with_chatgpt_labels.csv'), index=False)

#### Category keywords

In [None]:
# Candidate keywords per category; the keyword-picking prompt is given the
# list that belongs to each summary's category.
cat_keywords = {
    "Mental Health and Emotional Resilience": [
        "fear", "aggression", "anxiety", "grief", "trauma",
        "memory", "motivation", "mindfulness", "gratitude", "brain regions",
        "stress response", "cortisol", "inflammation", "relaxation", "coping mechanisms",
        "stress inoculation", "stress threshold", "emotions", "resilience", "OCD",
        "bipolar disorder", "depression", "ADHD", "mental health", "CBT",
        "exposure therapy", "SSRIs", "symptoms", "mania", "relapse",
        "triggers", "cortico-striatal-thalamic loop", "hyper-focus", "neural circuits", "suicide",
        "working memory", "happiness", "creativity", "self-talk", "ambition",
        "self-doubt", "friendship", "narcissism", "loneliness", "visualization",
        "play", "power dynamics",
    ],
    "Physical Performance and Recovery": [
        "ATP", "Creatine", "Magnesium", "anaerobic capacity", "antagonistic muscles",
        "blood flow", "body temperature", "breathwork", "breathing mechanics", "caffeine",
        "cardiovascular adaptations", "cold exposure", "cooling", "dehydration", "dynamic movements",
        "electrolytes", "endurance", "endurance training", "energy", "exercise",
        "exercise order", "fat loss", "fitness metric", "flexibility", "gym",
        "heat exposure", "heart rate", "heart rate variability", "heating", "high-intensity",
        "HIIT", "hydration", "hypertrophy", "hyperventilation", "infrequent training",
        "kidneys", "lactate", "low-carbohydrate diets", "meditation", "mental resilience",
        "modifiable variables", "muscle", "muscle memory", "muscle physiology", "nasal breathing",
        "nutrition", "oxygen utilization", "osteopenia", "osteoporosis", "plasticity",
        "power vs. strength training", "pushing vs. pulling exercises", "range of motion",
        "recovery", "resistance training",
        "rest periods", "routine", "sauna", "salt intake", "sets",
        "sleep apnea", "stress reduction", "strength", "strength test", "stretching",
        "training", "training frequency", "training program", "upper motor neurons", "urine regulation",
        "warming up", "water", "water filters", "water filtration", "workout",
    ],
    "Sleep, Circadian Rhythms and Light": [
        "sleep", "circadian rhythm", "melatonin", "REM", "deep sleep",
        "naps", "shift workers", "hormones", "light", "jetlag", "vision",
        "sunlight", "blue light", "infrared", "eye", "phototherapy",
        "sun exposure", "circadian entrainment", "mood", "fasting",
        "meal timing", "protein", "autophagy", "TRE", "cardiovascular health",
        "blood glucose", "firefighters", "morning protein", "gut health",
        "microbiota",
    ],
    "Meditation, Focus, and Cognitive Training": [
        "meditation", "focus", "goal setting", "hypnosis", "visual attention",
        "interoception", "time perception", "cognitive performance", "self-directed",
        "mental training", "injury", "travel", "layoffs", "exercise", "memory",
        "procedural memory", "visualization", "balance", "movement diversity",
        "squat challenge", "stop-signal task", "acetylcholine", "attention",
        "limb range of motion", "automaticity", "habit formation", "ultradian training",
        "cardiovascular exercise", "neurogenesis", "binaural frequency", "work",
        "awareness", "gender", "age", "smooth pursuit", "competition", "dynamic movement",
        "mindset", "Hebbian learning", "NMDA receptors", "skill development", "nervous system",
        "brain", "language", "neural control", "neural circuits", "sensory perception", "pain",
        "brain machine interface", "neural repair", "encoding", "depth perception", "caffeine",
        "adrenaline", "alertness", "dopamine", "epinephrine", "neurotransmitters",
        "phenylethylamine", "modafinil", "huperzine A", "lactate", "cognitive enhancement",
        "sense of self", "brain-body contract", "cold exposure", "sauna", "habits",
        "deliberate practice", "smell", "social media mindset", "Andrew Tate",
        "Masculinity", "OnTime Health App", "photographic memory", "bizarre addiction",
        "momentous supplements", "romantic love", "connecting to Dr. Walker",
        "Dr. Matthew Johnson", "Jocko Willink", "fitness testing", "grief & bereavement",
        "creativity & ideas", "UFC performance institute", "summary of protocols",
        "blood & oxygen for vision",
    ],
    "Nutrition, Supplements, and Metabolic Health": [
        "supplements", "omega-3", "vitamins", "minerals", "creatine", "nutrition",
        "amino acids", "antioxidants", "protein", "hormones", "puberty", "fertility",
        "menstrual cycles", "testosterone", "estrogen", "sperm", "sex chromosomes",
        "germ cells", "GnRH", "melatonin", "leptin", "follicular", "AMH",
        "oral contraceptives", "ovarian cancer", "IUDs", "sperm production", "vasectomy",
        "anorexia nervosa", "nucleus accumbens", "sugar cravings", "gut neurons",
        "obesity", "glucose metabolism", "glycemic index", "yerba mate",
        "artificial sweeteners", "fiber", "gastric emptying time", "insulin",
        "appetite suppression", "gut-brain axis", "DHT", "hair loss", "baldness",
        "prostate health", "menopause", "aromatase inhibitors", "calcium D-glucarate",
        "DIM", "SHBG", "testosterone therapy", "HCG", "marijuana", "nicotine",
        "cycling", "pelvic floor", "alcohol", "fat", "fat fasting", "blood glucose",
        "schizophrenia", "depression", "ketogenic diet", "fasted vs. fed states",
        "focus", "low carbohydrate diets", "cholesterol", "saturated fat",
        "LDL", "HDL", "Apolipoprotein B", "fasting", "metformin", "rapamycin",
        "GABA", "highly processed foods", "ketones", "mental health benefits",
        "psychiatric medications", "diet adherence", "intermittent fasting",
        "metabolic flexibility", "blood glucose monitoring", "mTOR", "ketosis",
        "brain energy", "spiking glucose during ketosis", "PCOS", "hypomania",
        "Serotonin",
    ],
    "Gut Health and Microbiome": [
        "microbiome", "gut health", "inflammation", "fermented foods", "probiotics",
        "fiber", "constipation", "microbiota", "non-alcoholic fatty liver",
    ],
    "Relationships, Social Dynamics, and Personal Development": [
        "romance", "love", "desire", "synthesis", "attachment styles", "mate choice",
        "infidelity", "long-term mates", "short-term cheating", "social connection",
        "jealousy", "mate value discrepancies", "deep social connection", "narcissism",
        "machiavellianism", "psychopathy", "relationship stability", "romantic attachment",
        "social isolation", "child-parent bonding", "friendship", "breakups",
        "neural mechanisms", "serotonin", "psilocybin", "ketamine", "MDMA", "PTSD",
        "depression", "ayahuasca", "meditation", "sleep", "cognitive flexibility",
        "hallucinations", "neuropharmacology", "mental health therapy",
        "electroconvulsive therapy",
    ],
    "Neuroscience, Biohacking, and Health Monitoring": [
        "bloodwork", "biomarkers", "lifespan", "disease", "Sinclair Test", "smartphones",
        "dopamine circuits", "dopamine neurons", "co-release glutamate", "neurochemical toolkit",
        "individual goals", "happiness", "impulsivity", "mood", "creativity", "addiction",
        "recovery", "binding behaviors", "dopamine stacking", "intrinsic motivation",
        "pain-pleasure balance", "amphetamine", "cocaine", "detrimental rewiring",
        "addiction recovery", "L-Tyrosine", "Ritalin", "Adderall", "Modafinil",
        "Armodafinil", "smart drugs", "caffeine", "dangers", "neurogenesis",
        "neuroplasticity", "binaural beats", "focus", "meditation", "procrastination",
        "social homeostasis", "pleasure", "pain", "trauma", "arousal", "serotonin",
        "time underestimation", "decreased frame rate", "cannabis", "alcohol", "nicotine",
        "CBD", "THC", "vaping", "brain health", "dopamine", "psychoactive compounds",
        "cannabinoids", "receptors", "nervous system function", "smoking", "tobacco",
        "drug risk", "dependence", "inebriation", "neurodegeneration",
    ],
    "Taste, Smell, and Perception": [
        "eating more plants", "eating more meat", "cravings", "desire", "pheromones",
        "Coolidge Effect", "taste perception", "smell", "odorant similarity", "sniffing",
        "romantic partner", "food", "pleasure", "reproduction", "dopamine", "serotonin",
        "oxytocin", "acquired tastes", "conditioned taste aversion", "medical diagnostic",
        "olfaction digitization", "fat sensing", "olfaction circuits", "social chemo-signals",
        "fear",
    ],
}


In [None]:
# Inspect the labelled summaries table.
df_sorted

### Pick Keywords for each document

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain import PromptTemplate, LLMChain

In [None]:
# Prompt text for keyword picking; `{text}` and `{keywords}` are the template's two input variables.
prompt_template = """Pick right keywords for the [SUMMARY]. Select from the [KEYWORD LIST].


- [SUMMARY]: {text}

- [KEYWORD LIST]: {keywords}

- [PICKED KEYWORDS] (selected from [KEYWORD LIST]):"""

In [None]:
# Check the available columns; the keyword loop reads `summary` and `chatgpt_labels`.
df_sorted.columns

In [None]:
def _parse_keywords(raw):
    """Split a comma-separated LLM reply into a clean list of keywords."""
    # Only a sentence-ending period is dropped: periods inside keywords
    # (e.g. "power vs. strength training") must survive.
    cleaned = raw.strip().rstrip('.')
    return [kw.strip() for kw in cleaned.split(',') if kw.strip()]


model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)
BULLET_POINT_PROMPT = PromptTemplate(template=prompt_template,
                                     input_variables=["text", "keywords"])
chain = LLMChain(llm=llm,
                 prompt=BULLET_POINT_PROMPT)

# Keywords picked by an earlier (interrupted) run are loaded from one file and
# progress is checkpointed to a second one, so the earlier file is never clobbered.
keyword_list_path = os.path.join("data", "keyword_list.joblib")
checkpoint_path = os.path.join("data", "keyword_list_2.joblib")
keyword_list = joblib.load(keyword_list_path) if os.path.exists(keyword_list_path) else []

# Resume right after the last row that already has keywords. Deriving the offset
# from the loaded list keeps `keyword_list` aligned with `df_sorted` row-for-row,
# which the final column assignment requires.
start = len(keyword_list)
for i, el in df_sorted.iloc[start:, :].iterrows():
    summary = el['summary']
    # The model is only offered the keyword list of the summary's own category.
    reply = chain.run(text=summary, keywords=str(cat_keywords[el['chatgpt_labels']]))
    keyword_list.append(_parse_keywords(reply))

    if i % 100 == 0:
        print(i)
        joblib.dump(keyword_list, checkpoint_path)

# Final checkpoint so rows processed after the last multiple of 100 are not lost.
joblib.dump(keyword_list, checkpoint_path)

df_sorted["keywords"] = keyword_list

In [None]:
# Save the summaries table with cluster ids, category labels and picked keywords.
df_sorted.to_csv(os.path.join("data", "summary_kmeans_with_chatgpt_and_keywords.csv"), index=False)