## **Scraping data for the Lex Fridman Podcast**

### Install the required libraries

In [224]:
pip install youtube_transcript_api

Note: you may need to restart the kernel to use updated packages.


In [225]:
# Import necessary libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import json
from youtube_transcript_api import YouTubeTranscriptApi
from collections import Counter
import time
import datetime
import re

### Get video data using BeautifulSoup and YouTube's Data API v3

In [226]:
# Record start time for performance measurement
start = time.time()

# Create empty lists to store video data
videos_id = []
videos_title = []
videos_description = []
videos_upload_date = []
videos_duration = []
videos_views = []
videos_likes = []
videos_comments = []
videos_tags = []
top_ten_words = []
videos_thumbnail = []
videos_favorite_count = []
videos_region_restriction = []
videos_captions = []
videos_captions_text = []


# Fetch the Lex Fridman Podcast webpage to get only the episodes' links to use in the youtube API endpoint 
# as there are other links in the page that are not episodes videos
bs_response = requests.get('https://lexfridman.com/podcast/')

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(bs_response.text, 'html.parser')

# Extract all YouTube video URLs from the page
youtube_urls = []
for urls in soup.select('div.vid-materials'):
  url = urls
  youtube_urls.append(url.a.get('href'))

# Process YouTube URLs in chunks of 50 for API efficiency (youtube api doesn't allow for more than 50 API calls in one API endpoint)
for i in range(0, len(youtube_urls), 50):
  youtube_urls_chunk = youtube_urls[i:i+50]
  # Extract video IDs from the URLs for the YouTube API
  video_ids = [url_ids.split('v=')[-1] for url_ids in youtube_urls_chunk]

  # Construct YouTube API endpoint URL
  api_endpoint = f"https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id={','.join(video_ids)}&key=AIzaSyA087wwY7-mxbqeER7UXqbm8nJbHPQpm5M"
  
  # Make youtube API request
  yt_response = requests.get(api_endpoint)

  # Parse the JSON response
  data = json.loads(yt_response.text)

  # Iterate over the data json file to get the required data
  for item in data['items']:
    video_id = item['id']
    video_title = item['snippet']['title']
    video_description = item['snippet']['description']
    video_upload_date = item['snippet']['publishedAt']
    video_duration = item['contentDetails']['duration']
    video_views = item['statistics']['viewCount']
    video_likes = item['statistics']['likeCount']
    video_comments = item['statistics']['commentCount']
    if 'tags' in item['snippet']:
      video_tags = item['snippet']['tags']
    else:
      video_tags = None
    video_thumbnail = item['snippet']['thumbnails']['high']['url']
    video_favorite_count = item['statistics']['favoriteCount']
    if 'regionRestriction' in item['contentDetails']:
      video_region_restriction = item['contentDetails']['regionRestriction']
    else:
      video_region_restriction = None
    video_caption = item['contentDetails']['caption']
    
    # Append data to respective lists
    videos_id.append(video_id)
    videos_title.append(video_title)
    videos_description.append(video_description)
    videos_upload_date.append(video_upload_date)
    videos_duration.append(video_duration)
    videos_views.append(video_views)
    videos_likes.append(video_likes)
    videos_comments.append(video_comments)
    videos_tags.append(video_tags)
    videos_thumbnail.append(video_thumbnail)
    videos_favorite_count.append(video_favorite_count)
    videos_region_restriction.append(video_region_restriction)
    videos_captions.append(video_caption)

# Calculate and print execution time
end = time.time()
print(f'This scraping operation took {round(end - start)} seconds')

This scraping operation took 4 seconds


In [227]:
# Create a Pandas DataFrame from the collected video data
column_headers = ['id', 'yt_title', 'description', 'upload_date', 'duration', 'views', 'likes', 'comments_count', 'tags', 'thumbnail_url', 'favorite_count', 'region_restriction', 'captions_availability']
lex_df0 = pd.DataFrame(list(zip(videos_id, videos_title, videos_description, videos_upload_date, videos_duration, videos_views, videos_likes, videos_comments, videos_tags, videos_thumbnail, videos_favorite_count, videos_region_restriction, videos_captions)), columns = column_headers)

### Get the videos transcripts using **youtube_transcript_api**

In [228]:
# Record start time for performance measurement
start = time.time()

# Fetch and process transcripts (captions) for each video
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

# Create an new column to store video transcripts
videos_captions_text = []

# Loop through each video ID in the DataFrame to get its transcript
for id in lex_df0['id']:
    try:
        # Check if captions are available for the current video
        if lex_df0[lex_df0['id'] == id]['captions_availability'].values[0] == 'true':
            try:
                # Get the transcript for the video using the YouTube Transcript API
                transcript = YouTubeTranscriptApi.get_transcript(id)
                # Extract the text from each part of the transcript and join it into a single string
                text = ' '.join([d['text'] for d in transcript])
                # Append the combined transcript text to the list
                videos_captions_text.append(text)
            except (TranscriptsDisabled, NoTranscriptFound, Exception) as e:
                # Handle exceptions if the transcript is disabled, not found, or any other error occurs
                print(f"no transcript for video {id}: {str(e)}")
                videos_captions_text.append(None)
        else:
            # Append None if captions are not available for the video
            videos_captions_text.append(None)
    except Exception as e:
        # Handle any unexpected errors during transcript retrieval
        print(f"unexpected error for video {id}: {str(e)}")
        videos_captions_text.append(None)

# Create a Pandas DataFrame from the collected video transcripts
videos_captions_df = pd.DataFrame(videos_captions_text, columns=['captions_text'])

# Calculate and print execution time
end = time.time()
print(f'This youtube scraping operation took {round(end - start)} seconds')

no transcript for video reYdQYZ9Rj4: no element found: line 1, column 0
This youtube scraping operation took 96 seconds


In [229]:
# view the output
videos_captions_df.head()

Unnamed: 0,captions_text
0,"- Most people, most of the time, are polite, c..."
1,- The ideas that I am talking about are ideas ...
2,- The big question for\nme in that timeline is...
3,- The following is a conversation\nwith Jordan...
4,- The following is a conversation with the fou...


### Get top 5 said words of each video

In [230]:
# Initialize a list to store the top 5 words for each video
top_5_words_in_video = []

# Define a function to extract the top N most frequent words from a given text
def get_top_n_words(text, n, exclude_list):
    """
    Extracts the top N most frequent words from a transcript text, excluding words in a provided list.

    Args:
        text (str): The input text (video transcript).
        n (int): The number of top words to return.
        exclude_list (list): A list of words to exclude from the count.

    Returns:
        list: A list of tuples, where each tuple contains a word and its frequency, sorted by frequency in descending order.
    """
    # Remove punctuation and convert text to lowercase
    words = [word.lower().strip('.,?!:;()[]{}\"\'') for word in text.split()]
    # Filter out words present in the exclude list (case-insensitive)
    words = [word for word in words if word.lower() not in [x.lower() for x in exclude_list]]
    # Count word frequencies
    c = Counter(words)
    # Return the N most common words
    return c.most_common(n)

In [231]:
# Record start time for performance measurement
start = time.time()

# Create a list of the excluded words from the top five words
exclude_list = ["the", "a", "like", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because",
                "been", "before", "being", "one", "in", "not", "of", "is", "just", "even", "get", "to", "lot", "that", "his", "know",
                "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had",
                "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "how", "how's", "i",
                "i'd", "i'll", "i'm", "i've", "if", "into", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor",
                "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should",
                "so", "some", "such", "than", "that's", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
                "they'll", "they're", "they've", "this", "those", "through", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're",
                "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
                "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "uh", "don't", "um", "yeah", "actually", "also",
                "going", "can", "something", "will", "no", "well", "many", "really", "things", "kind", "think", "say", "see", "basically", "man"
                "right", "else", "isn't", "shall", "wasn't", "http", "however", "therefore", "can't", "through", "further", "shan't", "whom", "get",
                  "is", "them", "but", "me", "too", "as", "once", "hadn't", "doing", "our", "which", "time", "good", "great", "little", "back", "stuff", "able", "life", "world",
                "until", "nor", "we're", "they", "cannot", "any", "won't", "they'll", "why's", "because", "can't", "www", "people", "♪", "cus", "ago"
                "haven't", "mustn't", "under", "against", "let's", "off", "shouldn't", "below", "has", "couldn't", "weren't", "human," "idea," "country", "system", 
                "while", "not", "was", "such", "ever", "yours", "it", "his", "myself", "you're", "than", "didn't", "obviously", "different",
                "aren't", "does", "ought", "she's", "some", "by", "ours", "only", "should", "few", "since", "did", "theirs", "these", "were", "him", "on", "if",
                "up", "where", "its", "wouldn't", "each", "with", "where's", "hasn't", "who", "for", "otherwise", "during", "those", "other", "before", "doesn't", "into", "herself",
                "to", "how", "that's", "hence", "having", "own", "com", "why", "it's", "mean", "make", "want", "sort", "okay", "thing", "need", "go", "much",
                "got", "way", "maybe", "upon", "100", "yes", "may", "look", "oh", "somebody", "around", "now", "said", "-", "lex", "might", "gonna", "guys", "cause",
                "years", "thing", "things", "really", "very", "actually", "kind", "kinda", "sort", "sorta", "just", "mean", "think", "say", "said", "saying", "know", "going", "get", "got", "lot", "way"]


# Iterate through each transcript's text
for text in videos_captions_text:
  if text: # Check if the transcript exists (is not None)
    # Get the top 5 most frequent words in the transcript, excluding the words in exclude_list
    top_5_words = get_top_n_words(text, 5, exclude_list)
    top_5_words_in_video.append(top_5_words)
  else:
     # If no transcript is available, append a message indicating that
    top_5_words_in_video.append('no captions available')

# Calculate and print execution time
end = time.time()
print(f'This operation took {round(end - start)} seconds')

This operation took 50 seconds


In [232]:
# Process the top 5 words data and format it for the DataFrame
top_five_words = []
for item in top_5_words_in_video:
    if isinstance(item, list):  # Check if the item is a list of (word, count) tuples
        # Filter out any remaining excluded words that might have been missed in the previous step
        filtered_words = [(word, count) for word, count in item
                         if word.lower().strip('.,?!:;()[]{}\"\'') not in exclude_list]

        # Format the words and counts as a comma-separated string ("word-count, word-count, ...")
        words_string = ', '.join([f"{word}-{count}" for word, count in filtered_words])
        top_five_words.append([words_string])  # Append as a list to maintain structure
    else:
        # If no top words are available (e.g., 'no captions available'), append the existing value directly
        top_five_words.append([item])  # Append the message or value from top_5_words_in_video


# Create a DataFrame from the processed top 5 words data
top_five_df = pd.DataFrame(top_five_words, columns=['words'])

In [233]:
# Add the 'top_five_words' and 'captions_text' columns to the main DataFrame
lex_df0['top_five_words'] = top_five_df['words']
lex_df0['captions_text'] = videos_captions_df['captions_text']

# Display the first 10 rows of the updated DataFrame
lex_df0.head(10)

Unnamed: 0,id,yt_title,description,upload_date,duration,views,likes,comments_count,tags,thumbnail_url,favorite_count,region_restriction,captions_availability,top_five_words,captions_text
0,abd5hguWKz0,"Rick Spence: CIA, KGB, Illuminati, Secret Soci...",Rick Spence is a historian specializing in the...,2024-10-30T18:06:12Z,PT3H28M20S,901686,14696,2037,"[Rick Spence, alex friedman, lex ai, lex debat...",https://i.ytimg.com/vi/abd5hguWKz0/hqdefault.jpg,0,,True,"jews-51, part-50, idea-49, german-49, intellig...","- Most people, most of the time, are polite, c..."
1,MzkgWDCucNY,Bernie Sanders Interview | Lex Fridman Podcast...,Bernie Sanders is a US Senator from Vermont an...,2024-10-23T20:19:21Z,PT1H2M32S,1218968,23415,5665,"[Bernie Sanders, alex friedman, lex ai, lex de...",https://i.ytimg.com/vi/MzkgWDCucNY/hqdefault.jpg,0,,True,"right-56, money-39, working-28, country-28, he...",- The ideas that I am talking about are ideas ...
2,NMHiLvirCb0,Graham Hancock: Lost Civilization of the Ice A...,Graham Hancock a journalist and author who for...,2024-10-16T12:16:21Z,PT2H33M2S,2656311,43799,7821,"[Graham Hancock, alex friedman, lex ai, lex de...",https://i.ytimg.com/vi/NMHiLvirCb0/hqdefault.jpg,0,,True,"ago-63, civilization-60, human-53, ancient-48,...",- The big question for\nme in that timeline is...
3,q8VePUwjB9Y,"Jordan Peterson: Nietzsche, Hitler, God, Psych...","Jordan Peterson is a psychologist, author, lec...",2024-10-11T18:03:40Z,PT2H23M5S,1132848,25316,4191,"[Jordan Peterson, alex friedman, lex ai, lex d...",https://i.ytimg.com/vi/q8VePUwjB9Y/hqdefault.jpg,0,,True,"right-84, god-40, ideas-37, idea-36, nietzsche-33",- The following is a conversation\nwith Jordan...
4,oFfVt3S51T4,Cursor Team: Future of Programming with AI | L...,"Aman Sanger, Arvid Lunnemark, Michael Truell, ...",2024-10-06T18:43:14Z,PT2H29M5S,488086,9397,881,"[Cursor Team, alex friedman, lex ai, lex debat...",https://i.ytimg.com/vi/oFfVt3S51T4/hqdefault.jpg,0,,True,"code-194, model-156, models-110, programming-5...",- The following is a conversation with the fou...
5,AzzE7GOvYz8,"Ed Barnhart: Maya, Aztec, Inca, and Lost Civil...",Ed Barnhart is an archaeologist and explorer s...,2024-09-30T17:23:24Z,PT3H28M51S,2750975,25303,2801,"[Ed Barnhart, alex friedman, lex ai, lex debat...",https://i.ytimg.com/vi/AzzE7GOvYz8/hqdefault.jpg,0,,True,"maya-88, right-63, america-54, big-54, civiliz...",- For the vast majority\nof human existence. W...
6,Q8Qk_3a3lUw,"Vivek Ramaswamy: Trump, Conservatism, National...","Vivek Ramaswamy is a conservative politician, ...",2024-09-25T17:59:15Z,PT2H40M26S,770550,20913,6031,"[Vivek Ramaswamy, alex friedman, lex ai, lex d...",https://i.ytimg.com/vi/Q8Qk_3a3lUw/hqdefault.jpg,0,,True,"right-125, states-118, united-111, country-94,...","- The way I would do it, 75% headcount reducti..."
7,s1oTH4Sjvzg,"Vejas Liulevicius: Communism, Marxism, Nazism,...",Vejas Liulevicius is a historian specializing ...,2024-09-20T20:30:55Z,PT3H31M58S,909253,10610,1638,"[Vejas Liulevicius, alex friedman, lex ai, lex...",https://i.ytimg.com/vi/s1oTH4Sjvzg/hqdefault.jpg,0,,True,"marx-87, history-80, war-68, soviet-64, right-60","- And the outcome here is\na horrific, manmade..."
8,DyoVVSggPjY,Gregory Aldrete: The Roman Empire - Rise and F...,Gregory Aldrete is a historian specializing in...,2024-09-12T19:05:50Z,PT3H42M21S,3755496,43979,3773,"[Gregory Aldrete, alex friedman, lex ai, lex d...",https://i.ytimg.com/vi/DyoVVSggPjY/hqdefault.jpg,0,,True,"roman-257, romans-150, rome-112, empire-110, h...",- So Rome always wins because even if they los...
9,qCbfTN-caFI,Donald Trump Interview | Lex Fridman Podcast #442,Donald Trump is the 45th President of the Unit...,2024-09-03T16:21:05Z,PT1H4M18S,6268026,227993,59812,"[Donald Trump, alex friedman, lex ai, lex deba...",https://i.ytimg.com/vi/qCbfTN-caFI/hqdefault.jpg,0,,True,"right-33, country-31, done-22, big-18, electio...","- I don't know if you know this, but some peop..."


### Get videos' URLs

In [234]:
# Convert the video Ids to YouTube URLs
lex_df0['yt_url'] = 'https://www.youtube.com/watch?v=' + lex_df0['id']

### Get Epsidoe Number, Guest Name, and Episode Summary

In [235]:
# Extract episode numbers
lex_df0['number'] = lex_df0[lex_df0['yt_title'].str.contains('#', na=False)]['yt_title'].str.split("#").str[1]
lex_df0['number'] = lex_df0['number'].fillna('').astype(int)

# First split on '|' and keep only the first part
lex_df0['clean_title'] = lex_df0['yt_title'].str.split('|').str[0]

# Then split that into Guest and Summary on ':'
lex_df0[['guest', 'summary']] = lex_df0['clean_title'].str.split(":", n=1, expand=True)

# Keep only first two words of Guest name
lex_df0['guest'] = lex_df0['guest'].str.split().str[:2].str.join(' ')

# Drop intermediate column
lex_df0.drop(columns='clean_title', inplace=True)

# Clean up any remaining whitespace
lex_df0['guest'] = lex_df0['guest'].str.strip()
lex_df0['summary'] = lex_df0['summary'].str.strip()

# Display the first rows of the updated DataFrame
lex_df0.head()

Unnamed: 0,id,yt_title,description,upload_date,duration,views,likes,comments_count,tags,thumbnail_url,favorite_count,region_restriction,captions_availability,top_five_words,captions_text,yt_url,number,guest,summary
0,abd5hguWKz0,"Rick Spence: CIA, KGB, Illuminati, Secret Soci...",Rick Spence is a historian specializing in the...,2024-10-30T18:06:12Z,PT3H28M20S,901686,14696,2037,"[Rick Spence, alex friedman, lex ai, lex debat...",https://i.ytimg.com/vi/abd5hguWKz0/hqdefault.jpg,0,,True,"jews-51, part-50, idea-49, german-49, intellig...","- Most people, most of the time, are polite, c...",https://www.youtube.com/watch?v=abd5hguWKz0,451,Rick Spence,"CIA, KGB, Illuminati, Secret Societies, Cults ..."
1,MzkgWDCucNY,Bernie Sanders Interview | Lex Fridman Podcast...,Bernie Sanders is a US Senator from Vermont an...,2024-10-23T20:19:21Z,PT1H2M32S,1218968,23415,5665,"[Bernie Sanders, alex friedman, lex ai, lex de...",https://i.ytimg.com/vi/MzkgWDCucNY/hqdefault.jpg,0,,True,"right-56, money-39, working-28, country-28, he...",- The ideas that I am talking about are ideas ...,https://www.youtube.com/watch?v=MzkgWDCucNY,450,Bernie Sanders,
2,NMHiLvirCb0,Graham Hancock: Lost Civilization of the Ice A...,Graham Hancock a journalist and author who for...,2024-10-16T12:16:21Z,PT2H33M2S,2656311,43799,7821,"[Graham Hancock, alex friedman, lex ai, lex de...",https://i.ytimg.com/vi/NMHiLvirCb0/hqdefault.jpg,0,,True,"ago-63, civilization-60, human-53, ancient-48,...",- The big question for\nme in that timeline is...,https://www.youtube.com/watch?v=NMHiLvirCb0,449,Graham Hancock,Lost Civilization of the Ice Age & Ancient Hum...
3,q8VePUwjB9Y,"Jordan Peterson: Nietzsche, Hitler, God, Psych...","Jordan Peterson is a psychologist, author, lec...",2024-10-11T18:03:40Z,PT2H23M5S,1132848,25316,4191,"[Jordan Peterson, alex friedman, lex ai, lex d...",https://i.ytimg.com/vi/q8VePUwjB9Y/hqdefault.jpg,0,,True,"right-84, god-40, ideas-37, idea-36, nietzsche-33",- The following is a conversation\nwith Jordan...,https://www.youtube.com/watch?v=q8VePUwjB9Y,448,Jordan Peterson,"Nietzsche, Hitler, God, Psychopathy, Suffering..."
4,oFfVt3S51T4,Cursor Team: Future of Programming with AI | L...,"Aman Sanger, Arvid Lunnemark, Michael Truell, ...",2024-10-06T18:43:14Z,PT2H29M5S,488086,9397,881,"[Cursor Team, alex friedman, lex ai, lex debat...",https://i.ytimg.com/vi/oFfVt3S51T4/hqdefault.jpg,0,,True,"code-194, model-156, models-110, programming-5...",- The following is a conversation with the fou...,https://www.youtube.com/watch?v=oFfVt3S51T4,447,Cursor Team,Future of Programming with AI


### Convert Data Types

In [236]:
# Convert the 'duration' column from ISO 8601 format to a timedelta object
def convert_duration(duration):
    """
    Converts an ISO 8601 duration string (e.g., "PT1H23M45S") to a timedelta object.

    Args:
        duration (str): The ISO 8601 duration string.

    Returns:
        timedelta: The equivalent timedelta object.
    """
    duration = duration[2:] # Remove the "PT" prefix

    hours = 0
    minutes = 0
    seconds = 0

    if "H" in duration:
        hours, duration = int(duration.split("H")[0]), duration.split("H")[1] #Extract Hour
    if "M" in duration:
        minutes, duration = int(duration.split("M")[0]) if duration.split("M")[0] else 0 , duration.split("M")[1] if len(duration.split("M")) > 1 else "" #Extract minutes
    if "S" in duration:
        seconds = int(duration.split("S")[0]) if duration.split("S")[0] else 0 #Extract seconds

    total_seconds = 3600 * hours + 60 * minutes + seconds # Calculate total seconds

    duration = datetime.timedelta(seconds=total_seconds) # Return as timedelta

    return duration

# Apply the conversion function to the 'duration' column
lex_df0['duration'] = lex_df0['duration'].apply(convert_duration)

# Create a new column 'duration_minutes' representing the duration in minutes
lex_df0['duration_minutes'] = lex_df0['duration'].dt.total_seconds() / 60

# Display the first rows of the updated DataFrame
lex_df0.head()

Unnamed: 0,id,yt_title,description,upload_date,duration,views,likes,comments_count,tags,thumbnail_url,favorite_count,region_restriction,captions_availability,top_five_words,captions_text,yt_url,number,guest,summary,duration_minutes
0,abd5hguWKz0,"Rick Spence: CIA, KGB, Illuminati, Secret Soci...",Rick Spence is a historian specializing in the...,2024-10-30T18:06:12Z,0 days 03:28:20,901686,14696,2037,"[Rick Spence, alex friedman, lex ai, lex debat...",https://i.ytimg.com/vi/abd5hguWKz0/hqdefault.jpg,0,,True,"jews-51, part-50, idea-49, german-49, intellig...","- Most people, most of the time, are polite, c...",https://www.youtube.com/watch?v=abd5hguWKz0,451,Rick Spence,"CIA, KGB, Illuminati, Secret Societies, Cults ...",208.333333
1,MzkgWDCucNY,Bernie Sanders Interview | Lex Fridman Podcast...,Bernie Sanders is a US Senator from Vermont an...,2024-10-23T20:19:21Z,0 days 01:02:32,1218968,23415,5665,"[Bernie Sanders, alex friedman, lex ai, lex de...",https://i.ytimg.com/vi/MzkgWDCucNY/hqdefault.jpg,0,,True,"right-56, money-39, working-28, country-28, he...",- The ideas that I am talking about are ideas ...,https://www.youtube.com/watch?v=MzkgWDCucNY,450,Bernie Sanders,,62.533333
2,NMHiLvirCb0,Graham Hancock: Lost Civilization of the Ice A...,Graham Hancock a journalist and author who for...,2024-10-16T12:16:21Z,0 days 02:33:02,2656311,43799,7821,"[Graham Hancock, alex friedman, lex ai, lex de...",https://i.ytimg.com/vi/NMHiLvirCb0/hqdefault.jpg,0,,True,"ago-63, civilization-60, human-53, ancient-48,...",- The big question for\nme in that timeline is...,https://www.youtube.com/watch?v=NMHiLvirCb0,449,Graham Hancock,Lost Civilization of the Ice Age & Ancient Hum...,153.033333
3,q8VePUwjB9Y,"Jordan Peterson: Nietzsche, Hitler, God, Psych...","Jordan Peterson is a psychologist, author, lec...",2024-10-11T18:03:40Z,0 days 02:23:05,1132848,25316,4191,"[Jordan Peterson, alex friedman, lex ai, lex d...",https://i.ytimg.com/vi/q8VePUwjB9Y/hqdefault.jpg,0,,True,"right-84, god-40, ideas-37, idea-36, nietzsche-33",- The following is a conversation\nwith Jordan...,https://www.youtube.com/watch?v=q8VePUwjB9Y,448,Jordan Peterson,"Nietzsche, Hitler, God, Psychopathy, Suffering...",143.083333
4,oFfVt3S51T4,Cursor Team: Future of Programming with AI | L...,"Aman Sanger, Arvid Lunnemark, Michael Truell, ...",2024-10-06T18:43:14Z,0 days 02:29:05,488086,9397,881,"[Cursor Team, alex friedman, lex ai, lex debat...",https://i.ytimg.com/vi/oFfVt3S51T4/hqdefault.jpg,0,,True,"code-194, model-156, models-110, programming-5...",- The following is a conversation with the fou...,https://www.youtube.com/watch?v=oFfVt3S51T4,447,Cursor Team,Future of Programming with AI,149.083333


In [237]:
# Type Conversion for Data Analysis and Consistency

# Convert numerical statistics columns to integers
lex_df0[['views', 'likes', 'comments_count', 'favorite_count']] = lex_df0[['views', 'likes', 'comments_count', 'favorite_count']].astype(int)

# Convert duration in minutes to float (for potential fractional minutes)
lex_df0['duration_minutes'] = lex_df0['duration_minutes'].astype(float)

# Convert captions availability to boolean (True/False) for easier filtering/analysis
lex_df0['captions_availability'] = lex_df0['captions_availability'].apply(lambda x: True if x.lower() == 'true' else False)

# Convert upload dates to datetime objects for date/time-based operations
lex_df0['upload_date'] = pd.to_datetime(lex_df0['upload_date'])

### Get guest nationality and profession

I'll get the guests nationality and profession using `requests` in wikipedia pages

In [238]:
# Record start time for performance measurement
start = time.time()

# Define a function to fetch guest nationality and profession from Wikipedia
def fetch_bio_data(guest_name):
    """
    Fetches biographical data (nationality and profession) for a given guest name from Wikipedia pages

    Args:
        guest_name (str): The name of the guest.

    Returns:
        dict: A dictionary containing the guest's nationality and profession, or NaN values if not found.
    """
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{guest_name.replace(' ', '_')}" # Construct Wikipedia page URL
    response = requests.get(url)
    
    if response.status_code == 200: # Check for successful response
        data = response.json()
        # Extract nationality and profession from the Wikipedia page summary (if available)
        return {
            'nationality': data.get('description', 'N/A').split(", ")[-1] if 'description' in data else 'N/A', # Nationality is assumed to be the last element
            'profession': data.get('description', 'N/A').split(", ")[0] if 'description' in data else 'N/A' # Profession is assumed to be the first element
        }
    # Handle cases where the Wikipedia page is not found or other errors
    else:
        return {'nationality': np.nan, 'profession': np.nan}

# Apply function to DataFrame and create new columns
lex_df0[['nationality', 'profession']] = lex_df0['guest'].apply(lambda x: pd.Series(fetch_bio_data(x)))

# Calculate and print execution time
end = time.time()
print(f'This scraping operation took {round(end - start)} seconds')

This scraping operation took 55 seconds


In [239]:
# Display the first 10 rows of the newly added columns with the guest names
lex_df0[['guest', 'nationality', 'profession']].head(10)

Unnamed: 0,guest,nationality,profession
0,Rick Spence,,
1,Bernie Sanders,American politician and activist (born 1941),American politician and activist (born 1941)
2,Graham Hancock,British writer (born 1950),British writer (born 1950)
3,Jordan Peterson,Canadian clinical psychologist (born 1962),Canadian clinical psychologist (born 1962)
4,Cursor Team,,
5,Ed Barnhart,American archaeologist and explorer (born 1968),American archaeologist and explorer (born 1968)
6,Vivek Ramaswamy,American businessman (born 1985),American businessman (born 1985)
7,Vejas Liulevicius,,
8,Gregory Aldrete,American academic,American academic
9,Donald Trump,Former president and president-elect of the Un...,Former president and president-elect of the Un...


After checking the profession and nationality columns, I found some inconsistencies and duplications between the two columns. So I'll handle those using regular expressions.

In [240]:
# Refine nationality and profession extraction using regular expressions
def split_nationality_profession(text):
    """
    Splits a string into nationality and profession components using a regular expression.
    Assumes nationality consists of capitalized words (including hyphenated) at the beginning of the string.

    Args:
        text (str): The input string containing potential nationality and profession information.

    Returns:
        pd.Series: A Pandas Series containing the extracted nationality and profession.
                   Returns empty strings if the input is not a string or no match is found.
    """
    # Check if the input is a string
    if isinstance(text, str):  # Check if the input is a valid string
        # Regular expression to match capitalized words (or hyphenated capitalized words) at the beginning of the string
        match = re.match(r'^([A-Z][a-z]*(?:-[A-Z][a-z]*)*\s?)+', text)
        if match:
            nationality = match.group().strip()
            profession = text[len(nationality):].strip()
            return pd.Series([nationality, profession])
    # Return empty strings if the text is not a valid string
    return pd.Series(["", ""])

# Apply the function to the 'profession' column to split it into separate 'nationality' and 'profession' columns
lex_df0[['nationality', 'profession']] = lex_df0['profession'].apply(split_nationality_profession)

In [241]:
# Display the first 30 rows
lex_df0[['guest', 'nationality', 'profession']].head(30)

Unnamed: 0,guest,nationality,profession
0,Rick Spence,,
1,Bernie Sanders,American,politician and activist (born 1941)
2,Graham Hancock,British,writer (born 1950)
3,Jordan Peterson,Canadian,clinical psychologist (born 1962)
4,Cursor Team,,
5,Ed Barnhart,American,archaeologist and explorer (born 1968)
6,Vivek Ramaswamy,American,businessman (born 1985)
7,Vejas Liulevicius,,
8,Gregory Aldrete,American,academic
9,Donald Trump,Former,president and president-elect of the United St...


Now this looks so much better, Now let's do some manual refinemnets and standardization into the **nationality** column.

In [242]:
# Define a dictionary to map incorrect or inconsistent nationality data to their standardized equivalents
replacement_dict = {
    "President": "American",
    "Retired United States Navy SEAL": "American",
    "Israeli-American Objectivist": "Israeli-American",
    "American AI": "American",
    "Brazilian Jiu-Jitsu": "Brazilian",
    "Australia": "Australian",
    "Australian": "Australian",
    "American YouTuber": "American",
    "English-American": "British-American",
    "English": "British",
    "American-Swiss": "Swiss-American",
    "French American": "French-American",
    "Topics referred to by the same term": 'N/A'
}

# Replace values in the 'nationality' column using the mapping dictionary
lex_df0['nationality'] = lex_df0['nationality'].replace(replacement_dict)#, regex=True)

Now let's do some regular expressions refinemnets and standardization into the **prefession** column, I'll remove the birth years too because they're not extracted for every guest, and I'll add them later for everyone using Wikidata.

In [243]:
# Remove birth year information from profession entries using a regular expression
# \s*: Matches zero or more whitespace characters
# \( and \): Matches literal parentheses
# born: Matches the literal word "born"
# \d{4}: Matches exactly four digits (the year)
lex_df0['profession'] = lex_df0['profession'].str.replace(r'\s*\(born \d{4}\)', '', regex=True)

# Remove leading/trailing whitespace from profession entries
lex_df0['profession'] = lex_df0['profession'].str.strip()

# Define a function to standardizes profession column
def standardize_profession(profession):
    """
    Standardizes profession strings by removing extra whitespace, birth year info, and parenthetical content.

    Args:
        profession (str or NaN): The profession string to standardize.

    Returns:
        str or NaN: The standardized profession string, or NaN if the input was NaN.
    """
    if pd.isna(profession): # Handle missing values (NaN)
        return profession
        
    profession = str(profession).lower().strip() # Convert to lowercase and remove leading/trailing whitespace
    
    # Remove the '-Born' prefix
    profession = profession.replace('-born', '').strip()
    profession = profession.replace('born', '').strip()
    
    # Remove parenthetical content (including anything inside parentheses)
    profession = re.sub(r'\s*\(.*?\)', '', profession) # Non-greedy matching to avoid capturing too much

    # Remove content after an en dash (including the dash) – often used for birth/death dates or additional info
    profession = re.sub(r'\s*–.*', '', profession)
    
    return profession

# Apply the standardization function to the 'profession' column
lex_df0['profession'] = lex_df0['profession'].apply(standardize_profession)

In [244]:
# Manually refine the profession column, I try as much as I can to make it just a single word to make the analysis easier

# Define a dictionary to map incorrect or inconsistent profession data to their standardized equivalents
profession_replacement_dict = {
    "-born businessman": "Businessperson",
    "neuroscientist and podcaster": "Neuroscientist",
    "computer scientist and mathematician": "Computer Scientist",
    "podcaster and comedian": "Comedian",
    "linguist and activist": "Linguist",
    "philosopher and neuroscientist": "Philosopher",
    "investor and hedge fund manager": "Investor",
    "comedian and actor": "Comedian",
    "entrepreneur and investor": "Entrepreneur",
    "clinical psychologist": "Psychologist",
    "wrestler and mixed martial artist": "Martial Arts",
    "speculative fiction writer": "Writer",
    "physician-scientist": "Scientist",
    "astronomer and planetary scientist": "Astronomer",
    "computer programmer and entrepreneur": "Computer Scientist",
    "rapper and record producer": "Rapper",
    "wrestler": "Martial Arts",
    "chef and businessperson": "Chef",
    "businessman and software engineer": "Businessperson",
    "defector and activist": "Defector",
    "evolutionary biologist and author": "Biologist",
    "cognitive scientist": "Scientist",
    "physicist and Nobel laureate": "Physicist",
    "writer and director": "Writer",
    "wrestler and coach": "Martial Arts",
    "stand-up comedian and actor": "Comedian",
    "moral philosopher": "Philosopher",
    "mathematical physicist": "Physicist",
    "businessman and investor": "Businessperson",
    "politician and activist": "Politician",
    "social psychologist": "Psychologist",
    "professional armwrestler": "Martial Arts",
    "political scientist": "Scientist",
    "and businessman": "Businessperson",
    "businessman and author": "Businessperson",    
    "researcher and writer": "Writer",
    "designer and academic": "Designer",
    "Investigative journalist and author": "Journalist",
    "chess grandmaster and streamer": "Chess Player",
    "comedian and surgeon": "Comedian",
    "businesswoman": "Businessperson",
    "archaeologist and explorer": "Archaeologist",
    "-born AI researcher": "AI Expert",
    "chess player and content creator": "Chess Player",    
    "record producer": "Producer",
    "evolutionary psychologist": "Psychologist",
    "artificial intelligence researcher": "AI Expert",
    "economist and author": "Economist",
    "cosmologist and astrophysicist": "Astrophysicist",
    "computer programmer and video game developer": "Computer Scientist",
    "chess grandmaster": "Chess Player",
    "biochemist and writer": "Biochemist",
    "software engineer": "Computer Scientist",
    "-Founder Of Wikipedia": "Founder Of Wikipedia",
    "And Writer": "Writer",
    "Physician": "Physicist",
    "Programmer": "Computer Scientist",
    "Business Executive": "Businessperson",
    "ai scientist": "AI Expert",
    "anthropologist and primatologist": "Anthropologist",
    "anti-scientology activist": "Activist",
    "Archaeologist And Explorer": "Archaeologist",
    "astronautical engineer": "Engineer",
    "astronomer and associate professor": "Astronomer",
    "Astronomer And Planetary Scientist": "Astronomer",
    "author and motivational speaker": "Author",
    "author and producer": "Author",
    "billionaire hedge fund manager  1966": "Investor",
    "business executive": "Businessperson",
    "businessman": "Businessperson",
    "businessperson and author": "Businessperson",
    "businesswoman": "Businessperson",
    "Chef And Businessperson": "Chef",
    "Investigative Journalist": "Journalist",    
    "aerospace engineer": "Engineer",
    "ai researcher": "AI Expert",
    "co-founders of fermat’s library": "Co-Founder",
    "computer scientist and ai researcher": "Computer Scientist",
    "computer scientist and technology executive": "Computer Scientist",
    "conservationist and author": "Conservationist",    
    "cryptocurrency entrepreneur": "Entrepreneur",
    "economist and political scientist": "Economist",    
    "free speech advocate": "Attorney",    
    "geopolitical commentator and author": "Political Commentator",
    "international rugby league footballer": "Footballer",
    "inventor and robotics youtuber": "Inventor",
    "investigative journalist and author": "Journalist",
    "legal scholar": "Scholar",    
    "machine learning researcher": "AI Expert",
    "mathematics educator": "Mathematician",
    "mixed martial artist and professional wrestler": "Martial Arts",
    "mixed martial arts fighter": "Martial Arts",
    "molecular biologist": "Biologist",    
    "multimedia instant messaging app": "Entrepreneur",
    "of wellesley college": "Professor",
    "particle physicist": "Physicist",
    "philanthropic organization": "Activist",
    "philosopher and cognitive scientist": "Philosopher",
    "physician and author": "Physician",
    "physicist and computational neuroscientist": "Physicist",
    "physicist and nobel laureate": "Physicist",
    "planetary physicist": "Physicist",
    "podcaster and author": "Podcaster",
    "political pundit": "Political Commentator",
    "practitioner and mixed martial artist": "Martial Arts",
    "software developer": "Computer Scientist",
    "technology company that produces consumer robots": "Roboticist",
    "theoretical physicist and astrobiologist": "Physicist",
    "ultramarathon runner": "Athlete",
    "writer and media personality": "Writer",
    "film producer": "Producer",
    "filmmaker": "Director",
    "Film Producer": "Producer",
    "Film Director": "Director",
    "musician": "Artist",
    "game designer": "Designer",
    "judoka": "Martial Arts",
    "singer": "Artist",
    "programmer": "Computer Scientist",
    "theoretical physicist": "Physicist",
    "computational biologist": "Biologist",
    "research scientist": "Researcher",
    "lawyer": "Attorney"
}

# Replace values in the 'profession' column using the mapping dictionary
lex_df0['profession'] = lex_df0['profession'].replace(profession_replacement_dict)

In [245]:
# Display the first 30 rows
lex_df0[['guest', 'nationality', 'profession']].head(30)

Unnamed: 0,guest,nationality,profession
0,Rick Spence,,
1,Bernie Sanders,American,Politician
2,Graham Hancock,British,writer
3,Jordan Peterson,Canadian,Psychologist
4,Cursor Team,,
5,Ed Barnhart,American,Archaeologist
6,Vivek Ramaswamy,American,Businessperson
7,Vejas Liulevicius,,
8,Gregory Aldrete,American,academic
9,Donald Trump,Former,president and president-elect of the united st...


### Get guest birth and death years

I'll get the guests birth and death years using `requests` in wikidata pages

In [246]:
# Record start time for performance measurement
start = time.time()

def get_birth_death_years(name):
    """
    Queries Wikidata for the birth and death years of guest names.

    Args:
        name (str): The name of the guest to search for.

    Returns:
        tuple: A tuple containing the birth year and death year (as strings), or (None, None) if not found or an error occurs.
    """
    url = f"https://www.wikidata.org/w/api.php?action=wbsearchentities&search={name}&language=en&format=json" # Construct Wikidata page URL
    
    try:
        response = requests.get(url)
        data = response.json()
        
        if data['search']: # Check if any search results were returned
            entity_id = data['search'][0]['id'] # Get the Wikidata entity ID of the first result (most likely match)

            # Construct the Wikidata URL to get claims (properties) for the entity
            details_url = f"https://www.wikidata.org/w/api.php?action=wbgetclaims&entity={entity_id}&format=json"
            details_response = requests.get(details_url)
            details_data = details_response.json()
            
            birth_year = death_year = None
            
            # Extract birth year (property P569)
            if 'P569' in details_data['claims']:
                birth_date = details_data['claims']['P569'][0]['mainsnak']['datavalue']['value']['time']
                birth_year = birth_date[1:5]  # Extract the year from the date string

            # Extract death year (property P570)    
            if 'P570' in details_data['claims']:
                death_date = details_data['claims']['P570'][0]['mainsnak']['datavalue']['value']['time']
                death_year = death_date[1:5]  # Extract the year from the date string
            
            return birth_year, death_year
    except Exception as e: # Handle any errors during the process
        print(f"Error retrieving data for {name}: {e}")
        return None, None # Return None values if an error occurred
    
# Apply the function to the 'guest' column to create new 'birth_year' and 'death_year' columns
lex_df0[['birth_year', 'death_year']] = lex_df0['guest'].apply(lambda name: pd.Series(get_birth_death_years(name)))

# Calculate and print execution time
end = time.time()
print(f'This scraping operation took {round(end - start)} seconds')

This scraping operation took 302 seconds


In [247]:
# Convert birth and death years into integers
lex_df0['birth_year'] = lex_df0['birth_year'].fillna(0).astype(int)
lex_df0['death_year'] = lex_df0['death_year'].fillna(0).astype(int)

In [248]:
# Display the last 30 rows of the newly added columns with the guest names
lex_df0[['guest', 'nationality', 'profession', 'birth_year', 'death_year']].tail(30)

Unnamed: 0,guest,nationality,profession,birth_year,death_year
444,Vladimir Vapnik,Russian,mathematician,1936,0
445,Yoshua Bengio,Canadian,computer scientist,1964,0
446,Steven Pinker,Canadian-American,psycholinguist,1954,0
447,Christof Koch,German-American,neurophysiologist,1956,0
448,Max Tegmark,Swedish-American,cosmologist,1967,0
449,Andrew Bustamante,,,0,0
450,Tucker Carlson,American,political commentator,1969,0
451,Jordan Peterson,Canadian,Psychologist,1962,0
452,Elon Musk,South African,Businessperson,1971,0
453,Paul Rosolie,American,Conservationist,0,0


### Final manual refinements

Now I have to do some manual refinements on the whole dataset, on guest	nationality, profession, birth_year, and death_year.

In [249]:
# As some guests birth years are not available on the internet, I'll do some estimation based on their education and careers, and indicate that with a boolean column if it's estimated or not
lex_df0['birth_year_estimated'] = False

In [250]:
# Manual refining of guests’ biographic data

lex_df0.loc[lex_df0['guest'] == 'Bernie Sanders', ['profession']] = ['Politician']
lex_df0.loc[lex_df0['guest'] == 'Mohammed El-Kurd', ['profession']] = ['Activist']
lex_df0.loc[lex_df0['guest'] == 'Noam Chomsky', ['profession']] = ['Activist']
lex_df0.loc[lex_df0['guest'] == 'Rick Doblin', ['profession']] = ['Researcher']
lex_df0.loc[lex_df0['guest'] == 'Yeonmi Park', ['profession']] = ['Activist']
lex_df0.loc[lex_df0['guest'] == 'Marcus Hutter', ['nationality', 'profession']] = ['German', 'AI Expert']
lex_df0.loc[lex_df0['guest'] == 'Harvey Silverglate', ['nationality', 'profession']] = ['American', 'Attorney']
lex_df0.loc[lex_df0['guest'] == 'Albert Bourla', ['nationality', 'profession']] = ['Greek-American', 'Businessperson']
lex_df0.loc[lex_df0['guest'] == 'Chris Urmson', ['nationality', 'profession']] = ['Canadian', 'Engineer']
lex_df0.loc[lex_df0['guest'] == 'Marc Raibert', ['nationality', 'profession']] = ['American', 'Engineer']
lex_df0.loc[lex_df0['guest'] == 'Marc Andreessen', ['nationality', 'profession']] = ['American', 'Entrepreneur']
lex_df0.loc[lex_df0['guest'] == 'Douglas Lenat', ['nationality', 'profession']] = ['American', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Ilya Sutskever', ['nationality', 'profession']] = ['Israeli-Canadian', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Judea Pearl', ['nationality', 'profession']] = ['Israeli-American', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Cristiano Amon', ['nationality', 'profession']] = ['Brazilian', 'Engineer']
lex_df0.loc[lex_df0['guest'] == 'Bjørn Lomborg and Andrew Revkin', ['nationality', 'profession']] = ['Mixed', 'Mixed']
lex_df0.loc[lex_df0['guest'] == 'David Ferrucci', ['nationality', 'profession', 'birth_year']] = ['American', 'Computer Scientist', 1970]
lex_df0.loc[lex_df0['guest'] == 'François Chollet', ['nationality', 'profession']] = ['French', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Pieter Abbeel', ['nationality', 'profession']] = ['Belgian-American', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Grant Sanderson', ['nationality', 'profession', 'birth_year']] = ['American', 'YouTuber', 1997]
lex_df0.loc[lex_df0['guest'] == 'Aella', ['nationality', 'profession', 'birth_year','death_year']] = ['American', 'Sex Worker', 1992, 0]
lex_df0.loc[lex_df0['guest'] == 'Nick Bostrom', ['nationality', 'profession']] = ['Swedish', 'Philosopher']
lex_df0.loc[lex_df0['guest'] == 'Destiny', ['nationality', 'profession', 'birth_year']] = ['American', 'Streamer', 1988]
lex_df0.loc[lex_df0['guest'] == 'Clara Sousa-Silva', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['Portuguese', 'Astrophysicist', 1985, True]
lex_df0.loc[lex_df0['guest'] == 'Andrew Callaghan', ['nationality', 'profession']] = ['American', 'YouTuber']
lex_df0.loc[lex_df0['guest'] == 'Cursor Team', ['nationality', 'profession']] = ['Mixed', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Lisa Feldman', ['nationality', 'profession']] = ['Canadian-American', 'Psychologist']
lex_df0.loc[lex_df0['guest'] == 'Charles Isbell', ['nationality', 'profession']] = ['American', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'John Clarke', ['nationality', 'profession']] = ['American', 'Martial Arts']
lex_df0.loc[lex_df0['guest'] == 'Matthew Johnson', ['nationality', 'profession']] = ['American', 'Psychologist']
lex_df0.loc[lex_df0['guest'] == 'Vejas Liulevicius', ['nationality', 'profession', 'birth_year']] = ['American', 'Historian', 1968]
lex_df0.loc[lex_df0['guest'] == 'Ryan Hall', ['nationality', 'profession']] = ['American', 'Martial Arts']
lex_df0.loc[lex_df0['guest'] == 'Aaron Smith-Levin', ['nationality', 'profession', 'birth_year']] = ['American', 'Activist', 1981]
lex_df0.loc[lex_df0['guest'] == 'Sara Walker', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Activist', 1982, True]
lex_df0.loc[lex_df0['guest'] == 'Alien Debate', ['guest', 'nationality', 'profession']] = ['Sara Walker and Lee Cronin', 'Mixed', 'Mixed']
lex_df0.loc[lex_df0['guest'] == 'Anca Dragan', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['Romanian', 'Professor', 1986, True]
lex_df0.loc[lex_df0['guest'] == 'Aravind Srinivas', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['Indian-American', 'Computer Scientist', 1994, True]
lex_df0.loc[lex_df0['guest'] == 'B-Team Jiu', ['guest', 'nationality', 'profession']] = ['Craig Jones, Nicky Rod, and Nicky Ryan', 'Mixed', 'Martial Arts']
lex_df0.loc[lex_df0['guest'] == 'Balaji Srinivasan', ['nationality', 'profession', 'birth_year']] = ['American', 'Entrepreneur', 1980]
lex_df0.loc[lex_df0['guest'] == 'Benjamin Netanyahu', ['nationality', 'profession']] = ['Israeli', 'Politician, War Criminal']
lex_df0.loc[lex_df0['guest'] == 'Bishop Robert', ['guest', 'nationality', 'profession', 'birth_year', 'death_year']] = ['Bishop Robert Barron', 'American', 'Bishop', 1959, 0]
lex_df0.loc[lex_df0['guest'] == 'Bjarne Stroustrup', ['nationality', 'profession']] = ['Danish', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Boris Sofman', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Engineer', 1982, True]
lex_df0.loc[lex_df0['guest'] == 'Botez Sisters', ['guest', 'nationality', 'profession']] = ['Botez Sisters (Alexandra and Andrea Botez)', 'Canadian', 'Chess Player']
lex_df0.loc[lex_df0['guest'] == 'Brian Armstrong', ['nationality', 'profession']] = ['American', 'Businessperson']
lex_df0.loc[lex_df0['guest'] == 'Brian Muraresku', ['nationality', 'profession', 'birth_year']] = ['American', 'Author', 1984]
lex_df0.loc[lex_df0['guest'] == 'Charan Ranganath', ['nationality', 'profession', 'birth_year']] = ['Indian-American', 'Psychologist', 1971]
lex_df0.loc[lex_df0['guest'] == 'Chris Duffin', ['nationality', 'profession', 'birth_year']] = ['American', 'Entrepreneur', 1977]
lex_df0.loc[lex_df0['guest'] == 'Chris Mason', ['nationality', 'profession']] = ['American', 'Professor']
lex_df0.loc[lex_df0['guest'] == 'Chris Tarbell', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Former Intelligence Officer', 1977, True]
lex_df0.loc[lex_df0['guest'] == 'Christopher Capozzola', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Professor', 1972, True]
lex_df0.loc[lex_df0['guest'] == 'Craig Jones', ['nationality', 'profession', 'birth_year']] = ['Australian', 'Martial Arts', 1991]
lex_df0.loc[lex_df0['guest'] == 'Cristos Goodrow', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Computer Scientist', 1969, True]
lex_df0.loc[lex_df0['guest'] == 'Dan Kokotov', ['nationality', 'profession', 'birth_year']] = ['Russian-American', 'Computer Scientist', 1978]
lex_df0.loc[lex_df0['guest'] == 'Dan Reynolds', ['nationality', 'profession']] = ['American', 'Artist']
lex_df0.loc[lex_df0['guest'] == 'Daniel Schmachtenberger', ['nationality', 'profession', 'birth_year']] = ['American', 'Philosopher', 1984]
lex_df0.loc[lex_df0['guest'] == 'David Eagleman', ['nationality', 'profession']] = ['American', 'Neuroscientist']
lex_df0.loc[lex_df0['guest'] == 'David Patterson', ['nationality', 'profession']] = ['American', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'David Silver', ['nationality', 'profession', 'birth_year']] = ['British', 'Computer Scientist', 1976]
lex_df0.loc[lex_df0['guest'] == 'David Sinclair', ['nationality', 'profession']] = ['Australian-American', 'Geneticist']
lex_df0.loc[lex_df0['guest'] == 'Dennis Whyte', ['nationality', 'profession']] = ['Canadian-American', 'Nuclear Scientist']
lex_df0.loc[lex_df0['guest'] == 'Dileep George', ['nationality', 'profession']] = ['Indian-American', 'AI Expert']
lex_df0.loc[lex_df0['guest'] == 'Dmitry Korkin', ['nationality', 'profession', 'birth_year']] = ['Russian-American', 'Professor', 1979]
lex_df0.loc[lex_df0['guest'] == 'Douglas Murray', ['nationality', 'profession', 'birth_year', 'death_year']] = ['British', 'Political Commentator', 1979, 0]
lex_df0.loc[lex_df0['guest'] == 'Ed Calderon', ['nationality', 'profession', 'birth_year']] = ['Mexican', 'Security Specialist', 1982]
lex_df0.loc[lex_df0['guest'] == 'Eugenia Kuyda', ['nationality', 'profession', 'birth_year']] = ['Russian-American', 'Entrepreneur', 1987]
lex_df0.loc[lex_df0['guest'] == 'Fiona Hill', ['nationality', 'profession']] = ['British-American', 'Political Advisor']
lex_df0.loc[lex_df0['guest'] == 'Glenn Loury', ['nationality', 'profession']] = ['American', 'Professor']
lex_df0.loc[lex_df0['guest'] == 'Greg Brockman', ['nationality', 'profession']] = ['American', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Guido van', ['guest', 'nationality', 'profession']] = ['Guido van Rossum', 'Dutch', 'Creator of Python Programming Language']
lex_df0.loc[lex_df0['guest'] == 'Guillaume Verdon', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['French', 'Entrepreneur', 1990, True]
lex_df0.loc[lex_df0['guest'] == 'Gustav Soderstrom', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['Swedish', 'Electrical Engineer', 1977, True]
lex_df0.loc[lex_df0['guest'] == 'Harry Cliff', ['nationality', 'profession', 'birth_year']] = ['British', 'Particle Physicist', 1985]
lex_df0.loc[lex_df0['guest'] == 'Ian Hutchinson', ['nationality', 'profession']] = ['British', 'Nuclear Engineer']
lex_df0.loc[lex_df0['guest'] == 'Ishan Misra', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['Indian', 'Researcher    ', 1990, True]
lex_df0.loc[lex_df0['guest'] == 'Israel-Palestine Debate', ['guest', 'nationality', 'profession']] = ['Norman Finkelstein, Destiny, M. Rabbani & Benny Morris', 'Mixed', 'Mixed'] 
lex_df0.loc[lex_df0['guest'] == 'Jack Barsky', ['nationality', 'profession']] = ['German-American', 'Former Intelligence Officer']
lex_df0.loc[lex_df0['guest'] == 'Jack Dorsey', ['nationality', 'profession']] = ['American', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Jaron Lanier', ['nationality', 'profession']] = ['American', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Jay McClelland', ['nationality', 'profession']] = ['American', 'Cognitive Scientist']
lex_df0.loc[lex_df0['guest'] == 'Jeffrey Shainline', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Physicist', 1980, True]
lex_df0.loc[lex_df0['guest'] == 'Jeremy Howard', ['nationality', 'profession', 'birth_year']] = ['Australian', 'Data Scientist', 1973]
lex_df0.loc[lex_df0['guest'] == 'John Danaher', ['nationality', 'profession']] = ['New Zealander', 'Martial Arts']
lex_df0.loc[lex_df0['guest'] == 'John Vervaeke', ['nationality', 'profession', 'birth_year']] = ['Canadian', 'Psychologist', 1963]
lex_df0.loc[lex_df0['guest'] == 'Kai-Fu Lee', ['nationality', 'profession']] = ['Chinese-American', 'AI Expert']
lex_df0.loc[lex_df0['guest'] == "Kanye 'Ye'", ['nationality', 'profession', 'birth_year']] = ['American', 'Artist', 1977]
lex_df0.loc[lex_df0['guest'] == "Katherine de", ['guest', 'nationality', 'profession', 'birth_year', 'death_year']] = ['Katherine de Kleer', 'American', 'Scientist', 1987, 0]
lex_df0.loc[lex_df0['guest'] == 'Kelsi Sheren', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['Canadian', 'Veteran', 1990, True]
lex_df0.loc[lex_df0['guest'] == 'Keoki Jackson', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Lockheed Martin Executive', 1970, True]
lex_df0.loc[lex_df0['guest'] == 'Kevin Scott', ['nationality', 'profession']] = ['American', 'Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Lee Cronin', ['nationality', 'profession']] = ['British', 'Chemist']
lex_df0.loc[lex_df0['guest'] == 'Jeff Hawkins', ['nationality', 'profession']] = ['British', 'Chemist']
lex_df0.loc[lex_df0['guest'] == 'Liv Boeree', ['nationality', 'profession']] = ['British', 'Poker Player']
lex_df0.loc[lex_df0['guest'] == 'Luís and', ['guest', 'nationality', 'profession', 'birth_year', 'death_year']] = ['Luís and João Batalha', 'Portuguese', "Co-founder", 0, 0]
lex_df0.loc[lex_df0['guest'] == 'Magatte Wade', ['nationality', 'profession']] = ['Senegalese', 'Entrepreneur']
lex_df0.loc[lex_df0['guest'] == 'Matt Botvinick', ['nationality', 'profession', 'birth_year']] = ['American', 'AI Expert', 1968]
lex_df0.loc[lex_df0['guest'] == 'Matt Walker', ['nationality', 'profession']] = ['American', 'Scientist']
lex_df0.loc[lex_df0['guest'] == 'Matthew Cox', ['nationality', 'profession', 'birth_year']] = ['American', 'Former Con Man', 1969]
lex_df0.loc[lex_df0['guest'] == 'Michael Levin', ['nationality', 'profession']] = ['American', 'Professor']
lex_df0.loc[lex_df0['guest'] == 'Michael Malice', ['nationality', 'profession']] = ['American', 'Professor']
lex_df0.loc[lex_df0['guest'] == 'Michael Stevens', ['nationality', 'profession', 'birth_year']] = ['American', 'YouTuber', 1986]
lex_df0.loc[lex_df0['guest'] == 'Michio Kaku', ['nationality', 'profession']] = ['Japanese-American', 'Physicist']
lex_df0.loc[lex_df0['guest'] == 'Natalya Bailey', ['nationality', 'profession']] = ['Russian-American', 'Rocket Propulsion Engineer']
lex_df0.loc[lex_df0['guest'] == 'Neil Adams', ['nationality', 'profession']] = ['British', 'Martial Arts']
lex_df0.loc[lex_df0['guest'] == 'Nic Carter', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Researcher', 1990, True]
lex_df0.loc[lex_df0['guest'] == 'Nicole Perlroth', ['nationality', 'profession', 'birth_year']] = ['American', 'Journalist', 1981]
lex_df0.loc[lex_df0['guest'] == 'Noam Brown', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Researcher', 1988, True]
lex_df0.loc[lex_df0['guest'] == 'Omar Suleiman', ['nationality', 'profession', 'birth_year', 'death_year']] = ['Palestinian-American', 'Imam', 1986, 0]
lex_df0.loc[lex_df0['guest'] == 'Paul Conti', ['nationality', 'profession', 'birth_year']] = ['American', 'Psychiatrist', 1946]
lex_df0.loc[lex_df0['guest'] == 'Paul Goff', ['nationality', 'profession']] = ['British', 'Philosopher']
lex_df0.loc[lex_df0['guest'] == 'Pieter Levels', ['nationality', 'profession', 'birth_year']] = ['Dutch', 'Entrepreneur', 1986]
lex_df0.loc[lex_df0['guest'] == 'Rajat Monga', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['Indian-American', 'Electrical Engineer', 1974, True]
lex_df0.loc[lex_df0['guest'] == 'Ray Kurzweil', ['nationality', 'profession']] = ['American', 'Inventor']
lex_df0.loc[lex_df0['guest'] == 'Risto Miikkulainen', ['nationality', 'profession']] = ['American', 'Entrepreneur']
lex_df0.loc[lex_df0['guest'] == 'Robert Crews', ['nationality', 'profession']] = ['American', 'Historian']
lex_df0.loc[lex_df0['guest'] == 'Robert F.', ['guest', 'nationality', 'profession', 'birth_year', 'death_year']] = ['Robert F. Kennedy Jr', 'American', 'Politician', 1954, 0]
lex_df0.loc[lex_df0['guest'] == 'Robert Playter', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Roboticist', 1970, True]
lex_df0.loc[lex_df0['guest'] == 'Rodney Brooks', ['nationality', 'profession']] = ['American', 'Historian']
lex_df0.loc[lex_df0['guest'] == 'Rohit Prasad', ['nationality', 'profession', 'birth_year']] = ['Indian', 'AI Expert', 1975]
lex_df0.loc[lex_df0['guest'] == 'Ronald Sullivan', ['nationality', 'profession', 'birth_year']] = ['American', 'Professor', 1967]
lex_df0.loc[lex_df0['guest'] == 'Russ Tedrake', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Roboticist', 1977, True]
lex_df0.loc[lex_df0['guest'] == 'Ryan Graves', ['nationality', 'profession']] = ['American', 'Pilot']
lex_df0.loc[lex_df0['guest'] == 'Ryan Schiller', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['American', 'Entrepreneur', 2000, True]
lex_df0.loc[lex_df0['guest'] == 'Saagar Enjeti', ['nationality', 'profession']] = ['Indian-American', 'Political Commentator']
lex_df0.loc[lex_df0['guest'] == 'Sean Carroll', ['nationality', 'profession', 'birth_year', 'death_year']] = ['American', 'Physicist', 1954, 0]
lex_df0.loc[lex_df0['guest'] == 'Sean Kelly', ['nationality', 'profession']] = ['American', 'Philosopher']
lex_df0.loc[lex_df0['guest'] == 'Sergey Levine', ['nationality', 'profession', 'birth_year']] = ['Russian-American', 'Professor',1987]
lex_df0.loc[lex_df0['guest'] == 'Sergey Nazarov', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['Russian-American', 'Co-Founder', 1988, True]
lex_df0.loc[lex_df0['guest'] == 'Sertac Karaman', ['nationality', 'profession', 'birth_year', 'birth_year_estimated']] = ['Turkish-American', 'Professor', 1985, True]
lex_df0.loc[lex_df0['guest'] == 'Skye Fitzgerald', ['nationality', 'profession']] = ['American', 'Director']
lex_df0.loc[lex_df0['guest'] == 'Stephen Kotkin', ['nationality', 'profession']] = ['American', 'Historian']
lex_df0.loc[lex_df0['guest'] == 'Stephen Pressfield', ['nationality', 'profession']] = ['American', 'Author']
lex_df0.loc[lex_df0['guest'] == 'Stuart Russell', ['nationality', 'profession']] = ['American', 'Professor']
lex_df0.loc[lex_df0['guest'] == 'Teddy Atlas', ['nationality', 'profession']] = ['American', 'Martial Arts']
lex_df0.loc[lex_df0['guest'] == 'Tim Dillon', ['nationality', 'profession']] = ['American', 'Comedian']
lex_df0.loc[lex_df0['guest'] == 'Todd Howard', ['nationality', 'profession']] = ['American', 'Designer']
lex_df0.loc[lex_df0['guest'] == 'Tony Fadell', ['nationality', 'profession']] = ['American', 'Engineer']
lex_df0.loc[lex_df0['guest'] == 'Vijay Kumar', ['nationality', 'profession']] = ['Indian-American', 'Roboticist']
lex_df0.loc[lex_df0['guest'] == 'Walter Isaacson', ['nationality', 'profession']] = ['American', 'Author']
lex_df0.loc[lex_df0['guest'] == 'Whitney Cummings', ['nationality', 'profession']] = ['American', 'Comedian']
lex_df0.loc[lex_df0['guest'].isin(['Yann LeCun', 'Yann Lecun']), ['nationality', 'profession']] = ['French-American', 'AI Expert']
lex_df0.loc[lex_df0['guest'] == 'Yuval Noah', ['nationality', 'profession']] = ['Israeli', 'Historian']
lex_df0.loc[lex_df0['guest'] == 'Zev Weinstein', ['nationality', 'profession', 'birth_year']] = ['American', 'Philosopher', 2000]
lex_df0.loc[lex_df0['guest'] == 'Nationalism Debate', ['guest', 'nationality', 'profession']] = ['Yaron Brook and Yoram Hazony', 'Israeli', 'Philosopher']
lex_df0.loc[lex_df0['guest'] == 'Donald Hoffman', ['nationality', 'profession', 'birth_year', 'death_year']] = ['American', 'Psychologist', 1955, 0]
lex_df0.loc[lex_df0['guest'] == 'Brett Johnson', ['nationality', 'profession', 'birth_year', 'death_year']] = ['American', 'Consultant', 1970, 0]
lex_df0.loc[lex_df0['guest'] == 'Daniel Kahneman', ['profession', 'birth_year', 'death_year']] = ['Psychologist', 1934, 2024]
lex_df0.loc[lex_df0['guest'] == 'Barry Barish', ['profession']] = ['Physicist']
lex_df0.loc[lex_df0['guest'] == 'Vladimir Vapnik', ['profession']] = ['Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Richard Karp', ['profession']] = ['Computer Scientist']
lex_df0.loc[lex_df0['guest'] == 'Yannis Pappas', ['profession', 'birth_year', 'death_year']] = ['Comedian', 1976, 0]
lex_df0.loc[lex_df0['guest'] == 'James Sexton', ['profession', 'birth_year', 'death_year']] = ['Attorney', 1972, 0]
lex_df0.loc[lex_df0['guest'] == 'Jared Kushner', ['profession']] = ['Businessperson']
lex_df0.loc[lex_df0['guest'] == 'Jocko Willink', ['profession']] = ['Author']
lex_df0.loc[lex_df0['guest'] == 'Richard Wolff', ['nationality', 'profession']] = ['American', 'Economist']
lex_df0.loc[lex_df0['guest'] == 'Elon Musk', ['nationality', 'profession']] = ['American-South African', 'Entrepreneur']
lex_df0.loc[lex_df0['guest'] == 'Garry Nolan', 'birth_year'] = 1961
lex_df0.loc[lex_df0['guest'] == 'Rana el', ['guest', 'nationality', 'profession', 'birth_year']] = ['Rana el Kaliouby', 'Egyptian-American', 'AI Expert', 1978]
lex_df0.loc[lex_df0['guest'] == 'Coffeezilla', ['profession', 'birth_year']] = ['YouTuber', 1985]
lex_df0.loc[lex_df0['guest'] == 'Yaron Brook', ['profession', 'birth_year']] = ['Writer', 1961]
lex_df0.loc[lex_df0['guest'] == 'Andrew Bustamante', ['nationality', 'profession', 'birth_year']] = ['American', 'Former Intelligence Officer', 1981]
lex_df0.loc[lex_df0['guest'] == 'Jordan Jonas', ['nationality', 'profession', 'birth_year', 'death_year']] = ['American', 'Survival Expert', 1976, 0]
lex_df0.loc[lex_df0['guest'] == 'Shannon Curry', ['birth_year', 'death_year']] = [1986, 0]
lex_df0.loc[lex_df0['guest'] == 'Oriol Vinyals', ['birth_year', 'death_year']] = [1983, 0]
lex_df0.loc[lex_df0['guest'] == 'Philip Goff', ['nationality', 'profession', 'birth_year', 'death_year', 'birth_year_estimated']] = ['British', 'Philosopher', 1980, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Georges St-Pierre,', ['guest', 'nationality', 'profession', 'birth_year', 'death_year']] = ['Georges St-Pierre', 'Canadian', 'Martial Arts', 1981, 0]
lex_df0.loc[lex_df0['guest'] == 'Alex Gladstein', ['birth_year', 'death_year']] = [1984, 0]
lex_df0.loc[lex_df0['guest'] == 'Tuomas Sandholm', ['birth_year', 'death_year']] = [1969, 0]
lex_df0.loc[lex_df0['guest'] == 'Roger Reaves', ['birth_year', 'death_year']] = [1944, 0]
lex_df0.loc[lex_df0['guest'] == 'John Abramson', ['birth_year', 'death_year']] = [1945, 0]
lex_df0.loc[lex_df0['guest'] == 'Niels Jorgensen', ['birth_year', 'death_year']] = [1991, 0]
lex_df0.loc[lex_df0['guest'] == 'Michael Kearns', ['nationality', 'profession', 'birth_year', 'death_year', 'birth_year_estimated']] = ['American', 'Computer Scientist', 1963, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Gavin Miller', ['nationality', 'profession', 'birth_year', 'death_year', 'birth_year_estimated']] = ['British', 'Computer Scientist', 1960, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Jonathan Reisman', ['birth_year', 'death_year', 'birth_year_estimated']] = [1980, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Peter Wang', ['birth_year', 'death_year', 'birth_year_estimated']] = [1975, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Steve Viscelli', ['nationality', 'profession', 'birth_year', 'death_year', 'birth_year_estimated']] = ['American', 'Sociologist', 1975, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Robert Breedlove', ['nationality', 'profession', 'birth_year', 'death_year', 'birth_year_estimated']] = ['American', 'Entrepreneur', 1985, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Richard Craib', ['nationality', 'profession', 'birth_year', 'death_year', 'birth_year_estimated']] = ['South African', 'Entrepreneur', 1988, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Diana Walsh', ['birth_year', 'death_year', 'birth_year_estimated']] = [1970, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Dmitri Dolgov', ['birth_year', 'death_year', 'birth_year_estimated']] = [1978, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Paola Arlotta', ['birth_year', 'death_year', 'birth_year_estimated']] = [1972, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Robert Proctor', ['nationality', 'profession', 'birth_year', 'death_year']] = ['American', 'Historian', 1954, 0]
lex_df0.loc[lex_df0['guest'] == 'Michael I.', ['guest', 'nationality', 'profession']] = ['Michael I. Jordan', 'American', 'Professor']
lex_df0.loc[lex_df0['guest'] == 'Paul Rosolie', 'birth_year'] = 1988
lex_df0.loc[lex_df0['guest'] == 'Tucker Carlson', 'birth_year'] = 1969
lex_df0.loc[lex_df0['guest'] == 'Richard Haier', 'birth_year'] = 1945
lex_df0.loc[lex_df0['guest'] == 'Jay Bhattacharya', 'birth_year'] = 1968
lex_df0.loc[lex_df0['guest'] == 'Michael Mina', 'birth_year'] = 1969
lex_df0.loc[lex_df0['guest'] == 'Rob Reid', 'birth_year'] = 1966
lex_df0.loc[lex_df0['guest'] == 'Travis Stevens', 'birth_year'] = 1986
lex_df0.loc[lex_df0['guest'] == 'Sheldon Solomon', 'birth_year'] = 1951
lex_df0.loc[lex_df0['guest'] == 'Ian Goodfellow', 'birth_year'] = 1985
lex_df0.loc[lex_df0['guest'] == 'Grimes', 'birth_year'] = 1988
lex_df0.loc[lex_df0['guest'] == 'GothamChess', 'guest'] = 'GothamChess (Levy Rozman)'
lex_df0.loc[lex_df0['guest'] == 'David Fravor', ['nationality', 'profession', 'birth_year', 'death_year', 'birth_year_estimated']] = ['American', 'Pilot', 1965, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Jimmy Wales', ['nationality', 'profession', 'birth_year', 'death_year']] = ['American', 'Entrepreneur', 1966, 0]
lex_df0.loc[lex_df0['guest'] == 'Kimbal Musk', ['nationality', 'profession', 'birth_year', 'death_year']] = ['American-South African', 'Entrepreneur', 1972, 0]
lex_df0.loc[lex_df0['guest'] == 'Tim Urban', ['nationality', 'profession', 'birth_year', 'death_year', 'birth_year_estimated']] = ['American', 'Writer', 1982, 0, True]
lex_df0.loc[lex_df0['guest'] == 'Donald Trump', 'profession'] = 'President'
lex_df0.loc[lex_df0['guest'] == 'Climate Change', ['guest', 'nationality', 'profession']] = ['Bjørn Lomborg and Andrew Revkin', 'Mixed', 'Mixed']
lex_df0.loc[lex_df0['guest'] == 'Rick Spence', ['nationality', 'profession']] = ['American', 'Historian']
lex_df0.loc[lex_df0['guest'] == 'Stuart Russell', ['birth_year', 'death_year']] = [1962, 0]


In [251]:
# Convert the professions to title case
lex_df0['profession'] = lex_df0['profession'].str.strip().str.title()

In [252]:
# Re-arrange the dataframe columns, and drop redundant columns, I'll drop the caption text column
lex_df0 = lex_df0[['yt_url', 'number', 'guest', 'nationality', 'profession', 'birth_year', 'death_year', 'birth_year_estimated', 'summary', 'description', 'upload_date', 'duration', 'duration_minutes', 'views', 'likes', 'comments_count', 
                 'tags', 'top_five_words', 'favorite_count', 'region_restriction', 'thumbnail_url', 'captions_availability']]

In [253]:
# Export the data frame into a csv file
lex_df0.to_csv('lex fridman podcast episodes.csv', encoding='utf-8-sig')