# Initialization

Retrieve all JRE episodes and save the episodes titles, descriptions and dates. Guest names can be extracted from the description using Named Entity Recognition. The hope is that this approach can be generalized to all podcasts that contain descriptions for their episodes.

In [1]:
# Imports
import requests
import base64
import pandas as pd
import time
import json
import numpy as np
from multiprocessing import Pool
import pickle

In [2]:
# Show retriever Class
class ShowRetriever:
    def __init__(self):
        self.show_id = None
        self.client_id = "6dae08f3c799496fad2adfe2657634a7"
        self.client_secret = "2afb39d7abb84e7fa957c2ec76b8e6f4"
        self.auth_response = None
        self.auth_header = base64.b64encode(f'{self.client_id}:{self.client_secret}'.encode()).decode('utf-8')
        self.headers = {
            'Authorization': f'Basic {self.auth_header}'
        }
        self.payload =  payload = {
            'grant_type': 'client_credentials'
        }
        self.descs = []
        self.titles = []
        self.dates = []
        
    #url = f'https://api.spotify.com/v1/shows/{show_id}/episodes?offset=1&limit=20'
        
    def authenticate(self):
        
       
        response = requests.post('https://accounts.spotify.com/api/token', data=self.payload, headers=self.headers)
        
        self.auth_response = response

    def retrieve_episodes(self, show, offset, limit):
        
        while self.auth_response.status_code != 200:
            self.authenticate()
            time.sleep(0.1)
            
    
        url = f'https://api.spotify.com/v1/shows/{show}/episodes?offset={offset}&limit={limit}'
        access_token = self.auth_response.json()['access_token']
        #print(access_token)
        headers = {
            'Authorization': f'Bearer {access_token}'
        }
        response = requests.get(url, headers=headers)
    
        if response.status_code == 200:
            show_data = response.json()
            for episode in show_data["items"]:
                if type(episode) is not type(None):
                    self.descs.append(episode['description'])
                    self.titles.append(episode['name'])
                    self.dates.append(episode['release_date'])
            
        else:
            print(self.show_id)
            print(f"{self.show_id} gives {response}")
            
        return response
    def retrieve_all(self):
        offset=0
        limit=50
        response = self.retrieve_episodes(self.show_id, offset, limit)
        while response.status_code != 404:
            offset += limit
            response = self.retrieve_episodes(self.show_id, offset, limit)
            

## Data retrieval for JRE
Using the ShowRetriever class along with the JRE podcast ID. Retrieval continues until receiving a response 404 from the API

In [3]:
SR = ShowRetriever()
SR.authenticate()
SR.show_id = '4rOoJ6Egrf8K2IrywzwOMk'
SR.retrieve_all()

4rOoJ6Egrf8K2IrywzwOMk
4rOoJ6Egrf8K2IrywzwOMk gives <Response [404]>


In [4]:
len(SR.titles)

2223

## Initiailizing HuggingFace NER
Download and setup a pretrained NER model from huggingface, along with a small test. (understand the warning and determine if it has an implication for our purposes)

In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "Our names are Wolfgang Weibhauer and Max Schreck, and we live in Berlin"

ner_results = nlp(example)
print(ner_results)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'B-PER', 'score': 0.9994892, 'index': 4, 'word': 'Wolfgang', 'start': 14, 'end': 22}, {'entity': 'I-PER', 'score': 0.99953365, 'index': 5, 'word': 'Wei', 'start': 23, 'end': 26}, {'entity': 'I-PER', 'score': 0.9209704, 'index': 7, 'word': '##uer', 'start': 29, 'end': 32}, {'entity': 'B-PER', 'score': 0.99950886, 'index': 9, 'word': 'Max', 'start': 37, 'end': 40}, {'entity': 'I-PER', 'score': 0.99947387, 'index': 10, 'word': 'Sc', 'start': 41, 'end': 43}, {'entity': 'I-PER', 'score': 0.9962214, 'index': 11, 'word': '##hr', 'start': 43, 'end': 45}, {'entity': 'I-PER', 'score': 0.9785665, 'index': 12, 'word': '##eck', 'start': 45, 'end': 48}, {'entity': 'B-LOC', 'score': 0.9985958, 'index': 18, 'word': 'Berlin', 'start': 65, 'end': 71}]


## Running the NER on the descriptions

Descriptions stored in the ShowRetriever object as SR.descs are processed with NER, the model results being stored in a list. Multiprocessing used to speed up this process. (Only checks for the word 'podcast' existing in the desc, might need some other, better checks, or simply perform on all descs and lookup names in a list of podcasters)

In [6]:
# Function to run NLP/NER only if the word Podcast exists in the description
def ner_on_descs(desc):
    result = nlp(desc)
    return result

In [7]:
# Parallel processing
if __name__ == '__main__':
    # Number of processes to be used (you can adjust this based on your system)
    num_processes = 7

    # Create a Pool of processes
    with Pool(num_processes) as pool:
        # Use pool.map to apply the function to each item in the list
        results = pool.map(ner_on_descs, SR.descs)

    # Results will contain the processed data in the same order as the input list
    print(results)



## Extracting Name Strings from NER Results
Function that takes in one NER result (list of dicts) and returns the names as a list of strings. Multiprocessing again.

In [8]:
def name_extraction_from_ner(ner_results):
    output = ner_results
    
    person_names = []
    current_name = ""
    if output != '':
        for i, token in enumerate(output):
            if token['entity'] in ['B-PER', 'I-PER'] and not token['word'].startswith('##'):
                current_name += ' '
            if token["entity"] in ["B-PER", "I-PER"]:
                current_name += token['word'].replace('##', '')
        
            if i == len(output) - 1 or (i + 1 < len(output) and output[i + 1]['entity'] not in ['I-PER', 'B-PER']):
                if current_name:
                    person_names.append(current_name.strip())  # Strip leading and trailing spaces before appending
                    current_name = ""
        
    #print("Extracted Person Names:", person_names)
    return person_names




In [9]:
if __name__ == '__main__':
    # Number of processes to be used (you can adjust this based on your system)
    num_processes = 7

    # Create a Pool of processes
    with Pool(num_processes) as pool:
        # Use pool.map to apply the function to each item in the list
        names = pool.map(name_extraction_from_ner, results)

    # Results will contain the processed data in the same order as the input list
    print(names)

[['Dave Smith'], ['Ralph Barbosa Ralph Barbosa'], ['David Grusch'], ['Mike Baker'], ['Dwayne " The Rock " Johnson'], ['Will Harris', 'Jenni Harris', 'Will'], ['Whitney Cummings'], ['Gary Brecka'], ['Adam Greentree'], ['Elliott West'], ['Dale Brisby', 'DaleBrisby'], ['David Blaine'], ['Tim Kennedy'], ['Elon Musk', 'elon'], ['Danny Michael Philippou'], ['Shane Dorian'], ['Graham Hancock', 'cock'], ['Joe Dan Henderson Hendo', 'n heon'], ['Joe Bryan Callen Brendan Schaub Eddie Bravo Bryan Callen', 'Bryan Callen', 'Brendan Schaub', 'Eddie Bravo', 'Eddie Bravo'], ['Ehsan Ahmad', 'ehjah'], ['Coleman Hughes', 'Coleman Coleman', 'co'], ['Reggie Watts'], ['Brian C Muraresku'], ['Brian Redban'], ['Jimmy Carr Carr', 'ji'], ['Sam Altman'], ['Joe Bernard Hopkins'], ['Francis Foster Konstantin Kisin'], ['Joe List Mark Normand'], ['Joe', "Sugar Sean O ' Malley Tim Welch"], ['Steve Strope'], ['Eddie Bravo', 'Eddie Bravo'], ['Michael Easter', 'ha'], ['Francis Ngannou', 'Ngannou Tyson Fury'], ['Trae Tha 

## Find podcasts IDs by searching the spotify API using names
The name format that works in a search query such that the first result is the correct podcast is "FirstName+LastName". Yet again, multiprocess.
(Could be integrated into the ShowRetriever object, a lot of redundant stuff with the headers and IDs and secrets down here.) 
Using a horrible time.sleep to get around rate limiting.

In [103]:
with open("podcasters.pkl", "rb") as f:
    podcasters = pickle.load(f)

filter = [ x for x in names if len(x)>0]
single = [x[0] for x in filter]

from fuzzywuzzy import process
import fuzzywuzzy

def find_closest_match(query_name,name_list=podcasters, threshold=100):
    """
    Find the closest match to 'query_name' in 'name_list'.
    
    :param name_list: List of names to search.
    :param query_name: Name to search for.
    :param threshold: The minimum score for a match (0-100, where 100 is an exact match).
    :return: Closest match name and its score, or None if no match above the threshold.
    """
    # Use the 'extractOne' method to find the closest match
    if query_name is not None:
        best_match = process.extractOne(query_name, name_list, score_cutoff=threshold,scorer= fuzzywuzzy.fuzz.token_set_ratio)
        if best_match:
            return best_match[0]
    else:
        return None


In [104]:
if __name__ == '__main__':
    # Number of processes to be used (you can adjust this based on your system)
    num_processes = 7

    # Create a Pool of processes
    with Pool(num_processes) as pool:
        # Use pool.map to apply the function to each item in the list
        matches = pool.map(find_closest_match, single)
        #matches = list(set([x for x in matches if x is not None]))
        matches =  [s.split(' (')[0] if s is not None else '' for s in matches]

    # Results will contain the processed data in the same order as the input list
    print(matches)

['Dave Smith', '', '', '', '', '', 'Whitney Cummings', '', '', '', '', '', '', '', '', '', '', '', 'Eddie Bravo', '', 'Coleman Hughes', '', '', '', '', '', '', '', 'Mark Normand', 'Joe Budden', '', 'Eddie Bravo', '', '', '', '', 'Kurt Angle', '', '', '', 'Bryan Callen', '', '', '', '', '', '', '', 'Dave Smith', '', 'Brian Keating', '', '', '', '', 'Tim Dillon', '', 'Eddie Bravo', 'Bryan Callen', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Tom Segura', '', '', 'Amanda Rossmann', '', '', '', '', '', '', '', '', 'Theo Von', '', '', '', 'Shane Gillis', 'Bert Kreischer', '', '', '', '', 'Jack Carr', '', '', '', '', '', '', '', 'Ms. Pat', 'Dave Smith', '', '', '', 'Joey Diaz', '', '', '', '', '', '', '', 'Big Jay Oakerson', 'David Choe', '', '', '', '', 'Andrew Schulz', '', '', '', 'Shane Gillis', '', '', 'Bert Kreischer', '', 'Michael Malice', '', '', '', '', '', 'Shane Gillis', 'Eric Weinstein', '', '', '', '', 'Matt Taibbi', 'Eddie Bravo', 'Joe Budden', '', '', '', '', 'Kyle 

In [124]:
def show_search_by_name(name):
    
    time.sleep(0.1)
    show_id = None
    show_name = None
    if len(name) > 0:
        search_query = name.replace(' ', '%2B')
    else:
        return (show_id, show_name)
    client_id='6dae08f3c799496fad2adfe2657634a7'
    client_secret='2afb39d7abb84e7fa957c2ec76b8e6f4'
    
    #show_id = '4rOoJ6Egrf8K2IrywzwOMk'
    url = f'https://api.spotify.com/v1/search?q={search_query}&type=show&market=US&limit=10'
    #url = 'https://api.spotify.com/v1/search?q=JRE&type=show'
    auth_header = base64.b64encode(f'{client_id}:{client_secret}'.encode()).decode('utf-8')
    
    headers = {
        'Authorization': f'Basic {auth_header}'
    }
    
    payload = {
        'grant_type': 'client_credentials'
    }
    response = requests.post('https://accounts.spotify.com/api/token', data=payload, headers=headers)
    
    if response.status_code == 200:
        access_token = response.json()['access_token']
        
        headers = {
            'Authorization': f'Bearer {access_token}'
        }
        
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            show_data = response.json()
            #print(search_query)
           # print(show_data)
            if len(show_data['shows']['items']) > 0:
                #print(len(show_data['shows']['items']))
                try:
                    show_id = show_data['shows']['items'][0]['id']
                    show_name = show_data['shows']['items'][0]['name']
                except TypeError:
                    show_id = None
                    show_name = None
        else:
            print(f"Error: Unable to retrieve show information. Status code {response.status_code}")
    else:
        print(f"Error: Unable to retrieve access token. Status code {response.status_code}")
        
    return (show_id, show_name)
    

In [60]:
if __name__ == '__main__':
    # Number of processes to be used (you can adjust this based on your system)
    num_processes = 7

    # Create a Pool of processes
    with Pool(num_processes) as pool:
        # Use pool.map to apply the function to each item in the list
        jre_guests_show_ids = pool.map(show_search_by_name, matches)

    # Results will contain the processed data in the same order as the input list
    print(jre_guests_show_ids)

[('41EUjkv3PfnDXFkVxTJpmz', 'Dave Smith Dharma'), (None, None), (None, None), (None, None), (None, None), (None, None), ('7AgQSpalNK5OwhQjWFmAEr', 'Good For You'), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), ('7kXXC0JwHsnYyiMl3c5Vfb', 'Look Into It - with Eddie Bravo'), (None, None), ('2Xa3GbllxlSnTXarA2HsyK', 'The AUXORO Podcast'), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), ('3dFVX2maIjW62sg2nwQywk', 'All Over The Road with Mark Normand'), ('7G8X3uikqwAmvr92MHFZO6', 'The Joe Budden Podcast'), (None, None), ('7kXXC0JwHsnYyiMl3c5Vfb', 'Look Into It - with Eddie Bravo'), (None, None), (None, None), (None, None), (None, None), ('5lcBcZbI1JO3BASz163jjh', 'The Kurt Angle Show'), (None, None), (None, None), (None, None), ('6M2O0tltPbenrncrUXPyxf', 'Off Limits w/ Bryan Callen'), (None, None), (None, None), (None, None), (None, Non

In [71]:
jre_guests_show_names = np.transpose(list(jre_guests_show_ids))[1]
jre_guests_show_ids = np.transpose(list(jre_guests_show_ids))[0]

## Everything in a DF

In [15]:
df = pd.DataFrame(list(zip(SR.titles, SR.descs, SR.dates, names, jre_guests_show_ids)), columns=['Title', 'Description', 'Date', 'PodcastHosts', 'GuestShowSpotifyId'])

In [16]:
df

Unnamed: 0,Title,Description,Date,PodcastHosts,GuestShowSpotifyId
0,#2067 - Dave Smith,"Dave Smith is a stand-up comedian, libertarian...",2023-11-23,[Dave Smith],41EUjkv3PfnDXFkVxTJpmz
1,#2066 - Ralph Barbosa,Ralph Barbosa is a stand-up comic. His new com...,2023-11-22,[Ralph Barbosa Ralph Barbosa],
2,#2065 - David Grusch,David Grusch is a former Air Force intelligenc...,2023-11-21,[David Grusch],
3,#2064 - Mike Baker,Mike Baker is a former CIA covert operations o...,2023-11-16,[Mike Baker],
4,#2063 - The Rock,"Dwayne ""The Rock"" Johnson is an actor, produce...",2023-11-15,"[Dwayne "" The Rock "" Johnson]",
...,...,...,...,...,...
2204,#18 - Brian Redban,Joe sits down with Brian Redban.,2010-04-27,[Joe Brian Redban],7D2LqJIEcKV1sRss3DMOMH
2205,"#17 - Ari Shaffir, Eddie Bravo","Joe sits down with Ari Shaffir, and Eddie Bravo.",2010-04-21,[Joe Ari Shaffir Eddie Bravo],7D2LqJIEcKV1sRss3DMOMH
2206,#16 - Brian Redban,Joe sits down with Brian Redban.,2010-04-13,[Joe Brian Redban],7D2LqJIEcKV1sRss3DMOMH
2207,#15 - Brian Redban,Joe sits down with Brian Redban.,2010-04-07,[Joe Brian Redban],


In [26]:
df.to_pickle("JRE_DF.pkl")

In [5]:
df = pd.read_pickle("JRE_DF.pkl")

In [167]:
from queue import Queue
processed_ids = set()  # Initialize an empty set to keep track of processed IDs
final_df = pd.DataFrame(columns=['OriginShow', 'Showname', 'PodcastHosts', 'GuestShowSpotifyId'])


def process_single_id(id_number, showname, queue):
    if id_number in processed_ids and id_number is not None:
        return  # Skip if the ID has already been processed

    SR = ShowRetriever()
    SR.authenticate()
    SR.show_id = id_number
    SR.retrieve_all()
    
    num_processes = 7

    with Pool(num_processes) as pool:
        results = pool.map(ner_on_descs, SR.descs)

    with Pool(num_processes) as pool:
        names = pool.map(name_extraction_from_ner, results)

    guests_and_dates = pd.DataFrame({'Guest':names, 'Dates':SR.dates})
    print(guests_and_dates)
    
    filter = [ x for x in names if len(x)>0]
    single = [x[0] for x in filter]
    with Pool(num_processes) as pool:
        # Use pool.map to apply the function to each item in the list
        matches = pool.map(find_closest_match, single)
        matches = list(set([x for x in matches if x is not None]))
        matches =  [s.split(' (')[0] for s in matches if s is not None]
        #print(matches)

    with Pool(num_processes) as pool:
        search_results = pool.map(show_search_by_name, matches)
        guest_show_names = np.transpose(list(search_results))[1]
        guest_show_ids = np.transpose(list(search_results))[0]

    showname_list = [showname for _ in range(len(show_ids))]
    temp_df = pd.DataFrame(list(zip(showname_list, guest_show_names, matches, guest_show_ids, guests_and_dates)),
                           columns=['OriginShow', 'Showname', 'PodcastHosts', 'GuestShowSpotifyId', 'guests_and_dates'])
    print(temp_df)
    global final_df
    final_df = pd.concat([final_df, temp_df], ignore_index=True)

    processed_ids.add(id_number)  # Add the current ID to the set of processed IDs, as to not use the same ID twice
    
    for new_id, new_showname in zip(guest_show_ids, guest_show_names): # Recursion
        if new_id is None:
            continue
        else:
            queue.put((new_id, new_showname))

def ids_to_descs(ids: list, start_podcast: str):
    queue = Queue()
    
    # Add initial IDs to the queue
    for id_number in ids:
        if id_number is not None:
            queue.put((id_number, start_podcast))
    print(queue)
    while not queue.empty():
        id_number, showname = queue.get()
        print("QUEUEGET",id_number, showname)
        process_single_id(id_number, showname, queue)
        

In [168]:
#podcast_ids = [x for x in list(df['GuestShowSpotifyId']) if x is not None]
podcast_ids = jre_guests_show_ids

In [169]:
podcast_ids

array(['41EUjkv3PfnDXFkVxTJpmz', None, None, ...,
       '7D2LqJIEcKV1sRss3DMOMH', None, None], dtype=object)

In [None]:
ids_to_descs(podcast_ids, "The Joe Rogan Experience")

<queue.Queue object at 0x7fb94f9ea850>
QUEUEGET 41EUjkv3PfnDXFkVxTJpmz The Joe Rogan Experience
41EUjkv3PfnDXFkVxTJpmz
41EUjkv3PfnDXFkVxTJpmz gives <Response [404]>


In [155]:
final_df

Unnamed: 0,OriginShow,Showname,PodcastHosts,GuestShowSpotifyId,guests_and_dates
0,The Joe Rogan Experience,Fat Man Beyond,Kevin Smith,0BKAnu976g3EJrovyyg5Gk,Guest
1,The Joe Rogan Experience,The Dollop with Dave Anthony and Gareth Reynolds,Dave Anthony,3m9c2ibJOYiigvVuswYSgU,Dates
2,The Joe Rogan Experience,Rob. Ryan. Red.,Rob McElhenney,2wakNmhip6sG7n152xTbwg,Guest
3,The Joe Rogan Experience,Your Mom's House with Christina P. and Tom Segura,Christina Pazsitzky,7i59GubTw3CcNy9M6m7DTX,Dates


In [166]:
final_df['guests_and_dates'][0]

'Guest'

In [13]:
#### import pandas as pd

# Example sub-tables (DataFrames)
df1 = pd.DataFrame({'A': [1, 2,3,4], 'B': [3, 4,3,4], "C": [1,2,3,4]})
df2 = pd.DataFrame({'C': [5, 6], 'D': [7, 8]})
df3 = pd.DataFrame({'E': [9, 10], 'F': [11, 12]})

# Main DataFrame
main_df = pd.DataFrame({
    'Column1': [df1, df2],
    'Column2': [df3, None]  # You can also have None for empty cells
})

# Accessing a sub-table
sub_table = main_df.at[0, 'Column1']  # Accesses the DataFrame in cell (0, 'Column1')

In [16]:
main_df

Unnamed: 0,Column1,Column2
0,A B C 0 1 3 1 1 2 4 2 2 3 3 3 3 ...,E F 0 9 11 1 10 12
1,C D 0 5 7 1 6 8,


In [14]:
sub_table

Unnamed: 0,A,B,C
0,1,3,1
1,2,4,2
2,3,3,3
3,4,4,4


In [15]:
main_df['Column1'][0]

Unnamed: 0,A,B,C
0,1,3,1
1,2,4,2
2,3,3,3
3,4,4,4


In [17]:
import pandas as pd

# Target DataFrame
df1 = pd.DataFrame({'ID': [1, 2, 3], 'Description': ['Desc1', 'Desc2', 'Desc3']})

# DataFrame to be inserted as a cell value
df2 = pd.DataFrame({'Value': [10, 20, 30], 'Count': [100, 200, 300]})

# Attempt to insert df2 into a column in df1
df1['NewColumn'] = df2  # This line will trigger the error

# Display the result
print(df1)


ValueError: Cannot set a DataFrame with multiple columns to the single column NewColumn