# Initialization

Retrieve all JRE episodes and save the episodes titles, descriptions and dates. Guest names can be extracted from the description using Named Entity Recognition. The hope is that this approach can be generalized to all podcasts that contain descriptions for their episodes.

In [1]:
# Imports
import requests
import base64
import pandas as pd
import time
import json

from multiprocessing import Pool

In [2]:
# Show retriever Class
class ShowRetriever:
    def __init__(self):
        self.show_id = None
        self.client_id = "e72a0925bf1a4699a4ed62a8ee3593ec"
        self.client_secret = "6b24e502fc694859805181ec4819d9df"
        self.auth_response = None
        self.auth_header = base64.b64encode(f'{self.client_id}:{self.client_secret}'.encode()).decode('utf-8')
        self.headers = {
            'Authorization': f'Basic {self.auth_header}'
        }
        self.payload =  payload = {
            'grant_type': 'client_credentials'
        }
        self.descs = []
        self.titles = []
        self.dates = []
        
    #url = f'https://api.spotify.com/v1/shows/{show_id}/episodes?offset=1&limit=20'
        
    def authenticate(self):
        
       
        response = requests.post('https://accounts.spotify.com/api/token', data=self.payload, headers=self.headers)
        
        self.auth_response = response

    def retrieve_episodes(self, show, offset, limit):
        
        while self.auth_response.status_code != 200:
            self.authenticate()
            time.sleep(2)
            
    
        url = f'https://api.spotify.com/v1/shows/{show}/episodes?offset={offset}&limit={limit}'
        access_token = self.auth_response.json()['access_token']
        #print(access_token)
        headers = {
            'Authorization': f'Bearer {access_token}'
        }
        response = requests.get(url, headers=headers)
    
        if response.status_code == 200:
            show_data = response.json()
            for episode in show_data["items"]:
                self.descs.append(episode['description'])
                self.titles.append(episode['name'])
                self.dates.append(episode['release_date'])
            
        else:
            print(response)
        return response
    def retrieve_all(self):
        offset=0
        limit=50
        response = self.retrieve_episodes(self.show_id, offset, limit)
        while response.status_code != 404:
            offset += limit
            response = self.retrieve_episodes(self.show_id, offset, limit)
            

## Data retrieval for JRE
Using the ShowRetriever class along with the JRE podcast ID. Retrieval continues until receiving a response 404 from the API

In [4]:
SR = ShowRetriever()
SR.authenticate()
SR.show_id = '4rOoJ6Egrf8K2IrywzwOMk'
SR.retrieve_all()

<Response [404]>


## Initiailizing HuggingFace NER
Download and setup a pretrained NER model from huggingface, along with a small test. (understand the warning and determine if it has an implication for our purposes)

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "Our names are Wolfgang Weibhauer and Max Schreck, and we live in Berlin"

ner_results = nlp(example)
print(ner_results)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'B-PER', 'score': 0.9994892, 'index': 4, 'word': 'Wolfgang', 'start': 14, 'end': 22}, {'entity': 'I-PER', 'score': 0.99953365, 'index': 5, 'word': 'Wei', 'start': 23, 'end': 26}, {'entity': 'I-PER', 'score': 0.9209705, 'index': 7, 'word': '##uer', 'start': 29, 'end': 32}, {'entity': 'B-PER', 'score': 0.99950886, 'index': 9, 'word': 'Max', 'start': 37, 'end': 40}, {'entity': 'I-PER', 'score': 0.99947387, 'index': 10, 'word': 'Sc', 'start': 41, 'end': 43}, {'entity': 'I-PER', 'score': 0.9962214, 'index': 11, 'word': '##hr', 'start': 43, 'end': 45}, {'entity': 'I-PER', 'score': 0.97856665, 'index': 12, 'word': '##eck', 'start': 45, 'end': 48}, {'entity': 'B-LOC', 'score': 0.9985958, 'index': 18, 'word': 'Berlin', 'start': 65, 'end': 71}]


## Running the NER on the descriptions

Descriptions stored in the ShowRetriever object as SR.descs are processed with NER, the model results being stored in a list. Multiprocessing used to speed up this process. (Only checks for the word 'podcast' existing in the desc, might need some other, better checks, or simply perform on all descs and lookup names in a list of podcasters)

In [4]:
# Function to run NLP/NER only if the word Podcast exists in the description
def ner_on_descs(desc):
    if 'podcast' in desc:
        result = nlp(desc)
    else:
        result = ''
    return result

In [5]:
# Parallel processing
if __name__ == '__main__':
    # Number of processes to be used (you can adjust this based on your system)
    num_processes = 7

    # Create a Pool of processes
    with Pool(num_processes) as pool:
        # Use pool.map to apply the function to each item in the list
        results = pool.map(ner_on_descs, SR.descs)

    # Results will contain the processed data in the same order as the input list
    print(results)

NameError: name 'SR' is not defined

## Extracting Name Strings from NER Results
Function that takes in one NER result (list of dicts) and returns the names as a list of strings. Multiprocessing again.

In [5]:
def name_extraction_from_ner(ner_results):
    output = ner_results
    
    person_names = []
    current_name = ""
    if output != '':
        for i, token in enumerate(output):
            if token['entity'] in ['B-PER', 'I-PER'] and not token['word'].startswith('##'):
                current_name += ' '
            if token["entity"] in ["B-PER", "I-PER"]:
                current_name += token['word'].replace('##', '')
        
            if i == len(output) - 1 or (i + 1 < len(output) and output[i + 1]['entity'] not in ['I-PER', 'B-PER']):
                if current_name:
                    person_names.append(current_name.strip())  # Strip leading and trailing spaces before appending
                    current_name = ""
        
    #print("Extracted Person Names:", person_names)
    return person_names




In [18]:
if __name__ == '__main__':
    # Number of processes to be used (you can adjust this based on your system)
    num_processes = 7

    # Create a Pool of processes
    with Pool(num_processes) as pool:
        # Use pool.map to apply the function to each item in the list
        names = pool.map(name_extraction_from_ner, results)

    # Results will contain the processed data in the same order as the input list
    print(names)

[['Mike Baker'], [], [], ['Whitney Cummings'], [], [], [], [], [], [], [], [], [], [], [], ['Joe Bryan Callen Brendan Schaub Eddie Bravo Bryan Callen', 'Bryan Callen', 'Brendan Schaub', 'Eddie Bravo', 'Eddie Bravo'], ['Ehsan Ahmad', 'ehjah'], ['Coleman Hughes', 'Coleman Coleman', 'co'], [], [], ['Brian Redban'], [], [], [], ['Francis Foster Konstantin Kisin'], ['Joe List Mark Normand'], [], [], ['Eddie Bravo', 'Eddie Bravo'], [], [], [], [], [], ['Brian Simpson', 'Brian Simpson'], [], [], ['Joe Bryan Callen Brendan Schaub Sam Tripoli Bryan Callen', 'Bryan Callen', 'Brendan Schaub', 'Sam Tripoli', 'Sam Tripoli', 'Sam Tripoli'], [], ['Luis J . Gomez', 'Big Jay Oakerson Dave Smith', 'Zac Amico l'], ['Shane Matt Shane', 'Matt McCusker Gilly and Keeves John McKeever', 'sMark', 'Joe List', 'Sam Morril', 'ri', 'Ari Shaffir', 'aris'], ['Bill Maher', 'Maher'], [], [], [], ['Dave Smith', 'Dave Smith'], [], ['Brian Keating', 'Brian Keating'], [], ['Jeremy Phil Gerber', 'Josh Henning'], ['Mike Bak

## Find podcasts IDs by searching the spotify API using names
The name format that works in a search query such that the first result is the correct podcast is "FirstName+LastName". Yet again, multiprocess.
(Could be integrated into the ShowRetriever object, a lot of redundant stuff with the headers and IDs and secrets down here.) 
Using a horrible time.sleep to get around rate limiting.

In [6]:
def show_search_by_name(name):
    time.sleep(0.1)
    show_id = None
    if len(name) > 0:
        query = name[0]
        search_query = query.replace(' ', '%2B')
    else:
        return show_id
    client_id='e72a0925bf1a4699a4ed62a8ee3593ec'
    client_secret='6b24e502fc694859805181ec4819d9df'
    
    #show_id = '4rOoJ6Egrf8K2IrywzwOMk'
    url = f'https://api.spotify.com/v1/search?q={search_query}&type=show&market=US'
    #url = 'https://api.spotify.com/v1/search?q=JRE&type=show'
    auth_header = base64.b64encode(f'{client_id}:{client_secret}'.encode()).decode('utf-8')
    
    headers = {
        'Authorization': f'Basic {auth_header}'
    }
    
    payload = {
        'grant_type': 'client_credentials'
    }
    response = requests.post('https://accounts.spotify.com/api/token', data=payload, headers=headers)
    
    if response.status_code == 200:
        access_token = response.json()['access_token']
        
        headers = {
            'Authorization': f'Bearer {access_token}'
        }
        
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            show_data = response.json()
            #print(show_data)
            if len(show_data['shows']['items']) > 0:
                show_id = show_data['shows']['items'][0]['id']
        else:
            print(f"Error: Unable to retrieve show information. Status code {response.status_code}")
    else:
        print(f"Error: Unable to retrieve access token. Status code {response.status_code}")
        
    return show_id
    

In [23]:
if __name__ == '__main__':
    # Number of processes to be used (you can adjust this based on your system)
    num_processes = 7

    # Create a Pool of processes
    with Pool(num_processes) as pool:
        # Use pool.map to apply the function to each item in the list
        jre_guests_show_ids = pool.map(show_search_by_name, names)

    # Results will contain the processed data in the same order as the input list
    print(jre_guests_show_ids)

['6VpV9ZlbQTzMngKilhJlZs', None, None, '7AgQSpalNK5OwhQjWFmAEr', None, None, None, None, None, None, None, None, None, None, None, '5EA5JpC0p2wRazXv3q4gYS', '5b0KJx66TTplApOF0vqQCD', '2Xa3GbllxlSnTXarA2HsyK', None, None, '77iUVnajPyEWPhY6LxjpzM', None, None, None, '1JH26td5UtmKQWHLv0NCZp', '5DHMlgUAo4cc2zYb9jdpBI', None, None, '7kXXC0JwHsnYyiMl3c5Vfb', None, None, None, None, None, '4OXGjyjT1UtANcjo8TaJHh', None, None, '2lX4necvtAjBQTHTZFGVi8', None, '3Kkaz4Z9dWfnJuOpn8TnXb', '32p08HngccrVVyugc45Ljp', '0Z3M1mtmdnvMgTbO27LHkB', None, None, None, '41EUjkv3PfnDXFkVxTJpmz', None, '2G3PRMUhxGQkyQzLiiCqlf', None, '0InqQVwHuNWDeUr31thY05', '6VpV9ZlbQTzMngKilhJlZs', None, '2gRd1woKiAazAKPWPkHjds', None, None, '6M2O0tltPbenrncrUXPyxf', None, None, None, None, None, '4QbLlCXPLBEZ54CflXoe4t', None, None, '1Wz3JX3ZNE24lnbSroRmpq', None, None, '4OXGjyjT1UtANcjo8TaJHh', None, '73hjIggIaddQCcT1Vc5K36', '1PgDUTgeyu3FOzK1FcBoqa', None, None, None, None, None, '1Wz3JX3ZNE24lnbSroRmpq', None, None, '42pd

## Everything in a DF

In [24]:
df = pd.DataFrame(list(zip(SR.titles, SR.descs, SR.dates, names, jre_guests_show_ids)), columns=['Title', 'Description', 'Date', 'PodcastHosts', 'GuestShowSpotifyId'])

In [25]:
df

Unnamed: 0,Title,Description,Date,PodcastHosts,GuestShowSpotifyId
0,#2064 - Mike Baker,Mike Baker is a former CIA covert operations o...,2023-11-16,[Mike Baker],6VpV9ZlbQTzMngKilhJlZs
1,#2063 - The Rock,"Dwayne ""The Rock"" Johnson is an actor, produce...",2023-11-15,[],
2,#2062 - Will & Jenni Harris,Will Harris is the owner of White Oak Pastures...,2023-11-14,[],
3,#2061 - Whitney Cummings,"Whitney Cummings is a stand-up comic, actor, a...",2023-11-10,[Whitney Cummings],7AgQSpalNK5OwhQjWFmAEr
4,#2060 - Gary Brecka,Gary Brecka is a human biologist and co-founde...,2023-11-09,[],
...,...,...,...,...,...
2215,"#5 - John Heffron, Ari Shaffir (Part 2)","Joe sits down with John Heffron, and Ari Shaffir.",2010-01-21,[],
2216,"#5 - John Heffron, Ari Shaffir (Part 1)","Joe sits down with John Heffron, and Ari Shaffir.",2010-01-21,[],
2217,#3 - Ari Shaffir,Joe sits down with Ari Shaffir.,2010-01-06,[],
2218,#2 - Brian Redban,Joe sits down with Brian Redban.,2009-12-29,[],


In [26]:
df.to_pickle("JRE_DF.pkl")

In [None]:
df = pd.read_pickle("JRE_DF.pkl")

In [7]:
SR = ShowRetriever()
SR.authenticate()
SR.show_id = '6VpV9ZlbQTzMngKilhJlZs'
SR.retrieve_all()

<Response [404]>


In [8]:
SR.name

AttributeError: 'ShowRetriever' object has no attribute 'name'

In [None]:
def ids_to_descs(ids:list, start_podcast:str):


    for id_number in ids:
        SR = ShowRetriever()
        SR.authenticate()
        SR.show_id = id_number
        SR.retrieve_all()

        num_processes = 7
        # Create a Pool of processes
        with Pool(num_processes) as pool:
            # Use pool.map to apply the function to each item in the list
            results = pool.map(ner_on_descs, SR.descs)
        with Pool(num_processes) as pool:
            # Use pool.map to apply the function to each item in the list
            names = pool.map(name_extraction_from_ner, results)
        with Pool(num_processes) as pool:
            # Use pool.map to apply the function to each item in the list
            guests_show_ids = pool.map(show_search_by_name, names)
        
        showname_list = [start_podcast for k in range(len(guest_show_ids))]
        temp_df = pd.DataFrame(list(zip(showname_list, SR.titles, SR.descs, SR.dates, names, guests_show_ids)), columns=['Showname', 'Title', 'Description', 'Date', 'PodcastHosts', 'GuestShowSpotifyId'])
        final_df = pd.concat([final_df, temp_df], ignore_index=True)
        return final_df, 

In [None]:
import pandas as pd
from multiprocessing import Pool

processed_ids = set()  # Initialize an empty set to keep track of processed IDs
final_df = pd.DataFrame(columns=['Showname', 'Title', 'Description', 'Date', 'PodcastHosts', 'GuestShowSpotifyId'])

def process_single_id(id_number, showname):
    if id_number in processed_ids:
        return  # Skip if the ID has already been processed

    SR = ShowRetriever()
    SR.authenticate()
    SR.show_id = id_number
    SR.retrieve_all()

    num_processes = 7

    with Pool(num_processes) as pool:
        results = pool.map(ner_on_descs, SR.descs)

    with Pool(num_processes) as pool:
        names = pool.map(name_extraction_from_ner, results)

    with Pool(num_processes) as pool:
        guests_show_ids = pool.map(show_search_by_name, names)

    showname_list = [showname for _ in range(len(guests_show_ids))]
    temp_df = pd.DataFrame(list(zip(showname_list, SR.titles, SR.descs, SR.dates, names, guests_show_ids)),
                           columns=['Showname', 'Title', 'Description', 'Date', 'PodcastHosts', 'GuestShowSpotifyId'])

    global final_df
    final_df = pd.concat([final_df, temp_df], ignore_index=True)

    processed_ids.add(id_number)  # Add the current ID to the set of processed IDs

    # Recursively process more IDs if needed
    for new_id, showname in zip(guests_show_ids, names):
        process_single_id(new_id, showname)

def ids_to_descs(ids: list, start_podcast: str):
    for id_number in ids:
        process_single_id(id_number, start_podcast)

# Call the function with your list of IDs and starting podcast name
ids_to_descs([1, 2, 3], 'MyPodcast')

# Print the final DataFrame
print(final_df)
