In [1]:
# Web scraping, pickle imports
import os
import pickle
import re
import string

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url, timeout=30):
    '''Returns transcript data specifically from scrapsfromtheloft.com.

    Args:
        url: Address of the transcript page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on an unresponsive host.

    Returns:
        A list containing the text of every <p> element found on the page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses: otherwise the paragraphs of a 404/500
    # page would be returned and stored as if they were the transcript.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    text = [p.text for p in soup.find_all('p')]
    print(f"Scraped: {url}")
    return text


In [3]:
# Original URLs of transcripts in scope.
# NOTE: order matters. The scraping cell pairs the i-th URL with the i-th
# entry of `comedians`, so any URL added, removed, or moved here needs the
# matching change in that list.
urls = [
    'http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/',
    'http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/',
    'http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/',
    'http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/',
    'http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/',
    'http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/',
    'http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/',
    'http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/',
    'http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/',
    'http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/',
    'http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/',
    'http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-2016-full-transcript/'
]

# Additional URLs, appended after the original ones when the lists are combined
# (same positional pairing with `comedians` applies).
additional_urls = [
    'http://scrapsfromtheloft.com/2020/12/06/kevin-hart-zero-fks-given-2020-transcript/',
    'http://scrapsfromtheloft.com/2021/05/07/trevor-noah-afraid-of-the-dark-2017-transcript/',
    'http://scrapsfromtheloft.com/2021/02/03/amy-schumer-growing-2019-transcript/',
    'http://scrapsfromtheloft.com/2020/09/25/tiffany-haddish-black-mitzvah-2019-transcript/',
    'http://scrapsfromtheloft.com/2021/03/20/iliza-shlesinger-elder-millennial-2018-transcript/',
    'http://scrapsfromtheloft.com/2021/04/16/ronny-chieng-asian-comedian-2021-transcript/',
    'http://scrapsfromtheloft.com/2021/10/03/patton-oswalt-i-love-everything-2020-transcript/',
    'http://scrapsfromtheloft.com/2021/06/11/eddie-murphy-delirious-1983-transcript/',
    'http://scrapsfromtheloft.com/2021/07/22/margaret-cho-psycho-2015-transcript/',
    'http://scrapsfromtheloft.com/2021/08/30/jerry-seinfeld-23-hours-to-kill-2020-transcript/'
]


In [4]:
# Combine all URLs.
# The list is rebuilt with order-preserving de-duplication instead of calling
# urls.extend(...): extend mutates `urls` in place, so re-running this cell
# would append the additional URLs again and scrape each of them a second time.
urls = list(dict.fromkeys(urls + additional_urls))

# Comedian names (short keys), listed in the same order as `urls`
comedians = ['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe',
             'kevin', 'trevor', 'amy', 'tiffany', 'iliza', 'ronny', 'patton', 'eddie', 'margaret', 'jerry']

# Transcripts are matched to comedians purely by position, so a length mismatch
# means some transcript would be saved under the wrong name (or not at all).
if len(urls) != len(comedians):
    raise ValueError(
        f"Expected one URL per comedian, got {len(urls)} URLs and {len(comedians)} comedians"
    )

# Scrape transcripts
transcripts = [url_to_transcript(u) for u in urls]

# Save transcripts to pickle files (the target directory may not exist on a fresh checkout)
os.makedirs("transcripts", exist_ok=True)
for c, transcript in zip(comedians, transcripts):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcript, file)

# Load transcripts into a dictionary keyed by comedian.
# NOTE: pickle.load can execute arbitrary code; this only reads back the files
# written just above. Do not point it at files from an untrusted source.
data = {}
for c in comedians:
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

Scraped: http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/
Scraped: http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/
Scraped: http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/
Scraped: http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/
Scraped: http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/
Scraped: http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/
Scraped: http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/
Scraped: http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/
Scraped: http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/
Scraped: http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/
Scraped: http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlf

In [5]:
# Combine the texts
def combine_text(list_of_text):
    '''Join every string in ``list_of_text`` into one string, separated by single spaces.'''
    separator = ' '
    combined = separator.join(list_of_text)
    return combined

# One row per comedian, with the whole transcript collapsed into a single string
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}
data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()

# Add full names (listed in the same order as the keys of `data`)
full_names = ['Louis C.K.', 'Dave Chappelle', 'Ricky Gervais', 'Bo Burnham', 'Bill Burr', 'Jim Jefferies',
              'John Mulaney', 'Hasan Minhaj', 'Ali Wong', 'Anthony Jeselnik', 'Mike Birbiglia', 'Joe Rogan',
              'Kevin Hart', 'Trevor Noah', 'Amy Schumer', 'Tiffany Haddish', 'Iliza Shlesinger', 'Ronny Chieng',
              'Patton Oswalt', 'Eddie Murphy', 'Margaret Cho', 'Jerry Seinfeld']

if len(full_names) != len(data):
    raise ValueError(
        f"Expected one full name per transcript, got {len(full_names)} names for {len(data)} transcripts"
    )

# data_df has just been sorted alphabetically by its index, while full_names is
# still in scrape order. Assigning the list positionally would label the 'ali'
# row 'Louis C.K.' and so on, so look each name up by its row key instead.
full_name_by_key = dict(zip(data, full_names))
data_df['full_name'] = [full_name_by_key[key] for key in data_df.index]

# Save the corpus
data_df.to_pickle("expanded_corpus.pkl")


In [6]:
# Cleaning the text
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace around removed pieces is left in place,
        so the result may contain runs of several spaces.
    '''
    # Patterns are raw strings: in a normal string literal '\[', '\w' and '\d'
    # are invalid escape sequences, which newer Python versions warn about.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)  # non-greedy: drop each [bracketed] span separately
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # ASCII punctuation only
    text = re.sub(r'\w*\d\w*', '', text)  # any word that contains at least one digit
    return text

  text = re.sub('\[.*?\]', '', text)
  text = re.sub('\w*\d\w*', '', text)


In [7]:

def clean_text_round2(text):
    '''Second cleaning pass: strip typographic punctuation, normalize whitespace and drop special characters.

    Args:
        text: The string to clean.

    Returns:
        The string with curly quotes and ellipsis characters removed, every run
        of whitespace (newlines included) collapsed to a single space, and every
        character other than ASCII letters, digits, whitespace and ``. , ! ?``
        removed.
    '''
    text = re.sub('[’‘“”…]', '', text)  # Remove additional punctuation
    # Newlines are turned into a space rather than deleted: deleting them fuses
    # the last word of one line with the first word of the next. '\s' matches
    # newlines, so this single substitution covers both newline handling and
    # excessive-whitespace removal. Raw string avoids the invalid '\s' escape.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)  # Remove special characters except common punctuation
    return text


  text = re.sub('\s+', ' ', text)     # Remove excessive whitespace


In [8]:
# Run both cleaning passes over every transcript and persist the result
def round1(x):
    '''First cleaning pass, applied to a single transcript string.'''
    return clean_text_round1(x)


def round2(x):
    '''Second cleaning pass, applied to a single transcript string.'''
    return clean_text_round2(x)


# Only the 'transcript' column is carried over into the cleaned frame
data_clean = (
    data_df["transcript"]
    .apply(round1)
    .apply(round2)
    .to_frame()
)
data_clean.to_pickle('expanded_data_clean.pkl')

In [9]:
# Document-Term Matrix
def create_dtm(data_clean):
    '''Build a document-term matrix from the ``transcript`` column of ``data_clean``.

    The vectorizer drops English stop words, counts unigrams and bigrams, and
    keeps only terms that occur in at least 2 documents and in at most 80% of
    the documents.

    Returns:
        A ``(dtm, vectorizer)`` tuple: the counts as a DataFrame that shares the
        index of ``data_clean`` and has one column per term, and the fitted
        CountVectorizer.
    '''
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2, max_df=0.8)
    counts = vectorizer.fit_transform(data_clean.transcript)
    dtm = pd.DataFrame(
        counts.toarray(),
        index=data_clean.index,
        columns=vectorizer.get_feature_names_out(),
    )
    return dtm, vectorizer

# Build the document-term matrix and persist both it and the fitted vectorizer
data_dtm, cv = create_dtm(data_clean)
data_dtm.to_pickle("expanded_dtm.pkl")
# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open("cv.pkl", "wb") as file:
    pickle.dump(cv, file)

print("Data cleaning and Document-Term Matrix creation complete.")


Data cleaning and Document-Term Matrix creation complete.


In [10]:
# Display the document-term matrix: rows share the index of data_clean
# (one per comedian key), columns are the terms kept by the vectorizer.
data_dtm

Unnamed: 0,aah,able,absolutely,absurdities,absurdities sharp,abuse,accent,acceptable,acceptable answer,access,...,youtube,youve,youve got,youve gotta,youve seen,youve south,yummy,zero,zombie,zombies
ali,0,2,0,1,1,1,0,0,0,0,...,0,2,0,0,0,0,0,0,1,0
amy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
anthony,0,0,0,1,1,0,1,0,0,0,...,0,6,1,2,2,0,0,0,0,0
bill,0,1,3,0,0,0,0,1,0,0,...,1,1,0,0,0,0,1,1,1,1
bo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dave,0,0,0,1,1,0,0,0,0,0,...,0,5,2,0,1,1,0,0,0,0
eddie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hasan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
iliza,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
jerry,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
