In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re 
import pickle
import string
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading of data

In [2]:
# Web Scraping using beautiful soup i.e. extracting transcripts of different comedians 
def url_to_transcript(url):
    pageData = requests.get(url).text 
    soup = BeautifulSoup(pageData, "lxml") 
    text = [p.text for p in soup.find(class_="elementor-widget-theme-post-content").find_all('p')]
    return text

urls = ['http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-2016-full-transcript/']

comedians = ['Dave Chappelle','Bill Burr', 'Jim Jefferies', 'John Mulaney','Anthony Jeselnik', 'Mike Birbiglia','Joe Rogan']

In [3]:
# Making a list to store the transcripts of different comedians
ls = []
for url in urls:
  ls.append(url_to_transcript(url))

In [4]:
# Making a final list to store the transcript of different comedians and making a dictionary of it.
transcripts = []
for i in range(len(comedians)):
  s=""
  for j in range(len(ls[i])):
    s=s+ls[i][j]
  transcripts.append(s)

data = {'Comedians':comedians,'transcript':transcripts}
data_df = pd.DataFrame(data)  
data_df

Unnamed: 0,Comedians,transcript
0,Dave Chappelle,This is Dave. He tells dirty jokes for a livin...
1,Bill Burr,"[cheers and applause] All right, thank you! Th..."
2,Jim Jefferies,[Car horn honks] [Audience cheering] [Announce...
3,John Mulaney,"Armed with boyish charm and a sharp wit, the f..."
4,Anthony Jeselnik,"Thank you. Thank you. Thank you, San Francisco..."
5,Mike Birbiglia,"Wow. Hey, thank you. Thanks. Thank you, guys. ..."
6,Joe Rogan,[rock music playing][audience cheering][announ...


# Data Cleaning


In [5]:
# General Cleaning of transcripts
def cleaning(text):
    text = text.lower()   #Changing text to lower case
    text = re.sub('[.*?]', '', text) #Removing general punctuation from text
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) #Removing all punctuation from text
    text = re.sub('\w*\d\w*', '', text)  #Removing digits from text
    text = re.sub('[‘’“”…]', '', text) #Removing qotation marks and ellipsis from text
    text = re.sub('\n', '', text)  #Removing new line (enter) from text
    return text

clean = lambda x: cleaning(x) 

Cleaneddata = {'Comedians':comedians,'transcript':data_df.transcript.apply(clean)}
cleaned_dataframe = pd.DataFrame(Cleaneddata)
cleaned_dataframe

Unnamed: 0,Comedians,transcript
0,Dave Chappelle,this is dave he tells dirty jokes for a living...
1,Bill Burr,cheers and applause all right thank you thank ...
2,Jim Jefferies,car horn honks audience cheering announcer lad...
3,John Mulaney,armed with boyish charm and a sharp wit the fo...
4,Anthony Jeselnik,thank you thank you thank you san francisco th...
5,Mike Birbiglia,wow hey thank you thanks thank you guys hey se...
6,Joe Rogan,rock music playingaudience cheeringannouncerla...


In [6]:
# Lemmatization i.e. converting each word to its root word. For eg: converting children to child
lemmatizer = WordNetLemmatizer()
LemmatizedTranscripts = []

for i in range(len(cleaned_dataframe)):
  sen = cleaned_dataframe.iloc[i,1].split(" ")
  newsen=""
  for word in sen:
    newsen+=lemmatizer.lemmatize(word)+" "
  
  LemmatizedTranscripts.append(newsen)

Lemmatizeddata = {'Comedians':comedians,'Transcript':LemmatizedTranscripts}
final_dataframe = pd.DataFrame(Lemmatizeddata)
final_dataframe

Unnamed: 0,Comedians,Transcript
0,Dave Chappelle,this is dave he tell dirty joke for a living t...
1,Bill Burr,cheer and applause all right thank you thank y...
2,Jim Jefferies,car horn honk audience cheering announcer lady...
3,John Mulaney,armed with boyish charm and a sharp wit the fo...
4,Anthony Jeselnik,thank you thank you thank you san francisco th...
5,Mike Birbiglia,wow hey thank you thanks thank you guy hey sea...
6,Joe Rogan,rock music playingaudience cheeringannouncerla...


In [7]:
# Saving my cleaned data as a pickle file.
final_dataframe.to_pickle("comedians_cleaned_data.pkl")

 # Document - term matrix

In [8]:
# Creating document-term matrix i.e. a matrix conating frequency of each word said by a comedian.
# Here I ignored the words which were said by less than 3 comedians
cv = CountVectorizer(stop_words='english',min_df=3)
data_cv = cv.fit_transform(final_dataframe.Transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = final_dataframe.index
data_dtm


Unnamed: 0,able,accent,acceptable,accidentally,act,actual,actually,admit,adorable,adult,...,yard,yeah,year,yes,york,youd,youll,young,youre,youve
0,0,0,0,0,4,1,5,1,1,0,...,1,17,11,7,1,7,3,10,15,5
1,1,0,1,2,4,0,11,0,0,6,...,1,67,20,3,1,1,5,0,59,1
2,1,15,0,0,5,1,2,0,1,1,...,0,14,9,2,0,2,2,0,48,11
3,3,1,1,0,1,0,7,0,0,2,...,0,10,13,11,7,2,1,2,27,3
4,0,1,0,1,1,0,7,2,0,0,...,0,15,12,2,0,1,0,0,18,6
5,0,0,1,0,0,1,6,0,1,1,...,0,25,10,2,5,3,0,1,28,3
6,2,3,0,1,1,0,3,2,1,0,...,1,12,9,0,0,2,0,4,42,6


In [9]:
# Saving document-term matrix as a pickle file
data_dtm.to_pickle("dtm.pkl")