### Import Modules

In [1]:
import re
import pickle
import pandas as pd
import string

### Setting up Dataframe

In [2]:
# Load the list of transcript headings that was saved earlier as a pickle file.
# NOTE(review): pickle.load can execute arbitrary code - only load a file from a trusted source.
with open('comedian.pickle', mode='rb') as pickle_file:
    heading = pickle.load(pickle_file)

In [4]:
# preview the first five headings loaded from the pickle file
heading[:5]

['BILLY CONNOLLY: HIGH HORSE TOUR LIVE (2016) – FULL TRANSCRIPT',
 'HANNAH GADSBY: DOUGLAS (2020) – FULL TRANSCRIPT',
 'PATTON OSWALT: I LOVE EVERYTHING (2020) – FULL TRANSCRIPT',
 'RUSSELL PETERS: DEPORTED (2020) – FULL TRANSCRIPT',
 'JIMMY O. YANG: GOOD DEAL (2020) – FULL TRANSCRIPT']

In [5]:
# Build a one-column dataframe ('Heading') from the list of headings
comedian_df = pd.DataFrame(data=heading, columns=['Heading'])

In [6]:
# preview the first rows of the new dataframe
comedian_df.head()

Unnamed: 0,Heading
0,BILLY CONNOLLY: HIGH HORSE TOUR LIVE (2016) – ...
1,HANNAH GADSBY: DOUGLAS (2020) – FULL TRANSCRIPT
2,PATTON OSWALT: I LOVE EVERYTHING (2020) – FULL...
3,RUSSELL PETERS: DEPORTED (2020) – FULL TRANSCRIPT
4,JIMMY O. YANG: GOOD DEAL (2020) – FULL TRANSCRIPT


### Splitting Names

In most cases the heading follows the format "FIRSTNAME LASTNAME: TITLE (YEAR) – FULL TRANSCRIPT", so splitting on the colon is enough to get most comedian names.

In [7]:
# Some headings don't have a colon, so the name is also cut at the markers below.
# Order matters: the markers are applied one after another, and the text is
# title-cased after each cut, so every marker after ':' is written in title case.
split_words = [
    ':',
    ' On ',
    ' At ',
    ' About ',
    ' Live',
    'Stand-Up',
    '–',
    '…',
    'Award-Winning',
    'Monologue',
    ' & ',
    'Matters',
    'E Il',
    'Tries',
]

In [8]:
# Derive a comedian name from every heading by cutting the text at each split word in turn.
def _name_from_heading(text, separators):
    """Return `text` truncated at each separator in order, title-cased after every cut."""
    for separator in separators:
        # keep only what precedes the separator (the whole text if it is absent)
        leading_part, _, _ = text.partition(separator)
        # title-casing here is what lets the later, title-cased separators match
        text = leading_part.title().rstrip()
    return text


nm = [_name_from_heading(heading_text, split_words) for heading_text in comedian_df['Heading']]

comedian_df['Name'] = nm

In [9]:
# preview the dataframe with the new Name column
comedian_df.head()

Unnamed: 0,Heading,Name
0,BILLY CONNOLLY: HIGH HORSE TOUR LIVE (2016) – ...,Billy Connolly
1,HANNAH GADSBY: DOUGLAS (2020) – FULL TRANSCRIPT,Hannah Gadsby
2,PATTON OSWALT: I LOVE EVERYTHING (2020) – FULL...,Patton Oswalt
3,RUSSELL PETERS: DEPORTED (2020) – FULL TRANSCRIPT,Russell Peters
4,JIMMY O. YANG: GOOD DEAL (2020) – FULL TRANSCRIPT,Jimmy O. Yang


In [10]:
# Spot-check rows 49 and 50 of the dataframe.
# Note - the Name in row 50 needs work: the split left the heading text in place of a comedian name.

comedian_df.iloc[49:51]

Unnamed: 0,Heading,Name
49,DAVE CHAPPELLE: STICKS AND STONES | EPILOGUE: ...,Dave Chappelle
50,"Brazil, Corruption and the Amazon Rainforest |...","Brazil, Corruption And The Amazon Rainforest |..."


#### Fixing Names

In [11]:
# Most names look correct; these are the 0-based row positions of the few
# that need a manual correction (paired by position with the corrected names).

name_index = [
    11, 50, 57, 58, 63, 83,
    86, 88, 100, 101, 103, 112,
    148, 224, 231, 235, 237, 338,
]

In [12]:
# Corrected comedian names, one per row position listed in name_index (same order)
name_fix = [
    'Louis C.K.',
    'Hasan Minhaj',
    'Katherine Ryan',
    'Katherine Ryan',
    'Gina Yashere',
    'George Carlin',
    'Gabriel Iglesias',
    'John Leguizamo',
    'George Carlin',
    'Tom Segura',
    'Joe Mande',
    'Dave Chappelle',
    'Patton Oswalt',
    'Richard Pryor',
    'Dave Attell',
    'Daniel Tosh',
    'George Carlin',
    'Pauline Kael',
]


#### Looping through corrected names

In [13]:
# Apply the manual corrections: name_index holds 0-based row positions and
# name_fix holds the corrected name for the row at the same list position.
if len(name_index) != len(name_fix):
    # zip() would silently drop the unmatched tail, hiding a bookkeeping mistake
    raise ValueError('name_index and name_fix must have the same length')

# Assign through one .iloc call on the dataframe itself. Chained assignment
# (comedian_df.Name.iloc[i] = ...) writes to an intermediate Series, which raises
# SettingWithCopyWarning and does not update the dataframe under Copy-on-Write.
name_col = comedian_df.columns.get_loc('Name')
comedian_df.iloc[name_index, name_col] = name_fix

In [14]:
# a look at the revised dataframe: row 50 now shows the corrected name
comedian_df.iloc[49:51]

Unnamed: 0,Heading,Name
49,DAVE CHAPPELLE: STICKS AND STONES | EPILOGUE: ...,Dave Chappelle
50,"Brazil, Corruption and the Amazon Rainforest |...",Hasan Minhaj


### Splitting Year

The heading usually includes the year of each stand-up set. We follow a similar process to split it out.

In [16]:
# Take the first standalone 4-digit number in each heading as the year, then add Year to the dataframe.
# Headings with no such number (or that are not text) get the placeholder 'missing year'.
year_pattern = re.compile(r'\b\d{4}\b')

yr = []
for name in comedian_df['Heading']:
    # Test for "no match" explicitly instead of catching every exception,
    # so unrelated errors and keyboard interrupts are no longer swallowed.
    match = year_pattern.search(name) if isinstance(name, str) else None
    yr.append(match.group(0) if match else 'missing year')
comedian_df['Year'] = yr

In [17]:
# Headings without a 4-digit number were labelled 'missing year'.
# This is a boolean mask (one True/False per row) flagging those rows.
missing_yr = comedian_df['Year'].eq('missing year')

In [24]:
# viewing rows 49-51 of the dataframe again, now with the Year column
comedian_df.iloc[49:52]

Unnamed: 0,Heading,Name,Year
49,DAVE CHAPPELLE: STICKS AND STONES | EPILOGUE: ...,Dave Chappelle,missing year
50,"Brazil, Corruption and the Amazon Rainforest |...",Hasan Minhaj,missing year
51,DAVE CHAPPELLE: STICKS & STONES (2019) – FULL ...,Dave Chappelle,2019


In [21]:
# preview the first rows with the Name and Year columns
comedian_df.head()

Unnamed: 0,Heading,Name,Year
0,BILLY CONNOLLY: HIGH HORSE TOUR LIVE (2016) – ...,Billy Connolly,2016
1,HANNAH GADSBY: DOUGLAS (2020) – FULL TRANSCRIPT,Hannah Gadsby,2020
2,PATTON OSWALT: I LOVE EVERYTHING (2020) – FULL...,Patton Oswalt,2020
3,RUSSELL PETERS: DEPORTED (2020) – FULL TRANSCRIPT,Russell Peters,2020
4,JIMMY O. YANG: GOOD DEAL (2020) – FULL TRANSCRIPT,Jimmy O. Yang,2020


#### Fixing Years

In [22]:
# Collect the 0-based row positions where the missing-year mask is True
missing_yr_index = [
    position
    for position, is_missing in enumerate(missing_yr)
    if is_missing
]

In [23]:
# show the row positions that still need a year
missing_yr_index

[16,
 19,
 22,
 49,
 50,
 54,
 65,
 83,
 101,
 107,
 108,
 142,
 223,
 236,
 238,
 311,
 328,
 338]

In [25]:
# Years looked up by hand, one per row position in missing_yr_index (same order)

yr_list = [
    '1993',
    '1965',
    '2015',
    '2019',
    '2019',
    '2019',
    '2004',
    '1997',
    '2011',
    '2012',
    '2014',
    '2007',
    '1996',
    '1999',
    '2014',
    '1972',
    '2014',
    '1980',
]

#### Looping through missing years

In [26]:
# Applying the missing years: missing_yr_index holds 0-based row positions and
# yr_list holds the looked-up year for the row at the same list position.
if len(missing_yr_index) != len(yr_list):
    # zip() would silently drop the unmatched tail, hiding a bookkeeping mistake
    raise ValueError('missing_yr_index and yr_list must have the same length')

# Assign through one .iloc call on the dataframe itself. Chained assignment
# (comedian_df.Year.iloc[i] = ...) writes to an intermediate Series, which raises
# SettingWithCopyWarning and does not update the dataframe under Copy-on-Write.
year_col = comedian_df.columns.get_loc('Year')
comedian_df.iloc[missing_yr_index, year_col] = yr_list

In [27]:
# checking the df - rows 49 and 50 now have a year instead of 'missing year'
comedian_df.iloc[49:51]

Unnamed: 0,Heading,Name,Year
49,DAVE CHAPPELLE: STICKS AND STONES | EPILOGUE: ...,Dave Chappelle,2019
50,"Brazil, Corruption and the Amazon Rainforest |...",Hasan Minhaj,2019


In [29]:
# The 20 most frequent years, most of which are recent
comedian_df['Year'].value_counts().head(20)

2018    43
2017    40
2019    40
2016    27
2015    22
2014    16
2013    15
2012    14
2020    14
2010    13
2011    11
2007     9
1999     8
2006     8
2004     6
2008     6
2009     5
1996     4
1992     4
1998     3
Name: Year, dtype: int64

### Splitting Title

We can also split the title of each stand-up set from the heading

In [30]:
# Start by cutting the heading at an en dash, an opening parenthesis or an opening bracket
split_titles = [
    '–',   # en dash, e.g. before "FULL TRANSCRIPT"
    ' (',  # opening parenthesis, e.g. before the year
    ' [',  # opening square bracket
]

In [31]:
# Note this time we apply 'capwords' rather than 'title' as capwords works better with apostrophes in titles.
# First the heading is cut at each entry of split_titles in turn; then, if the remaining
# text contains a colon, the piece between the first and second colon is kept as the title
# (anything after a second colon is dropped). Headings without a colon are kept whole.

ti = []
for ttl in comedian_df['Heading']:
    for word in split_titles:
        ttl = string.capwords(ttl.split(word)[0]).rstrip()

    # Check for the colon explicitly rather than relying on a bare except,
    # which would also hide unrelated errors.
    pieces = ttl.split(":")
    if len(pieces) > 1:
        ttl = pieces[1].lstrip()
    ti.append(ttl)

comedian_df['Title'] = ti


In [32]:
# viewing rows 49 and 50 of the dataframe; note the trailing dots where the long title in row 50 is cut off
comedian_df.iloc[49:51]

Unnamed: 0,Heading,Name,Year,Title
49,DAVE CHAPPELLE: STICKS AND STONES | EPILOGUE: ...,Dave Chappelle,2019,Sticks And Stones | Epilogue
50,"Brazil, Corruption and the Amazon Rainforest |...",Hasan Minhaj,2019,"Brazil, Corruption And The Amazon Rainforest |..."
