<a href="https://colab.research.google.com/github/BrendaLoznik/BigBangTheory/blob/main/Bang_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 Housekeeping

In [282]:
#basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 200)

### 1.2 Load data

In [283]:
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [284]:
raw = pd.read_csv('/content/gdrive/MyDrive/Kaggle/Bigbang/raw_episodes2.csv', sep = '=')   #Note the custom delimiter here!
raw.head()

Unnamed: 0,line,episode,character_scene
0,A corridor at a sperm bank.,Series 01 Episode 01 – Pilot Episode | Big Ban...,Scene
1,So if a photon is directed through a plane wi...,Series 01 Episode 01 – Pilot Episode | Big Ban...,Sheldon
2,"Agreed, what’s your point?",Series 01 Episode 01 – Pilot Episode | Big Ban...,Leonard
3,"There’s no point, I just think it’s a good id...",Series 01 Episode 01 – Pilot Episode | Big Ban...,Sheldon
4,Excuse me?,Series 01 Episode 01 – Pilot Episode | Big Ban...,Leonard


### 1.3 Custom functions

In [285]:
#function to use regex to clean text
def string_splitter(row, x): 
  s = row['episode']
  try: 
    return s.split()[x]
  except ValueError:
    return "" 

#raw['season'] = raw.apply(string_splitter, axis=1, x=1).astype('int')

In [286]:
#function to use regex to clean text
def find_between(row, first, last):
  s = row['episode']
  try: 
    start = s.index( first ) + len( first )
    end = s.index( last, start )
    return s[start:end]
  except ValueError:
    return ""  

# 2. Cleaning

#### Season

In [287]:
raw['season'] = raw['episode'].str.split(" ").str[1].astype('int')
raw.head()

Unnamed: 0,line,episode,character_scene,season
0,A corridor at a sperm bank.,Series 01 Episode 01 – Pilot Episode | Big Ban...,Scene,1
1,So if a photon is directed through a plane wi...,Series 01 Episode 01 – Pilot Episode | Big Ban...,Sheldon,1
2,"Agreed, what’s your point?",Series 01 Episode 01 – Pilot Episode | Big Ban...,Leonard,1
3,"There’s no point, I just think it’s a good id...",Series 01 Episode 01 – Pilot Episode | Big Ban...,Sheldon,1
4,Excuse me?,Series 01 Episode 01 – Pilot Episode | Big Ban...,Leonard,1


#### Episode

In [288]:
raw['season_episode'] = raw['episode'].str.split(" ").str[3].astype('int')
raw.head()

Unnamed: 0,line,episode,character_scene,season,season_episode
0,A corridor at a sperm bank.,Series 01 Episode 01 – Pilot Episode | Big Ban...,Scene,1,1
1,So if a photon is directed through a plane wi...,Series 01 Episode 01 – Pilot Episode | Big Ban...,Sheldon,1,1
2,"Agreed, what’s your point?",Series 01 Episode 01 – Pilot Episode | Big Ban...,Leonard,1,1
3,"There’s no point, I just think it’s a good id...",Series 01 Episode 01 – Pilot Episode | Big Ban...,Sheldon,1,1
4,Excuse me?,Series 01 Episode 01 – Pilot Episode | Big Ban...,Leonard,1,1


### Episode name

In [289]:
raw['episode_name'] = raw.apply(find_between, axis=1, first = " – ", last = '|')

raw = raw.drop('episode', axis=1)
raw.head()

Unnamed: 0,line,character_scene,season,season_episode,episode_name
0,A corridor at a sperm bank.,Scene,1,1,Pilot Episode
1,So if a photon is directed through a plane wi...,Sheldon,1,1,Pilot Episode
2,"Agreed, what’s your point?",Leonard,1,1,Pilot Episode
3,"There’s no point, I just think it’s a good id...",Sheldon,1,1,Pilot Episode
4,Excuse me?,Leonard,1,1,Pilot Episode


### ID

In [290]:
raw['episode_id'] = (raw['season']).astype('str') +"-"+ (raw['season_episode']).astype('str')
raw.head()

Unnamed: 0,line,character_scene,season,season_episode,episode_name,episode_id
0,A corridor at a sperm bank.,Scene,1,1,Pilot Episode,1-1
1,So if a photon is directed through a plane wi...,Sheldon,1,1,Pilot Episode,1-1
2,"Agreed, what’s your point?",Leonard,1,1,Pilot Episode,1-1
3,"There’s no point, I just think it’s a good id...",Sheldon,1,1,Pilot Episode,1-1
4,Excuse me?,Leonard,1,1,Pilot Episode,1-1


In [291]:
#i noticed that epiwode 10-3 doesnt have a name because of a typo
raw.loc[raw['episode_id']=='10-3', "episode_name"] = 'The Dependence Transcendence'

### Sort df

In [292]:
raw = raw.sort_values(['season', 'season_episode'])

### Drop empty records

In [293]:
#drop empty records
raw=raw.dropna()


### Clean up names

I plan on focussing on the main characters of the show and focus my cleaning efforts on them.

In [294]:
#some character names include unneeded strings like: 'Raj (internally)' or 'Leonard (to Sheldon)' . I want to remove the text between brackets
raw['character_scene'] = raw['character_scene'].str.replace(r"\(.*\)","")


In [295]:
#replace weird characters with empty string
raw['character_scene'] = raw['character_scene'].str.replace(r"\<i></i>","")
raw['character_scene'] = raw['character_scene'].str.replace(r"<i>","")
raw['character_scene'] = raw['character_scene'].str.replace(r"</i>","")
raw['character_scene'] = raw['character_scene'].str.replace(r'<span style="font-style',"")

In [296]:
#remove weird text strings
faulty = ['<span style="text-decoration', '<a href="https',  '<span style="font-style' , '<span style="font-size', '<span style="font-family']  # 2-3 lines per episode
raw[raw['character_scene'].isin(faulty)]
raw = raw[~raw['character_scene'].isin(faulty)]

In [297]:
#remove trailing whitespace
raw['character_scene'] = raw['character_scene'].str.rstrip()
raw['character_scene'] = raw['character_scene'].str.lstrip()

In [298]:
#replace odd chracters
raw['character_scene'] = raw['character_scene'].str.replace(r'&amp;',"and")

In [299]:
#clean up sheldon
caracters = raw['character_scene'].unique()
sheldon_long_list = [s for s in caracters if "She" in s]

 #this list also contains some strings where multiple characters speak... I'll keep those
sheldon_short_list = ['Sheldon','On-screen Sheldon', 'Past Sheldon', 'Sheldon-bot', 'Mechanical voice on Sheldon’s phone', 'Sheldon’s voice',
                      'Shelldon', '1.  Sheldon', '3.  Sheldon', '4. Sheldon', '5.  Sheldon', 'Sheldon’s phone', 'She', 'Sehldon', 'Shldon', 'Shedon', 'Sgeldon', 'Sheldon on laptop screen'] #some were added manually

#replace values by Sheldon
raw.loc[raw['character_scene'].isin(sheldon_short_list), "character_scene"] = 'Sheldon'

In [300]:
#clean up Leonard
caracters = raw['character_scene'].unique()
Leonard_long_list = [s for s in caracters if "How" in s]

#create short list of relevant items
Leonard_short_list = ['Howard','Howard’s phone', 'Past Howard','Howard (on <span style="text-decoration','Howatd']

#replace values by Howard
raw.loc[raw['character_scene'].isin(Leonard_short_list), "character_scene"] = 'Howard'

In [301]:
#clean up Penny
caracters = raw['character_scene'].unique()
Penny_long_list = [s for s in caracters if "Pen" in s]

#create short_list
Penny_short_list = ['Penny','Penny- warrior', 'Penny-warrior', 'Past Penny', '<span lang="en-GB">Penny']

#replace values by Penny
raw.loc[raw['character_scene'].isin(Penny_short_list), "character_scene"] = 'Penny'

In [302]:
#clean up Bernadette
caracters = raw['character_scene'].unique()
Bernadette_long_list = [s for s in caracters if "Ber" in s]

#create short_list
Bernadette_short_list = ['Bermadette']

#replace values by Bernadette
raw.loc[raw['character_scene'].isin(Bernadette_short_list), "character_scene"] = 'Bernadette'

In [303]:
#clean up Leonard
caracters = raw['character_scene'].unique()
Leonard_long_list = [s for s in caracters if "Leo" in s]

#create short list
Leonard_short_list = ['Leonard', 'Leonard-warrior', 'Leonard-warrior’s head', 'Past Leonard', 'Leoanard', 'Fat Leonard']

#replace values by Leonard
raw.loc[raw['character_scene'].isin(Leonard_short_list), "character_scene"] = 'Leonard'

In [304]:
#clean up Amy
caracters = raw['character_scene'].unique()
Amy_long_list = [s for s in caracters if "Amy" in s]

#create short list
Amy_short_list = ['Amy','1. Amy','2.  Amy','3.  Amy', '4. Amy']

#replace values by Amy
raw.loc[raw['character_scene'].isin(Amy_short_list), "character_scene"] = 'Amy'

In [305]:
#clean up Raj
caracters = raw['character_scene'].unique()
Raj_long_list = [s for s in caracters if "Ra" in s]

#create short list
Raj_short_list = ['Raj','Past Raj','Raj’s Voice', 'Rajj','Fat Raj', 'Ra', 'Rai']

#replace values by Raj
raw.loc[raw['character_scene'].isin(Raj_short_list), "character_scene"] = 'Raj'

In [306]:
#clean up Stuart
caracters = raw['character_scene'].unique()
Stuart_long_list = [s for s in caracters if "Stu" in s]

#create short list
Stuart_short_list = ['Fat Stuart']

#replace values by Stuart
raw.loc[raw['character_scene'].isin(Stuart_short_list), "character_scene"] = 'Stuart'

In [307]:
#rare characters
sheldon_mon = ['Mrs Cooper', 'Mary Cooper']
howard_mom = ['Mrs Wolowitz', 'Howard’s Mother', 'Howard’s mother', 'Dead Mrs Wolowitz']
kripke  = [ 'Kripke', 'Barry Kripke', 'Barry' ]
will_wheaton = ['Wil Wheaton', 'Wil']
leonard_mom = ['Beverley', 'Beverly', 'Dr Hofstadter', 'Mrs Hofstadter',   'Leonard’s Mother']

#replace values by
raw.loc[raw['character_scene'].isin(sheldon_mon), "character_scene"] = 'Mary Cooper'
raw.loc[raw['character_scene'].isin(howard_mom), "character_scene"] = 'Mrs Wolowitz'
raw.loc[raw['character_scene'].isin(kripke ), "character_scene"] = 'Barry Kripke'
raw.loc[raw['character_scene'].isin(will_wheaton ), "character_scene"] = 'Wil Wheaton'
raw.loc[raw['character_scene'].isin(leonard_mom), "character_scene"] = 'Mrs Hofstadter'

In [308]:
#there are a few typo's in Scene... these need to be renamed
raw.loc[raw['character_scene']=='Sceme', "character_scene"] = 'Scene'

In [309]:
#the 8 main characters together have 90% of all paragraphs in the show
characters = raw[raw['character_scene']!= 'Scene']
characters = characters['character_scene'].value_counts(normalize=True)
characters[0:20]

Sheldon           0.227922
Leonard           0.190107
Penny             0.149062
Howard            0.113932
Raj               0.091036
Amy               0.067441
Bernadette        0.051957
Stuart            0.014239
Mrs Hofstadter    0.005213
Priya             0.004318
Mary Cooper       0.004124
Barry Kripke      0.003618
Emily             0.003190
Mrs Wolowitz      0.003035
Wil Wheaton       0.002957
Zack              0.002607
Arthur            0.002529
Leslie            0.002198
Man               0.002042
Bert              0.001848
Name: character_scene, dtype: float64

### Clean up locations

In [310]:
#create a copy of the line column
raw['location'] = raw['line']



In [311]:
#remove trailing whitespace
raw['line'] = raw['line'].str.rstrip()
raw['line'] = raw['line'].str.lstrip()

raw['location'] = raw['location'].str.rstrip()
raw['location'] = raw['location'].str.lstrip()

In [312]:
locations = raw[raw['character_scene']=='Scene']['location'].unique()

#scenes taped in and around the staircase
stairs_long_list = [s for s in locations if "stair" in s]
stairs_long_list.remove('The upstairs landing of Mrs Wolowitz’s house.')
stairs_long_list.append(['Ground floor hallway of the apartment building. Leonard is signing for the delivery.',  'Lobby of the apartment building, Howard, Raj, Sheldon and Leonard enter in combat gear, covered in blue paint.', 'Entering the apartment building.'
'The lobby.'])


raw.loc[raw['location'].isin(stairs_long_list), "location"] = 'The stairwell'

In [313]:
#clean up  'The appartment'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
main_appartment_long_list = [s for s in locations if "apar" in s]

penny_appartment = ['Outside Penny’s old apartment building.', 'Inside Penny’s apartment. They are laying the package down on the floor.', 'Penny’s apartment, penny is sleeping, Sheldon is cleaning. Leonard enters.',
                  'Penny’s apartment. Sheldon and Leonard are trying to construct furniture.', 'Penny’s apartment.',  'Outside Penny’s apartment.',
                 'Penny’s apartment. She is on her laptop and talking into a headset.', 'Penny’s apartment. She is in her bathrobe. A note slides under the door.',
                 'Penny’s apartment, she opens the door, Sheldon is stood outside with his Star Wars sheets.', 'Penny’s apartment. Leonard enters. Penny is taping together a number of long sticks.',
                 'Outside Penny’s apartment. Sheldon has his laptop. He knocks three times, the presses a button and an electronic voice says “Penny”. He does this three times.',
                  'Penny’s apartment. Penny and Leonard are doing tequila shots.', 'Penny’s apartment. Penny is on the phone.',
                 'Penny’s apartment. Penny is making a barette. Sheldon is timing her.',  'Penny’s apartment, Penny and Leonard are kissing. They break and look at each other uncomfortably.',
                 'Penny’s apartment. Everyone is watching football.',  'Penny’s apartment door.', 'Penny’s apartment. Penny is gathering laundry.', 'Penny’s apartment. Knocking.',
                 'Penny’s apartment. She answers the door.',  'Penny’s apartment. There is a knock.', 'Penny’s apartment door. Sheldon knocks three times.', 'Penny’s apartment door. Leonard knocks.',
                 'Penny’s apartment. Amy is applying very red lip gloss.',  'Penny’s apartment. Penny is texting.', 'Penny’s apartment.</i>','Penny’s apartment. Penny answers the door. Leonard is playing his cello outside.',  'Penny’s apartment. </i>', 
                 'Penny’s apartment. Knock on door.</i>', 'Penny’s apartment. (Montage with scenes in the apartment and Leonard’s bedroom)', 'Penny’s apartment. Penny is cutting Sheldon’s hair.',]

other = [ 'The girl’s apartment.',  'Alicia’s apartment.', 'Bernadette’s apartment. Howard knocks on door.',  'The street outside the apartment block.', 
          'Alice’s apartment. Alice and Leonard are kissing.',    'Bernadette’s apartment.',
          'Emily’s apartment.</i>',  'Emily’s apartment. </i>',
 'Bert’s apartment.',  'Sheldon and Amy’s apartment.',] 

howard_bernadette_appartment = ['Howard and Bernadette’s apartment.',  'Howard and Bernadette’s apartment. </i>',  'Howard and Bernadette’s apartment.</i>',
                                'Howard and Bernadette’s apartment. Bernadette is on the webcam.</i>',  'Howard and Bernadette’s apartment. Howard is on the phone to Raj.</i>',
                                'Howard and Bernadette’s house.</i>',  'Howard and Bernadette’s kitchen.'] #added manually

raj_appartment = ['Raj’s apartment.', 'Raj’s apartment. Raj is heard singing in a drunk voice through the door. He enters with his arm around Penny.',
                   'Raj’s apartment. He wakes up clutched in the arms of the large lady. Tries to get away. She clutches him tighter. He shrugs and goes back to sleep.',
                  'Raj’s apartment. Raj is watching a Bollywood movie. There is a knock on the door.', 'Raj’s apartment. Raj is reading New Moon.',  'Raj’s apartment. They are playing Jenga,',
                   'Raj’s apartment. Leonard and Priya are kissing.', 'Raj’s apartment.</i>', 'Outside Raj’s apartment.', 'Raj’s apartment. Skype tone plays.</i>',
                  'Outside Raj’s apartment. The girl from the comic book store knocks on the door.',  'Raj’s apartment. </i>',  'Raj’s apartment. Lucy is looking through Raj’s telescope.</i>']

amy_appartment = [ 'Outside Amy’s apartment.', 'Amy’s apartment. Amy is brushing Penny’s hair.', 'Amy’s apartment. Amy is playing a harp.',  'Amy’s apartment.',
                  'Amy’s apartment. Amy is playing the harp.', 'Amy’s apartment. Amy is playing her harp.',  'Amy’s apartment.</i>',  'Amy’s apartment. </i>',
                  'Amy’s apartment. Amy is dancing to “Walking on Sunshine”. Hears door and turns off stereo.</i>',  'Outside Amy’s apartment. </i>',
                  'Amy’s apartment. Sheldon knocks three times. </i>','Amy’s apartment. Amy is watching Fun With Flags.</i>',  'Amy”s apartment.',]


#create the main apartment list by only including items that are not in the other lists
main_appartment_short = [x for x in main_appartment_long_list if x not in penny_appartment ]
main_appartment_short = [x for x in main_appartment_short if x not in other ]
main_appartment_short  = [x for x in main_appartment_short if x not in raj_appartment ]
main_appartment_short =  [x for x in main_appartment_short if x not in howard_bernadette_appartment  ]


#clean-up
raw.loc[raw['location'].isin(main_appartment_short), "location"] = 'Main appartment'
raw.loc[raw['location'].isin(penny_appartment), "location"] = "Penny’s apartment"
raw.loc[raw['location'].isin(howard_bernadette_appartment), "location"] = "Howard and Bernadette’s apartment"
raw.loc[raw['location'].isin(raj_appartment), "location"] = "Raj's appartment"
raw.loc[raw['location'].isin(amy_appartment), "location"] = "Amy’s apartment"

In [314]:
#clean up  'Car'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
car_long_list = [s for s in locations if "car" in s]

#faulty items in list
remove = ['The living room. Leonard enters carrying a light sabre.', 'The comic book store. Raj enters carrying an iPod with a speaker on his tee shirt. As he enters the shop he starts the iPod and the Darth Vader theme from Star Wars emerges from the speaker.',
          'The hallway. Sheldon exits carrying the Apple 2.',  'The daycare.']

car_short_list = [x for x in car_long_list if x not in remove ]
raw.loc[raw['location'].isin(car_short_list), "location"] = 'A car'

In [315]:
#clean up  'laundry'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
laundry_long_list = [s for s in locations if "laun" in s]

raw.loc[raw['location'].isin(laundry_long_list), "location"] = 'The laundry room'

In [316]:
#clean up  'commic store'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
comic_long_list = [s for s in locations if "comic" in s]

raw.loc[raw['location'].isin(comic_long_list), "location"] = 'The comic book store'

In [317]:
#clean up  'cheesecake factory'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
cheese_long_list = [s for s in locations if "cheese" in s]
cheese_long_list2 = [s for s in locations if "Cheese" in s]


raw.loc[raw['location'].isin(cheese_long_list), "location"] = 'The cheesecake factory'
raw.loc[raw['location'].isin(cheese_long_list2), "location"] = 'The cheesecake factory'

In [318]:
#clean up  'cafateria'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
cafeteria_long_list = [s for s in locations if "cafet" in s]
cafeteria_long_list2 = [s for s in locations if "Cafet" in s]

raw.loc[raw['location'].isin(cafeteria_long_list), "location"] = 'The cafeteria'
raw.loc[raw['location'].isin(cafeteria_long_list2), "location"] = 'The cafeteria'

In [319]:
#clean up  'sheldon locations'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
sheldon_long_list = [s for s in locations if "Sheldon" in s]

other = [ 'The department party. Sheldon, Raj and Leonard are at the buffet table.',  'A jungle. As the camera moves, the time machine becomes visible. Sheldon is sitting in it The disk stops spinning, and he looks around. The dials read APR 28 802,701.',
         'The store. Sheldon is looking at two routers.','The store. Sheldon is on the in-store computer.',  'Howard’s bedroom. Howard is in bed, Sheldon is on a blow up mattress on the floor.'
        'The building entrance lobby. The guys enter. Sheldon is dressed as a medieval monk, Howard is a court jester, Raj is a medieval gentleman and Leonard is a knight.',
         'The Renaissance Fair. The guys are in costume. Sheldon is Spock and has a tri-corder.',  'Dr Gablehauser’s office. Sheldon and Leslie are standing across the desk.',
         'Howard’s motor scooter. Howard is driving, Sheldon is on the back clutching him for dear life and screaming.',  'The same, only Sheldon and Leonard are present. Sheldon is practicing. There are sounds of squealing tyres and brakes and general panic and mayhem.',
         'A corridor in the university. Sheldon emerges from his office in a shower cap and bathrobe. The others are turning the corner.',  'Rock climbing centre. Sheldon and Kripke are watching someone descend the practice wall.',
         'The same. Everyone is eating. Sheldon is perched on the back of Leonard’s chair. He keeps making moaning noises. Saying “Excuse me” he tries moving back to his spot, but the cushion is not there. He tries perching on the arm in various positions.',
         'The same, later. Sheldon and Penny are making barettes, singing “She’ll be coming round the mountain.”',  'Sheldon’s mother’s kitchen.',
         'Sheldon’s mother’s house.',  'Outside Howard’s house. Sheldon rings the doorbell.', 'A police cell. Three people sit on a bench inside. Sheldon points at the one on the end.',
         'Penny and Leonard are asleep in Leonard’s bed. Sheldon creeps in and tries to take the ring from round Penny’s neck, but she turns over. He tries to make buzzing fly noises to get her to turn back, but she turns all the way the other way. He then uses an extender with a claw on the end to lif the ring from Penny’s chest. As he tries to remove it, she wakes up, screams, and punches him in the face.',
         'Outside. Sheldon is sitting at a table with a sign reading “Cats $20”)',  'A Korean ballroom dancing club. Sheldon and Amy are dancing.',  'Rothman’s office. Sheldon is trying to affix paper over a fiercely blowing air vent.',
         'Rothman’s office. Sheldon has his head stuck in the hole in the wall.','A corridor in the basement. Sheldon takes out a key, unlocks a door and enters.',
         'The room in the basement. Sheldon enters, takes out a box, takes a beanbag from the box, then starts playing keepie-uppie,',
         'The parking spot. Sheldon is spraypainting out Howard’s name and replacing it with his own.',
         'The parking spot. Sheldon has set up his office in it.',
         'Vasquez Rocks. Sheldon, dressed as Data, is having his makeup put on by Raj, dressed as Worf.', 'A railway station. Sheldon is wearing no trousers.</i>',
         'Sheldon’s classroom.</i>',  'The street. Sheldon sees Amy with another man. He leaves. </i>', 'Sheldon’s childhood home.',
         'Sheldon’s old bedroom.', 'A restaurant. Sheldon makes an “unsure” noise.',
         'Sheldon, waking up in bed next to a Geology book.',  'Sheldon’s dream.', 
         'In the university, Sheldon is asleep against a vending machine.', 'A hospital room. The guys except for Sheldon are in Hazmat suits.','A park bench. Sheldon is sitting. A stranger sits next to him.',
         'Howard’s house. Sheldon rings the doorbell.',]

sheldon_amy_appartment = [ 'Sheldon and Amy’s apartment.','Sheldon and Amy’s bedroom.','Sheldon and Amy’s door.',]
 

cheesecake = [ 'Penny’s restaurant. Sheldon is on the piano, singing “To Life” from Fiddler on the Roof enthusiastically.',]

penny_appartment = [ 'Penny’s flat. Penny is on her laptop. Everything around her is littered with empty food packaging and red bull cans. She burps loudly. Sheldon is sitting on the sofa.',
                     'Outside Penny’s door. Sheldon stands looking at his watch with his hand poised to knock. At the right moment he starts knocking.', 
                      'Penny’s bedroom. Sheldon is standing over Penny’s bed, knocking on the wall.',  'Sheldon and Amy listening at Penny’s door.',]

sheldon_office = [ 'Sheldon’s office. He is making measurements on maps. There is a knock on the door.',  'Sheldon’s office. He is making annotations on his board.',
                   'Sheldon’s office.','Sheldon’s office. Sheldon is writing on the whiteboard.', 'Sheldon’s office.</i>', 'Sheldon and Raj’s office.', 'The corridor outside Sheldon’s office.',]

stair = ['The lobby. Sheldon is listening to an iPod.',  'Opening shows some scenes from the final episode of the previous season, followed by the caption “Three months later.” Scene then opens in lobby, with the guys arriving home from the North Pole. All have long hair and bushy beards except Sheldon, whose hair is slightly longer and who has a goatee.',
          'The elevator shaft. Sheldon is inside.', 'The hallway. Sheldon exits carrying the Apple 2.', 'The hallway. Sheldon-bot approaches Penny’s door and starts bashing into it.',  'The lobby. Sheldon is playing his theramin.',]


#create the main apartment list by only including items that are not in the other lists
main_appartment = [x for x in sheldon_long_list if x not in other]
main_appartment = [x for x in main_appartment if x not in  sheldon_amy_appartment ]
main_appartment = [x for x in main_appartment if x not in  cheesecake  ]
main_appartment = [x for x in main_appartment if x not in  penny_appartment  ]
main_appartment = [x for x in main_appartment if x not in  sheldon_office  ]
main_appartment = [x for x in main_appartment if x not in  stair  ]

#clean-up
raw.loc[raw['location'].isin(main_appartment ), "location"] = 'Main appartment'
raw.loc[raw['location'].isin(penny_appartment), "location"] = "Penny’s apartment"
raw.loc[raw['location'].isin(cheesecake), "location"] = 'The cheesecake factory'
raw.loc[raw['location'].isin(sheldon_office), "location"] = "Sheldon’s office"
raw.loc[raw['location'].isin(stair), "location"] = 'The stairwell'
raw.loc[raw['location'].isin(sheldon_amy_appartment), "location"] = "Sheldon and Amy's apartment"

In [320]:
#clean up  'leonard locations'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
leonard_long_list = [s for s in locations if "Leonard" in s]

penny_appartment = [ 'The hallway. Leonard knocks on Penny’s door.',  'Outside Penny’s door. Leonard knocks.',  'Leonard approaches Penny’s door and knocks.',
                     'The hallway. Leonard is outside Penny’s door.',  'Penny’s bedroom. Penny and Leonard are in bed.', 'Outside Penny’s door. Leonard knocks, Penny answers in her dressing gown.',
                    'Outside Penny’s door. Leonard is knocking.',  'Leonard and Penny are in Penny’s bed.',  'Penny’s bathroom. Leonard is vomiting in the toilet.',  'At Penny’s door. Leonard knocks three times.',]

other = [ 'The department party. Sheldon, Raj and Leonard are at the buffet table.',  'The same, later. Leonard is dressed as Frodo. Howard appears to be Peter Pan. There is a knock on the door.',
          'A corridor at the conference. Penny is attaching Leonard’s name tag.','Leonard is presenting.',  'Outside Howard’s house. Leonard rings bell.',
          'The same, only Sheldon and Leonard are present. Sheldon is practicing. There are sounds of squealing tyres and brakes and general panic and mayhem.',
          'Raj’s Apartment. Leonard knocks on the door.',   'Leonard’s old school. </i>', 'A street. Leonard is skipping ropes with some young girls. </i>',
         'Leonard’s old school.</i>',  'On the deck of a ship on the North Sea, in the middle of a storm. Leonard is on a sat-phone.',
         'Alice’s apartment. Alice and Leonard are kissing.',
         'Leonard being ejected into the corridor.',]

stairs = ['The hallway. Leonard is finishing off a bottle of spirits. He opens the lift and drops the bottle inside.',]

lab = [ 'Leonard and Lesley’s lab.',  'Leonard’s lab.',  'Leonard’s laboratory.',  'Leonard’s laboratory.</i>',  'Leonard’s lab.</i>',  'Leonard’s lab. </i>',]

#create the main apartment list by only including items that are not in the other lists
main_appartment = [x for x in leonard_long_list  if x not in other]
main_appartment = [x for x in main_appartment if x not in  penny_appartment ]
main_appartment = [x for x in main_appartment if x not in  other ]
main_appartment = [x for x in main_appartment if x not in  lab ]

#clean-up
raw.loc[raw['location'].isin(main_appartment ), "location"] = 'Main appartment'
raw.loc[raw['location'].isin(penny_appartment), "location"] = "Penny’s apartment"
raw.loc[raw['location'].isin(stairs ), "location"] = 'The stairwell'
raw.loc[raw['location'].isin(lab ), "location"] = 'The lab'

In [321]:
howard_bernadet = [ 'Howard and Bernadette’s bedroom.',  'Howard and Bernadette’s apartment',  'Howard and Bernadette’s bedroom.</i>',
                   'Howard and Bernadette’s bedroom. Howard and Bernadette are dressed as smurfs.', 'Howard and Bernadette’s house. </i>',
                   'Howard and Bernadette’s House.</i>', 'Howard and Bernadette’s dining room.',
                   'Howard and Bernadett’s house.' 'Howard and Bernadette’s living room.','Howard and Bernadette’s van.','Howard and Bernadette’s nursery.', 'Howard and Bernadette’s house. Raj rings the doorbell.',
                   'Howard and Bernadette’s minivan.', 'Howard and Bernadette’s bathroom.', 'Howard and Bernadette’s kitchen.</i>', 'Howard’s workshop.</i>',
                   'The crawlspace under Howard and Bernadette’s house.</i>', 'Stuarts former bedroom in Howard and Bernadette’s house.</i>',
                   'Howard and Bernadette’s bedroom. Howard and Bernadette are asleep.</i>','Howard and Berndette’s house.</i>','Howard and Bernadette’s patio.</i>',
                   'Howard and Bernadette’s hot tub.</i>','Howard and Bernadette’s kitchen. Screams are emanating from Bernadette’s tablet.</i>', 'Howard and Bernadette’s spare room.','Howard and Bernadette’s house.','Howard and Bernadette’s garage.',]

lab =['Howard and Lesley’s lab.',  'Howard’s lab.',  'Howard’s laboratory. Howard is soldering something.', 'Howard’s laboratory. The phone rings. Howard puts it on speaker.',  'Outside Amy’s lab. Howard knocks on the door.',]

howard_home = [ 'Outside Howard’s house.', 'Howard’s house. The door rings.','Howard’s motor scooter. Howard is driving, Sheldon is on the back clutching him for dear life and screaming.',
               'The living room. Howard on his laptop.',  'Outside Howard’s front door.','Howard’s bedroom.', 'Outside Howard’s house. Sheldon rings the doorbell.','Howard’s bathroom. He is in the bath.',
               'Howard’s bedroom. Howard is being massaged by the robot hand.', 'Howard’s bedroom. Howard enters in a silk dressing gown, puts on romantic music and sets up mood lighting. Gets onto bed.', 'Outside Howard’s house. Leonard rings bell.', 'Howard’s house.', 'Howard’s house', 'Howard’s old bedroom. A phone is ringing.','Howard’s workshop.',
               'Outside Howard’s house. Amy rings the bell.', 'Howard’s bedroom. Howard is playing with a lightsabre.','Howard’s house. Sheldon rings the doorbell.', 'Howard’s workshop</i>','Howard’s mother’s house.',]

pennies = [
 'The hallway. Howard knocks on Penny’s door with his bow.',  'Howard’s bedroom. The phone is ringing.',  'Penny’s door. Howard knocks.',]

other =[ 'The same, later. Leonard is dressed as Frodo. Howard appears to be Peter Pan. There is a knock on the door.',
        'The hospital. Howard runs in and up to the counter.',  'The top model house. The doorbell rings. One of the models answers. Howard and Raj are outside wearing blue jumpsuits.',       
        'Howard’s hotel room.', 'Howard’s hotel room. There is a knock on the door.',  'The parking spot. Sheldon is spraypainting out Howard’s name and replacing it with his own.',
        'Emily’s bedroom. Raj is still on the phone with Howard.</i>',]

main = [ 'The living room door, a fibre-optic camera emerges from underneath. We see the scene from its point of view. Cut to outside. Raj is holding a laptop, Howard is feeding the camera under the door.',
        'The living room. Raj and Howard are on the floor, fighting.',]

bernadettes_appartment = ['Bernadette’s apartment. Howard knocks on door.', ]

#clean
raw.loc[raw['location'].isin(main  ), "location"] = 'Main appartment'
raw.loc[raw['location'].isin(howard_bernadet), "location"] = "Howard and Bernadette’s apartment"
raw.loc[raw['location'].isin(lab), "location"] = 'The lab'
raw.loc[raw['location'].isin(howard_home ), "location"] = "Howard’s house"
raw.loc[raw['location'].isin(pennies), "location"] = "Penny’s apartment"
raw.loc[raw['location'].isin(bernadettes_appartment), "location"] = "Bernadette's appartment"

In [322]:
#clean up  'penny locations'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
penny_long_list = [s for s in locations if "Penny" in s]

#faulty items in list
remove = [ 'A corridor at the conference. Penny is attaching Leonard’s name tag.',  'Same – later. Penny is attacking a multi-headed monster with a sword on the screen.A muscular warrior in a cape walks onto the screen.',
          'Inside the game. Penny-warrior is standing next to a battle horse.',  'A public washroom. Amy and Penny are in cubicles, Bernadette is washing her hands.',
           'The same, later. Sheldon and Penny are making barettes, singing “She’ll be coming round the mountain.”', 'A film set. Penny is being fitted with gorilla hands.']

#create list of items belonging to pennys' appartement
penny_short_list = [x for x in penny_long_list if x not in remove ]
raw.loc[raw['location'].isin(penny_short_list ), "location"] = "Penny’s apartment"

#add to stairwell
stair = [ 'The lobby. Penny finds a woman studying the lift.', ]
raw.loc[raw['location'].isin(stair ), "location"] = 'The stairwell'


In [323]:
#clean up  'penny locations'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
amy_long_list = [s for s in locations if "Amy" in s]

lab =['Amy’s lab.',  'Amy’s lab. Amy is dissecting a brain.','Amy’s lab. She is dissecting a brain.','Amy’s laboratory.', 'Amy’s lab. Phone rings.',
      'Amy’s lab.</i>','Amy’s laboratory.</i>', 'Amy’s lab. Phone rings.', 'Amy’s lab.</i>',
 'Amy’s laboratory.</i>', 'Amy’s laboratory']

amy_appartment =[ 'Amy’s bedroom. Phone rings.', 'Amy’s bedroom. </i>',  'Amy’s bedroom.</i>',]

#clean
raw.loc[raw['location'].isin(amy_appartment), "location"] = "Amy’s apartment"
raw.loc[raw['location'].isin(lab), "location"] = 'The lab'

In [324]:
#clean up  'raj locations'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
raj_long_list = [s for s in locations if "Raj" in s]


raj_appartemnt = ['Outside Raj’s flat.' , 'Raj’s room.',  'Raj’s Apartment. Leonard knocks on the door.',  'Raj’s bedroom. Phone rings.']
lab  = [ 'Raj’s Lab.',  'The telescope lab. Raj’s phone rings.</i>',]
howard  =['Mrs Wolowitz’s house. Raj is climbing out of the window.']
raj_office = ['Raj is exiting his office.',   'Outside Raj’s office.' ,'Raj’s office.' ,'Raj’s office.</i>',]

#clean
raw.loc[raw['location'].isin(lab), "location"] = 'The lab'
raw.loc[raw['location'].isin(raj_appartemnt ), "location"] = "Raj's appartment"
raw.loc[raw['location'].isin(howard ), "location"] = "Howard’s house"
raw.loc[raw['location'].isin(raj_office ), "location"] = "Raj’s office"

In [325]:
#clean up  'Bernadette locations'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
bernadette_long_list = [s for s in locations if "Bern" in s]

bernadette_appartment = ['Bernadette’s bedroom.',"Bernadette's appartment", 'Bernadette’s apartment.',]
howard_bernadette = ['Howard and Bernadett’s house.', 'Howard and Bernadette’s living room.']

raw.loc[raw['location'].isin(bernadette_appartment ), "location"] = "Bernadette's appartment"
raw.loc[raw['location'].isin(howard_bernadette  ), "location"] = "Howard and Bernadette’s apartment"

In [326]:
#clean up  'Wolowitz locations'
locations = raw[raw['character_scene']=='Scene']['location'].unique()
howard_long_list = [s for s in locations if "Wolowitz" in s]
raw.loc[raw['location'].isin(howard_long_list  ), "location"] = "Howard’s house"

In [327]:
#bar/restaurant
locations = raw[raw['character_scene']=='Scene']['location'].unique()
bar = [s for s in locations if "bar" in s]
restaurant = [s for s in locations if "restaur" in s]

raw.loc[raw['location'].isin(bar  ), "location"] = "Bar or restaurant"
raw.loc[raw['location'].isin(restaurant ), "location"] = "Bar or restaurant"

In [328]:
#uni
locations = raw[raw['character_scene']=='Scene']['location'].unique()
university = [s for s in locations if "university" in s]
university2 = [s for s in locations if "University" in s]

raw.loc[raw['location'].isin(university ), "location"] = "University"
raw.loc[raw['location'].isin(university2 ), "location"] = "University"

In [329]:
#lobby to stairwell
raw.loc[raw['location']== 'The lobby.', "location"] = "The stairwell"
raw.loc[raw['location']== 'The hallway.', "location"] = "The stairwell"
raw.loc[raw['location']== 'A coffee shop.', "location"] = "Bar or restaurant"
raw.loc[raw['location']== 'The living room.', "location"] = "Main appartment"
raw.loc[raw['location']== 'The kitchen.', "location"] = "Main appartment"
raw.loc[raw['location']== 'The Apartment.', "location"] = "Main appartment"
raw.loc[raw['location']== 'The lab.', "location"] = "The lab"
raw.loc[raw['location']== 'The pub.</i>', "location"] = "Bar or restaurant"
raw.loc[raw['location']== 'The laboratory.', "location"] = "The lab"
raw.loc[raw['location']== 'A diner.', "location"] = "Bar or restaurant" 
raw.loc[raw['location']== 'The guys laboratory', "location"] = "The lab"
raw.loc[raw['location']== 'The Comic Book Store.', "location"] = "The comic book store"
   

In [330]:
#Same indicates that the scene continues. I will set these to np.nan and forward fill empty rows
locations = raw[raw['character_scene']=='Scene']['location'].unique()
same = [s for s in locations if "Same" in s or 'same' in s or 'moments' in s or 'Moments' in s]

raw.loc[raw['location'].isin(same ), "location"] = np.nan



In [331]:
#top 10 locations make up 73% of all scenes
locations = raw[raw['character_scene']=='Scene']
locations  = locations['location'].value_counts(normalize=True)

locations[0:10]

Main appartment                      0.327273
Penny’s apartment                    0.095900
Howard and Bernadette’s apartment    0.065597
The stairwell                        0.048841
A car                                0.048485
The cafeteria                        0.042068
Bar or restaurant                    0.035651
Howard’s house                       0.028164
The lab                              0.022816
The comic book store                 0.022103
Name: location, dtype: float64

### Empty rows

In [332]:
#if it is a scene, remove the line
#if it is not a scene, remove the location
raw.loc[raw['character_scene']=='Scene', "line"] = np.nan
raw.loc[raw['character_scene']!='Scene', "location"] = np.nan

raw.head()

Unnamed: 0,line,character_scene,season,season_episode,episode_name,episode_id,location
0,,Scene,1,1,Pilot Episode,1-1,A corridor at a sperm bank.
1,So if a photon is directed through a plane wit...,Sheldon,1,1,Pilot Episode,1-1,
2,"Agreed, what’s your point?",Leonard,1,1,Pilot Episode,1-1,
3,"There’s no point, I just think it’s a good ide...",Sheldon,1,1,Pilot Episode,1-1,
4,Excuse me?,Leonard,1,1,Pilot Episode,1-1,


In [333]:
#forward fill the location
raw["location"] = raw.groupby('episode_id')['location'].transform(lambda x: x.ffill())

### Remove scene and cation

In [334]:
#remove scene
raw = raw[raw['character_scene']!= 'Scene']
raw = raw[raw['character_scene']!= 'Caption']

In [335]:
#rename columns
raw =  raw.rename(columns={'character_scene': "character"})

In [336]:
raw.head()

Unnamed: 0,line,character,season,season_episode,episode_name,episode_id,location
1,So if a photon is directed through a plane wit...,Sheldon,1,1,Pilot Episode,1-1,A corridor at a sperm bank.
2,"Agreed, what’s your point?",Leonard,1,1,Pilot Episode,1-1,A corridor at a sperm bank.
3,"There’s no point, I just think it’s a good ide...",Sheldon,1,1,Pilot Episode,1-1,A corridor at a sperm bank.
4,Excuse me?,Leonard,1,1,Pilot Episode,1-1,A corridor at a sperm bank.
5,Hang on.,Receptionist,1,1,Pilot Episode,1-1,A corridor at a sperm bank.


### Create indexes

In [337]:
#turn season and season_episode to integers for sorting purposes
raw[['season', 'season_episode']] = raw[['season', 'season_episode']].astype('int')

#groupby (line is not actually needed here)
unique_episodes = raw.groupby(['season', 'season_episode', 'episode_id'])['line'].count().reset_index()

#turn season and episode back to string
raw[['season', 'season_episode']] = raw[['season', 'season_episode']].astype('str')

#create a series that counts from 1 to..... and use this to create a new column
episode_count = pd.Series(range(1,len(unique_episodes)+1))
unique_episodes['episode_nr'] = episode_count

#drop redundant columns
unique_episodes = unique_episodes.drop(['season', 'season_episode', 'line'], axis=1)
unique_episodes.head()

Unnamed: 0,episode_id,episode_nr
0,1-1,1
1,1-2,2
2,1-3,3
3,1-4,4
4,1-5,5


In [338]:
raw = raw.merge(unique_episodes, how =  'left', on = 'episode_id')
raw.head()

Unnamed: 0,line,character,season,season_episode,episode_name,episode_id,location,episode_nr
0,So if a photon is directed through a plane wit...,Sheldon,1,1,Pilot Episode,1-1,A corridor at a sperm bank.,1
1,"Agreed, what’s your point?",Leonard,1,1,Pilot Episode,1-1,A corridor at a sperm bank.,1
2,"There’s no point, I just think it’s a good ide...",Sheldon,1,1,Pilot Episode,1-1,A corridor at a sperm bank.,1
3,Excuse me?,Leonard,1,1,Pilot Episode,1-1,A corridor at a sperm bank.,1
4,Hang on.,Receptionist,1,1,Pilot Episode,1-1,A corridor at a sperm bank.,1


### Completely empty string

In [339]:
raw = raw.replace(r'^\s*$', np.nan, regex=True) # some completely empty lines are returned (they are not even np.nan), replace them

In [340]:
#there are about 250 records that contain some empty files in lines or character
raw = raw.dropna()
len(raw)

51376

In [341]:
raw.head()

Unnamed: 0,line,character,season,season_episode,episode_name,episode_id,location,episode_nr
0,So if a photon is directed through a plane wit...,Sheldon,1,1,Pilot Episode,1-1,A corridor at a sperm bank.,1
1,"Agreed, what’s your point?",Leonard,1,1,Pilot Episode,1-1,A corridor at a sperm bank.,1
2,"There’s no point, I just think it’s a good ide...",Sheldon,1,1,Pilot Episode,1-1,A corridor at a sperm bank.,1
3,Excuse me?,Leonard,1,1,Pilot Episode,1-1,A corridor at a sperm bank.,1
4,Hang on.,Receptionist,1,1,Pilot Episode,1-1,A corridor at a sperm bank.,1


#### Create line identifier

In [342]:
length_df = len(raw)
line_count = pd.Series(range(1,58000 ))
raw['line_id']= line_count
raw.tail()

Unnamed: 0,line,character,season,season_episode,episode_name,episode_id,location,episode_nr,line_id
51391,"Mmm. No big deal, I enjoy spending time with you.",Ramona,10,24,The Long Distance Dissonance,10-24,Sheldon’s office,231,51392
51392,"And I with you. Question, are you seeking a ro...",Sheldon,10,24,The Long Distance Dissonance,10-24,Sheldon’s office,231,51393
51393,What if I were?,Ramona,10,24,The Long Distance Dissonance,10-24,Sheldon’s office,231,51394
51394,"Well, that would raise a number of problems. W...",Sheldon,10,24,The Long Distance Dissonance,10-24,Sheldon’s office,231,51395
51395,"(Knock, knock, knock)",Sheldon,10,24,The Long Distance Dissonance,10-24,Princeton.,231,51396


### Save df

In [343]:
#save  dataframe 
raw = raw[['episode_nr','season', 'season_episode',  'episode_id',   'episode_name', 'character', 'location', 'line',  'line_id']]
raw.head()
#raw.to_csv("cleaned_episodes2.csv", index=False, sep = '|')

### Grouping by lines and character

In [349]:
character_episode = raw.copy()

#non-main actors are renamed as 'other'
main_characters = ['Sheldon', 'Leonard', 'Penny', 'Howard', 'Raj', 'Amy', 'Bernadette', 'Stuart']
character_episode.loc[~character_episode['character'].isin(main_characters), "character"] = 'Other'


#turn numerical for sorting purposes
character_episode[['season', 'season_episode']] = character_episode[['season', 'season_episode']].astype('int')

#groupby season-episode-character and sort
character_episode= character_episode.groupby(['season', 'season_episode', 'character'])['line'].apply(' '.join).reset_index()
character_episode =character_episode.sort_values(['season', 'season_episode'])

#create key and drop redundant 
character_episode['episode_id'] = (character_episode['season']).astype('str') + "-" + (character_episode['season_episode']).astype('str')
character_episode = character_episode.drop(['season', 'season_episode'], axis=1)

character_episode.head(2)

Unnamed: 0,character,line,episode_id
0,Howard,Wait till you see this. It’s a Stephen Hawking...,1-1
1,Leonard,"Agreed, what’s your point? Excuse me? One acro...",1-1


In [350]:
character_episode = character_episode[['episode_id', 'character', 'line']]
#character_episode.to_csv("lines_main_characters2.csv", index=False)