### Libraries

In [1]:
import re
import httplib2
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup, SoupStrainer

#### Create the directories

In [2]:
# Create the output folders used by the rest of the notebook; existing
# folders are left untouched.
for folder in (Path("Data"), Path("Data/Scripts")):
    folder.mkdir(parents=True, exist_ok=True)

#### Download the scripts of the six episodes

In [3]:
http = httplib2.Http()

scripts = []
films = ['https://imsdb.com/scripts/Star-Wars-The-Phantom-Menace.html',
         'https://imsdb.com/scripts/Star-Wars-Attack-of-the-Clones.html',
         'https://imsdb.com/scripts/Star-Wars-A-New-Hope.html',
         'https://imsdb.com/scripts/Star-Wars-The-Empire-Strikes-Back.html',
         'https://imsdb.com/scripts/Star-Wars-Return-of-the-Jedi.html']

for film in films:
    # Use the last URL path segment as the file name. str.strip() removes a
    # *set of characters* from both ends rather than a prefix, so it cannot be
    # relied on to drop the site prefix.
    film_name = film.rsplit('/', 1)[-1]
    film_name = film_name.replace('.html', '')
    film_name = film_name.replace(':', '')
    status, response = http.request(film)
    filename = './Data/Scripts/' + film_name + '.txt'
    # Only <pre> elements are parsed (presumably where the script text lives).
    for link in BeautifulSoup(response, parse_only=SoupStrainer('pre')):
        scripts = link.text
        # The file is opened in "w" mode on every iteration, so when several
        # top-level elements are yielded the last one is what stays on disk.
        with open(filename, "w", encoding='utf-8', errors='ignore') as f:
            f.write(scripts)

# Revenge of the Sith is handled separately: the whole <body> is parsed and
# everything before the first 'STAR WARS' is discarded.
scripts = []
film = 'https://imsdb.com/scripts/Star-Wars-Revenge-of-the-Sith.html'
film_name = film.rsplit('/', 1)[-1]
film_name = film_name.replace('.html', '')
film_name = film_name.replace(':', '')
status, response = http.request(film)
filename = './Data/Scripts/' + film_name + '.txt'
for link in BeautifulSoup(response, parse_only=SoupStrainer('body')):
    scripts = link.text
    title_start = scripts.find('STAR WARS')
    # find() returns -1 when the marker is missing; slicing from -1 would keep
    # only the last character, so the text is left whole in that case.
    if title_start != -1:
        scripts = scripts[title_start:]
    with open(filename, "w", encoding='utf-8', errors='ignore') as f:
        f.write(scripts)

#### Create the dataframes

In [4]:
# Empty accumulators; each episode cell concatenates its rows onto these.
# data_movie holds one row per scene, data_lines one row per spoken line.
data_movie = pd.DataFrame(
    columns=[
        'Scene_Title',
        'Scene_Description',
        'Scene_Characters',
        'Scene_Dialogue',
        'Movie',
    ]
)
data_lines = pd.DataFrame(
    columns=[
        'Character',
        'Line',
        'Movie',
    ]
)

### Functions to help us with the script

#### Split the script into rows

In [5]:
def split_script_to_rows(script):
    """Return a cleaned copy of every row in *script*.

    For each row, in this order: everything from the first ``(`` to the last
    ``)`` is removed, hyphens and ``#<digits>`` tokens are removed, tabs become
    spaces, and leading spaces are stripped. Trailing characters (including the
    newline) are kept.
    """
    def _clean(row):
        without_parens = re.sub(r'\(.*\)', '', row)
        without_marks = re.sub(r'\-|\#\d+', '', without_parens)
        return without_marks.replace('\t', ' ').lstrip(" ")

    return [_clean(row) for row in script]

#### Find the name of every scene

In [6]:
def find_scenes(temp_script):
    """Return the rows of *temp_script* that contain a scene heading.

    A row counts as a scene heading when it contains ``INT.`` or ``EXT.``
    followed by whitespace and an upper-case word. The matching row is
    returned whole, minus its trailing newline.

    Args:
        temp_script: Iterable of script rows (strings).

    Returns:
        List of heading rows, in order of appearance.
    """
    heading = re.compile(r'(((INT\.|EXT\.)\s[A-Z]+)|((INT\.|EXT\.)\s+[A-Z]+\W+.+))')
    scenes = []
    for row in temp_script:
        if heading.search(row):
            # Remove only a trailing newline. An unconditional row[:-1] would
            # cut a real character off a row that has none (e.g. the last
            # line of a file without a final newline).
            scenes.append(row[:-1] if row.endswith('\n') else row)
    return scenes

In [7]:
def find_scenes_EP_VI(temp_script):
    """Return the rows of *temp_script* that contain ``INT`` or ``EXT``.

    The match is a plain substring test, so any row containing those letters
    in upper case (for example inside a longer word) is returned. The matching
    row is returned whole, minus its trailing newline.

    Args:
        temp_script: Iterable of script rows (strings).

    Returns:
        List of matching rows, in order of appearance.
    """
    heading = re.compile(r'(INT|EXT)')
    scenes = []
    for row in temp_script:
        if heading.search(row):
            # Remove only a trailing newline. An unconditional row[:-1] would
            # cut a real character off a row that has none.
            scenes.append(row[:-1] if row.endswith('\n') else row)
    return scenes

#### Find the character from every line

In [8]:
def find_lines_character(temp_script):
    """Return the speaker name found at the start of each matching row.

    A row matches when it begins with one or two upper-case words (each at
    least two letters long) immediately followed by a newline. The newline is
    dropped from the returned name. Names are returned once per matching row,
    so repeated speakers appear repeatedly.
    """
    pattern = r'(^[A-Z]+[A-Z]+\n)|(^[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\n)'
    names = []
    for row in temp_script:
        hit = re.match(pattern, row)
        if hit is None:
            continue
        # The whole match is the name plus its trailing newline.
        names.append(hit.group(0)[:-1])
    return names

In [9]:
def find_lines_character_EP_I(temp_script):
    """Return the speaker prefix of every row written as ``NAME : text``.

    A row matches when it begins with one or two upper-case words, or a single
    word made of letters, digits, ``_`` or ``-``, followed by `` :``. Only the
    colon is dropped, so every returned name still ends with the space that
    preceded it.
    """
    pattern = r'(^[A-Z]+[A-Z]+ :)|(^[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+ :)|(^[A-Za-z0-9_-]+ :)'
    names = []
    for row in temp_script:
        hit = re.match(pattern, row)
        if hit is None:
            continue
        # The whole match is "<name> :"; cut off the colon only.
        names.append(hit.group(0)[:-1])
    return names

In [10]:
def find_lines_character_EP_III(temp_script):
    """Return the speaker name of every row written as ``NAME: text``.

    A row matches when it begins with one or two upper-case words (each at
    least two letters long) immediately followed by a colon. The colon is
    dropped from the returned name.
    """
    pattern = r'(^[A-Z]+[A-Z]+:)|(^[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+:)'
    names = []
    for row in temp_script:
        hit = re.match(pattern, row)
        if hit is None:
            continue
        # The whole match is "<name>:"; cut off the colon.
        names.append(hit.group(0)[:-1])
    return names

#### Rebuild the whole script, adding a keyword at every change of scene

In [11]:
def change_scene(script):
    """Join *script* into one string with ``SPLIT`` marking each scene heading.

    Per row: tabs become spaces and leading spaces are stripped, the
    ``INT.``/``EXT.`` heading match is replaced by ``SPLIT``, leading non-word
    characters are removed, and every character outside letters, digits,
    ``. , ? ' &``, newline and space is dropped. Rows that end up blank are
    skipped. Everything from a ``THE END`` line onward is cut from the result.
    """
    heading = r'(((INT\.|EXT\.)\s[A-Z]+)|((INT\.|EXT\.)\s+[A-Z]+\W+.+))'
    kept = []
    for raw in script:
        line = raw.replace('\t', ' ').lstrip(" ")
        line = re.sub(heading, 'SPLIT', line)
        line = re.sub(r'^\W+', r'', line)
        line = re.sub(r"[^a-zA-Z0-9.,?'&\n ]+", '', line)
        if line.strip():
            kept.append(line)

    joined = "".join(kept)
    return re.sub(r'\nTHE END\n(.|\n)*', '', joined)

In [12]:
def change_scene_EP_VI(script):
    """Join *script* into one string with ``SPLIT`` replacing ``INT``/``EXT``.

    Per row: tabs become spaces and leading spaces are stripped, every
    occurrence of ``INT`` or ``EXT`` is replaced by ``SPLIT`` (also inside
    longer words), leading non-word characters are removed, and every
    character outside letters, digits, ``. , ? ' &``, newline and space is
    dropped. Rows that end up blank are skipped. Everything from a ``THE END``
    line onward is cut from the result.
    """
    kept = []
    for raw in script:
        line = raw.replace('\t', ' ').lstrip(" ")
        line = re.sub(r'(INT|EXT)', 'SPLIT', line)
        line = re.sub(r'^\W+', r'', line)
        line = re.sub(r"[^a-zA-Z0-9.,?'&\n ]+", '', line)
        if line.strip():
            kept.append(line)
    joined = "".join(kept)
    return re.sub(r'\nTHE END\n(.|\n)*', '', joined)

#### Script's dialogues

In [13]:
def every_dialogue(text, characters, movie, scene_titles=None):
    """Split a ``SPLIT``-marked script into per-scene and per-line records.

    The text is cut at every ``SPLIT`` marker (anything before the first
    marker is discarded). Inside each scene, upper-case headings standing on
    their own line are candidate speakers; a heading counts as a speaker only
    when its stripped text is in *characters*. The piece of text that follows
    a speaker heading is that speaker's line; every other piece goes to the
    scene description.

    Args:
        text: Script text containing ``SPLIT`` markers.
        characters: Collection of known speaker names (strings).
        movie: Label stored in every returned record.
        scene_titles: Scene titles, paired positionally with the scenes found
            in *text*. When omitted, the module-level ``scenes`` list is used.

    Returns:
        ``(movie_data, dial_data)``. ``movie_data`` is a list of
        ``(title, description_pieces, speakers, lines, movie)`` tuples, with
        ``speakers`` and ``lines`` set to ``None`` for scenes without a known
        speaker; it is as long as the shorter of the titles and the scenes.
        ``dial_data`` is a list of ``(speaker, line, movie)`` tuples, one per
        spoken line.
    """
    if scene_titles is None:
        # Backward compatibility: callers that pass no titles rely on the
        # notebook-level ``scenes`` variable.
        scene_titles = scenes

    # Alternatives are listed one per entry and joined below. Continuing a
    # string literal across lines with a backslash would embed the indentation
    # of the continuation lines into the pattern as literal spaces.
    split_alternatives = (
        r"\n[A-Z]+[A-Z]+\n",
        r"\n[A-Z]+[A-Z]+\s+\n",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\s+\n",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\n",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]+\n",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]+\n",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]+\s+\n",
    )
    heading_alternatives = (
        r"\n[A-Z]+[A-Z]+\n",
        r"\n[A-Z]+[A-Z]+\s+\n",
        r"\n[A-Z]+\.\s+[A-Z]+\n",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\s+\n",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\n",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\n",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\n",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]+\n",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]+\s+\n",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]+\s+\n",
        r"\n[A-Z]+[A-Z]+\s+&\s+[A-Z]+[A-Z]+\n",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]+\n",
    )
    # Each alternative gets its own capturing group so that re.split keeps the
    # matched heading in its output.
    splitter = re.compile("|".join("(" + alt + ")" for alt in split_alternatives))
    heading_finder = re.compile("|".join("(" + alt + ")" for alt in heading_alternatives))
    known_names = set(characters)

    scene_dialogue, scene_description, char_turn = [], [], []
    every_lines_char, every_chars_line = [], []

    for scene in text.split('SPLIT')[1:]:
        # Groups that did not take part in a match come back as None.
        pieces = [piece for piece in splitter.split(scene) if piece is not None]

        # Raw headings (newlines included) whose stripped text is a known name.
        speakers = set()
        for found in heading_finder.finditer(scene):
            heading = found.group(0)
            if heading.replace('\n', '').strip() in known_names:
                speakers.add(heading)

        description, turn_names, turn_lines = [], [], []
        skip_next = False
        for index, piece in enumerate(pieces):
            if piece in speakers:
                name = piece.replace('\n', ' ').strip()
                spoken = pieces[index + 1].replace('\n', ' ')
                # Remnants of "(into comlink)" / "(over comlink)" directions.
                spoken = spoken.replace('into comlink', '').replace('over comlink', '')
                turn_names.append(name)
                turn_lines.append(spoken)
                every_lines_char.append(name)
                every_chars_line.append(spoken)
                # The following piece is the spoken line, not description.
                skip_next = True
                continue
            if skip_next:
                skip_next = False
                continue
            description.append(piece)

        scene_description.append(description)
        if speakers:
            scene_dialogue.append(turn_lines)
            char_turn.append(turn_names)
        else:
            scene_dialogue.append(None)
            char_turn.append(None)

    movie_data = [
        (title, description, names, lines, movie)
        for title, description, names, lines
        in zip(scene_titles, scene_description, char_turn, scene_dialogue)
    ]
    # One record per spoken line; the movie label is attached directly so the
    # result is not cut down to the number of scenes.
    dial_data = [
        (name, spoken, movie)
        for name, spoken in zip(every_lines_char, every_chars_line)
    ]
    return movie_data, dial_data

In [14]:
def every_dialogue_EP_I(text, characters, movie, scene_titles=None):
    """Split a ``SPLIT``-marked Episode I script into scene and line records.

    Works like the generic dialogue splitter, but a speaker heading is an
    upper-case name at the start of a line followed by two spaces, with the
    spoken text on the same line. A heading counts as a speaker only when its
    stripped text is in *characters*. The scene is additionally cut in front
    of every line that starts with two capitalised words.

    Args:
        text: Script text containing ``SPLIT`` markers.
        characters: Collection of known speaker names (strings).
        movie: Label stored in every returned record.
        scene_titles: Scene titles, paired positionally with the scenes found
            in *text*. When omitted, the module-level ``scenes`` list is used.

    Returns:
        ``(movie_data, dial_data)``. ``movie_data`` is a list of
        ``(title, description_pieces, speakers, lines, movie)`` tuples, with
        ``speakers`` and ``lines`` set to ``None`` for scenes without a known
        speaker. ``dial_data`` is a list of ``(speaker, line, movie)`` tuples,
        one per spoken line.
    """
    if scene_titles is None:
        # Backward compatibility: callers that pass no titles rely on the
        # notebook-level ``scenes`` variable.
        scene_titles = scenes

    # Alternatives are listed one per entry and joined below. Continuing a
    # string literal across lines with a backslash would embed the indentation
    # of the continuation lines into the pattern as literal spaces.
    # " {2}" is the two-space gap between a name and its spoken text.
    split_alternatives = (
        r"\n[A-Z]+[A-Z]+ {2}",
        r"\n[A-Z]+[A-Z]+\s+ {2}",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\s+ {2}",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+ {2}",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]+ {2}",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]+ {2}",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]+\s+",
    )
    heading_alternatives = (
        r"\n[A-Z]+[A-Z]+ {2}",
        r"\n[A-Z]+[A-Z]+\s+ {2}",
        r"\n[A-Z]+\.\s+[A-Z]+ {2}",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\s+ {2}",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+ {2}",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+ {2}",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+ {2}",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]+ {2}",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]+\s+ {2}",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]+\s+ {2}",
        r"\n[A-Z]+[A-Z]+\s+&\s+[A-Z]+[A-Z]+ {2}",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]+ {2}",
    )
    # Each alternative gets its own capturing group so that re.split keeps the
    # matched heading in its output. The final zero-width alternative cuts in
    # front of a line starting with two capitalised words.
    # NOTE(review): its inner group is capturing, so the looked-ahead text is
    # emitted as a piece of its own in addition to starting the next piece.
    splitter = re.compile(
        "|".join("(" + alt + ")" for alt in split_alternatives)
        + r"|(?=(\n[A-Z][a-z]+\s+[A-Z][a-z]+))"
    )
    heading_finder = re.compile("|".join("(" + alt + ")" for alt in heading_alternatives))
    known_names = set(characters)

    scene_dialogue, scene_description, char_turn = [], [], []
    every_lines_char, every_chars_line = [], []

    for scene in text.split('SPLIT')[1:]:
        # Groups that did not take part in a match come back as None.
        pieces = [piece for piece in splitter.split(scene) if piece is not None]

        # Raw headings whose stripped text is a known name.
        speakers = set()
        for found in heading_finder.finditer(scene):
            heading = found.group(0)
            if heading.replace('\n', '').strip() in known_names:
                speakers.add(heading)

        description, turn_names, turn_lines = [], [], []
        skip_next = False
        for index, piece in enumerate(pieces):
            if piece in speakers:
                name = piece.replace('\n', ' ').strip()
                spoken = pieces[index + 1].replace('\n', ' ')
                turn_names.append(name)
                turn_lines.append(spoken)
                every_lines_char.append(name)
                every_chars_line.append(spoken)
                # The following piece is the spoken line, not description.
                skip_next = True
                continue
            if skip_next:
                skip_next = False
                continue
            description.append(piece)

        scene_description.append(description)
        if speakers:
            scene_dialogue.append(turn_lines)
            char_turn.append(turn_names)
        else:
            scene_dialogue.append(None)
            char_turn.append(None)

    movie_data = [
        (title, description, names, lines, movie)
        for title, description, names, lines
        in zip(scene_titles, scene_description, char_turn, scene_dialogue)
    ]
    # One record per spoken line; the movie label is attached directly so the
    # result is not cut down to the number of scenes.
    dial_data = [
        (name, spoken, movie)
        for name, spoken in zip(every_lines_char, every_chars_line)
    ]
    return movie_data, dial_data

In [15]:
def every_dialogue_EP_III(text, characters, movie, scene_titles=None):
    """Split a ``SPLIT``-marked Episode III script into scene and line records.

    Works like the generic dialogue splitter, but a speaker heading is an
    upper-case name at the start of a line with the spoken text following on
    the same line. A heading counts as a speaker only when its stripped text
    is in *characters*.

    Args:
        text: Script text containing ``SPLIT`` markers.
        characters: Collection of known speaker names (strings).
        movie: Label stored in every returned record.
        scene_titles: Scene titles, paired positionally with the scenes found
            in *text*. When omitted, the module-level ``scenes`` list is used.

    Returns:
        ``(movie_data, dial_data)``. ``movie_data`` is a list of
        ``(title, description_pieces, speakers, lines, movie)`` tuples, with
        ``speakers`` and ``lines`` set to ``None`` for scenes without a known
        speaker. ``dial_data`` is a list of ``(speaker, line, movie)`` tuples,
        one per spoken line.
    """
    if scene_titles is None:
        # Backward compatibility: callers that pass no titles rely on the
        # notebook-level ``scenes`` variable.
        scene_titles = scenes

    # Alternatives are listed one per entry and joined below. Continuing a
    # string literal across lines with a backslash would embed the indentation
    # of the continuation lines into the pattern as literal spaces.
    split_alternatives = (
        r"\n[A-Z]+[A-Z]",
        r"\n[A-Z]+[A-Z]+\s",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\s",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]+\s+",
    )
    heading_alternatives = (
        r"\n[A-Z]+[A-Z]",
        r"\n[A-Z]+[A-Z]+\s",
        r"\n[A-Z]+\.\s+[A-Z]",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\s",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]",
        r"\n[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]+\s+[A-Z]+[A-Z]",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]",
        r"\n[A-Z]+[A-Z]+'S\s+[A-Z]+[A-Z]+\s",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]+\s",
        r"\n[A-Z]+[A-Z]+\s+&\s+[A-Z]+[A-Z]",
        r"\nMR\s+[A-Z]+[A-Z]+|MRS\s+[A-Z]+[A-Z]",
    )
    # Each alternative gets its own capturing group so that re.split keeps the
    # matched heading in its output.
    # NOTE(review): the first alternative matches any run of two or more
    # capitals after a newline, so it is tried before, and wins over, the
    # longer multi-word alternatives that start the same way.
    splitter = re.compile("|".join("(" + alt + ")" for alt in split_alternatives))
    heading_finder = re.compile("|".join("(" + alt + ")" for alt in heading_alternatives))
    known_names = set(characters)

    scene_dialogue, scene_description, char_turn = [], [], []
    every_lines_char, every_chars_line = [], []

    for scene in text.split('SPLIT')[1:]:
        # Groups that did not take part in a match come back as None.
        pieces = [piece for piece in splitter.split(scene) if piece is not None]

        # Raw headings whose stripped text is a known name.
        speakers = set()
        for found in heading_finder.finditer(scene):
            heading = found.group(0)
            if heading.replace('\n', '').strip() in known_names:
                speakers.add(heading)

        description, turn_names, turn_lines = [], [], []
        skip_next = False
        for index, piece in enumerate(pieces):
            if piece in speakers:
                name = piece.replace('\n', ' ').strip()
                spoken = pieces[index + 1].replace('\n', ' ')
                turn_names.append(name)
                turn_lines.append(spoken)
                every_lines_char.append(name)
                every_chars_line.append(spoken)
                # The following piece is the spoken line, not description.
                skip_next = True
                continue
            if skip_next:
                skip_next = False
                continue
            description.append(piece)

        scene_description.append(description)
        if speakers:
            scene_dialogue.append(turn_lines)
            char_turn.append(turn_names)
        else:
            scene_dialogue.append(None)
            char_turn.append(None)

    movie_data = [
        (title, description, names, lines, movie)
        for title, description, names, lines
        in zip(scene_titles, scene_description, char_turn, scene_dialogue)
    ]
    # One record per spoken line; the movie label is attached directly so the
    # result is not cut down to the number of scenes.
    dial_data = [
        (name, spoken, movie)
        for name, spoken in zip(every_lines_char, every_chars_line)
    ]
    return movie_data, dial_data

### The Phantom Menace

In [16]:
with open('./Data/Scripts/Star-Wars-The-Phantom-Menace.txt', "r", encoding='utf-8', errors='ignore') as f:
    script = f.readlines()

# Clean the rows once; scene titles and speaker names are both read from them.
cleaned_rows = split_script_to_rows(script)
# `scenes` must be set before the dialogue function runs: it reads this
# module-level name.
scenes = find_scenes(cleaned_rows)
# Each name comes back with a trailing space, which is cut off here.
characters = [name[:-1] for name in find_lines_character_EP_I(cleaned_rows)]
# The first hit is not a character name (hard-coded).
del characters[0]

# The SPLIT-marked text is built from the raw rows, not the cleaned ones.
text = change_scene(script)
movie_rows, dialogue_rows = every_dialogue_EP_I(text, characters, 'Episode I')
temp_movie = pd.DataFrame(
    movie_rows,
    columns=['Scene_Title', 'Scene_Description', 'Scene_Characters', 'Scene_Dialogue', 'Movie'],
)
temp_dial = pd.DataFrame(dialogue_rows, columns=['Character', 'Line', 'Movie'])

# Append this episode to the running tables.
data_movie = pd.concat([data_movie, temp_movie])
data_lines = pd.concat([data_lines, temp_dial])

### Attack of the Clones

In [17]:
with open('./Data/Scripts/Star-Wars-Attack-of-the-Clones.txt', "r", encoding='utf-8', errors='ignore') as f:
    script = f.readlines()

# Clean the rows once; scene titles and speaker names are both read from them.
cleaned_rows = split_script_to_rows(script)
# `scenes` must be set before the dialogue function runs: it reads this
# module-level name.
scenes = find_scenes(cleaned_rows)
characters = find_lines_character(cleaned_rows)

# The SPLIT-marked text is built from the raw rows, not the cleaned ones.
text = change_scene(script)
movie_rows, dialogue_rows = every_dialogue(text, characters, 'Episode II')
temp_movie = pd.DataFrame(
    movie_rows,
    columns=['Scene_Title', 'Scene_Description', 'Scene_Characters', 'Scene_Dialogue', 'Movie'],
)
temp_dial = pd.DataFrame(dialogue_rows, columns=['Character', 'Line', 'Movie'])

# Append this episode to the running tables.
data_movie = pd.concat([data_movie, temp_movie])
data_lines = pd.concat([data_lines, temp_dial])

### Revenge of the Sith

In [18]:
with open('./Data/Scripts/Star-Wars-Revenge-of-the-Sith.txt', "r", encoding='utf-8', errors='ignore') as f:
    script = f.readlines()

# Clean the rows once; scene titles and speaker names are both read from them.
cleaned_rows = split_script_to_rows(script)
# `scenes` must be set before the dialogue function runs: it reads this
# module-level name.
scenes = find_scenes(cleaned_rows)
characters = find_lines_character_EP_III(cleaned_rows)

# The SPLIT-marked text is built from the raw rows, not the cleaned ones.
text = change_scene(script)
movie_rows, dialogue_rows = every_dialogue_EP_III(text, characters, 'Episode III')
temp_movie = pd.DataFrame(
    movie_rows,
    columns=['Scene_Title', 'Scene_Description', 'Scene_Characters', 'Scene_Dialogue', 'Movie'],
)
temp_dial = pd.DataFrame(dialogue_rows, columns=['Character', 'Line', 'Movie'])

# Append this episode to the running tables.
data_movie = pd.concat([data_movie, temp_movie])
data_lines = pd.concat([data_lines, temp_dial])

### A New Hope

In [19]:
with open('./Data/Scripts/Star-Wars-A-New-Hope.txt', "r", encoding='utf-8', errors='ignore') as f:
    script = f.readlines()

# Clean the rows once; scene titles and speaker names are both read from them.
cleaned_rows = split_script_to_rows(script)
# `scenes` must be set before the dialogue function runs: it reads this
# module-level name.
scenes = find_scenes(cleaned_rows)
characters = find_lines_character(cleaned_rows)

# The SPLIT-marked text is built from the raw rows, not the cleaned ones.
text = change_scene(script)
movie_rows, dialogue_rows = every_dialogue(text, characters, 'Episode IV')
temp_movie = pd.DataFrame(
    movie_rows,
    columns=['Scene_Title', 'Scene_Description', 'Scene_Characters', 'Scene_Dialogue', 'Movie'],
)
temp_dial = pd.DataFrame(dialogue_rows, columns=['Character', 'Line', 'Movie'])

# Append this episode to the running tables.
data_movie = pd.concat([data_movie, temp_movie])
data_lines = pd.concat([data_lines, temp_dial])

### The Empire Strikes Back

In [20]:
with open('./Data/Scripts/Star-Wars-The-Empire-Strikes-Back.txt', "r", encoding='utf-8', errors='ignore') as f:
    script = f.readlines()

# Clean the rows once; scene titles and speaker names are both read from them.
cleaned_rows = split_script_to_rows(script)
# `scenes` must be set before the dialogue function runs: it reads this
# module-level name.
scenes = find_scenes(cleaned_rows)
characters = find_lines_character(cleaned_rows)

# The SPLIT-marked text is built from the raw rows, not the cleaned ones.
text = change_scene(script)
movie_rows, dialogue_rows = every_dialogue(text, characters, 'Episode V')
temp_movie = pd.DataFrame(
    movie_rows,
    columns=['Scene_Title', 'Scene_Description', 'Scene_Characters', 'Scene_Dialogue', 'Movie'],
)
temp_dial = pd.DataFrame(dialogue_rows, columns=['Character', 'Line', 'Movie'])

# Append this episode to the running tables.
data_movie = pd.concat([data_movie, temp_movie])
data_lines = pd.concat([data_lines, temp_dial])

### Return of the Jedi

In [21]:
with open('./Data/Scripts/Star-Wars-Return-of-the-Jedi.txt', "r", encoding='utf-8', errors='ignore') as f:
    script = f.readlines()

# Clean the rows once; scene titles and speaker names are both read from them.
cleaned_rows = split_script_to_rows(script)
# This episode uses the looser INT/EXT matchers for titles and SPLIT markers.
# `scenes` must be set before the dialogue function runs: it reads this
# module-level name.
scenes = find_scenes_EP_VI(cleaned_rows)
characters = find_lines_character(cleaned_rows)

# The SPLIT-marked text is built from the raw rows, not the cleaned ones.
text = change_scene_EP_VI(script)
movie_rows, dialogue_rows = every_dialogue(text, characters, 'Episode VI')
temp_movie = pd.DataFrame(
    movie_rows,
    columns=['Scene_Title', 'Scene_Description', 'Scene_Characters', 'Scene_Dialogue', 'Movie'],
)
temp_dial = pd.DataFrame(dialogue_rows, columns=['Character', 'Line', 'Movie'])

# Append this episode to the running tables.
data_movie = pd.concat([data_movie, temp_movie])
data_lines = pd.concat([data_lines, temp_dial])

#### Save dataframes

In [22]:
# Persist the per-scene table accumulated over all episodes.
data_movie.to_pickle('./Data/Movies.pkl')
# NOTE(review): data_lines is not saved here; ./Data/Dialogues.pkl is written
# later in this file from the `dial` table rebuilt out of data_movie.
#data_lines.to_pickle('./Data/Dialogues.pkl')

#### Create dataframe for line and character

In [23]:
temp_df = data_movie.reset_index(drop=True)

# Flatten the per-scene lists into one (Character, Line, Movie) record per
# spoken line. Records are collected in a plain list and turned into a
# DataFrame once: DataFrame.append copied the whole frame on every call and
# is no longer available in pandas 2.x.
dial_rows = []
for _, scene in temp_df.iterrows():
    # Scenes without a known speaker carry no list in these columns.
    if not isinstance(scene['Scene_Characters'], list):
        continue
    for temp_char, temp_line in zip(scene['Scene_Characters'], scene['Scene_Dialogue']):
        dial_rows.append((temp_char, temp_line, scene['Movie']))

dial = pd.DataFrame(dial_rows, columns=['Character', 'Line', 'Movie'])

#### Save dataframe

In [24]:
# Persist the per-line dialogue table (columns Character, Line, Movie).
dial.to_pickle('./Data/Dialogues.pkl')