## Workflow Setup

### Upload Google Translate API Key (JSON)

In [None]:
from google.colab import files
import os
# Upload the Google Translate credentials JSON via google.colab's
# files.upload(); `uploaded` keeps whatever that call returns.
uploaded = files.upload()

Saving google_translate_token.json to google_translate_token (1).json


In [None]:
# Expose the uploaded JSON key through GOOGLE_APPLICATION_CREDENTIALS
# (presumably read by translate.Client() further down — confirm).
# NOTE(review): the captured upload output shows the file being saved as
# "google_translate_token (1).json", while this path uses the un-suffixed
# name — verify that it points at the intended key file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="google_translate_token.json"

### Binding to Google Translate API

In [None]:
import six
from google.cloud import translate_v2 as translate

def translate_text(target, text):
    """
    Translate ``text`` into the ``target`` language via the Google Translate v2 client.

    Args:
        target: Language code forwarded as ``target_language`` (e.g. ``"ru"``).
        text: Text to translate, either ``str`` or UTF-8 encoded ``bytes``.

    Returns:
        The ``"translatedText"`` entry of the API result. An empty input
        returns an empty string without creating a client or calling the API.
    """
    # Accept raw bytes too. The builtin ``bytes`` is what ``six.binary_type``
    # resolves to on Python 3, so the compatibility shim is not needed here.
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # Nothing to translate: avoid client construction and a network round trip.
    if text == "":
        return ""

    translate_client = translate.Client()
    result = translate_client.translate(text, target_language=target)
    return result["translatedText"]

## Web Crawling

In [None]:
import io
import re
import requests
import string
from bs4 import BeautifulSoup

In [None]:
# Upper-cased speaker labels whose lines are collected separately per season.
main_characters = ['CHANDLER', 'JOEY', 'MONICA', 'PHOEBE', 'ROSS', 'RACHEL']
# Index page of the transcript site; episode URLs are built as url_base + "/" + href.
url_base = 'https://www.drodd.com/friends'
# Number of seasons; sizes the per-season dictionaries and bounds the season loops.
SEASON_COUNT = 10

In [None]:
# Download the index page and keep every anchor whose markup does not
# contain the substring 'name'.
index_page = requests.get(url_base)
index_soup = BeautifulSoup(index_page.content)
episode_anchors = [a for a in index_soup.find_all('a') if 'name' not in str(a)]

# Pair each href's episode code (href minus its first 14 and last 4
# characters) with the absolute URL, then drop the first entry and the last
# three entries of the list.
links_by_episode = [
    (a['href'][14:-4], url_base + "/" + a['href'])
    for a in episode_anchors
][1:-3]

# Episode codes for which int(code) // 100 would not yield the season number.
irregular_codes = {'212_213': 2, '615616': 6, '723.': 7}

links = []
for code, episode_url in links_by_episode:
    if code in irregular_codes:
        season_number = irregular_codes[code]
    else:
        # Regular codes are <season><two-digit episode>, e.g. 1001 -> 10.
        season_number = int(code) // 100
    links.append((season_number, episode_url))

# Links to every episode, now keyed by season number
links_by_episode = links

# One placeholder entry per season (1..SEASON_COUNT), filled in later.
all_scripts = {season: {} for season in range(1, SEASON_COUNT + 1)}
scripts_with_context = {season: {} for season in range(1, SEASON_COUNT + 1)}

## Data Preprocessing

In [None]:
# For every season build
#   * all_scripts[season]: {CHARACTER: that character's lines joined by spaces}
#   * scripts_with_context[season]: the "SPEAKER: text" lines of all episodes.
# The upper bound is SEASON_COUNT + 1 so that the last season is processed as
# well (range() excludes its stop value).
for season in range(1, SEASON_COUNT + 1):
    lines_by_character = {character: [] for character in main_characters}
    context_lines = []

    for num, lnk in links_by_episode:
        if num != season:
            continue

        r = requests.get(lnk)
        soup = BeautifulSoup(r.content)
        paragraphs = soup.find_all('p')
        # The transcript is taken from the fourth <p>; report and skip pages
        # that do not have one instead of failing with an IndexError.
        if len(paragraphs) < 4:
            print(f"Skipping {lnk}: expected at least 4 <p> elements, found {len(paragraphs)}")
            continue
        script_raw = paragraphs[3]

        # Turn the paragraph markup into plain text: a double <br/> separates
        # lines, a single <br/> becomes a space.
        script = str(script_raw).replace("<p>", "").replace("</p>", "").replace("<br/><br/>", "\n").replace("<br/>", " ")
        script = script.replace('\r\n', '')
        # Remove [bracketed] and (parenthesised) spans.
        script = re.sub(r'\[(.*?)\]', '', script)
        script = re.sub(r'\(.*?\)', '', script)
        # Collapse repeated spaces / newlines and strip a space after a newline.
        script = re.sub(' +', ' ', script)
        script = re.sub('\n+', '\n', script)
        script = re.sub('\n ', '\n', script)
        script = re.sub('\n\n+', '\n', script)
        # Drop scene headers (case-sensitive match) and empty lines.
        script = [
            i for i in script.split('\n')
            if i != '' and not re.search('SCENE [0-9]+:|SCENE:', i)
        ]

        for line in script:
            idx = line.find(':')
            if idx == -1:
                # No "SPEAKER:" prefix on this line; nothing to attribute.
                continue

            cur_char = line[:idx].upper()
            context_lines.append(cur_char + line[idx:] + '\n')
            if cur_char in main_characters:
                lines_by_character[cur_char].append(line[idx + 1:].strip() + ' ')

    # Join once per season instead of growing strings line by line.
    script_by_character = {
        character: ''.join(chunks) for character, chunks in lines_by_character.items()
    }
    current_script_with_context = ''.join(context_lines)

    all_scripts[season] = script_by_character
    scripts_with_context[season] = current_script_with_context

In [None]:
# Preview the first 1000 characters of the season 9 transcript.
print(scripts_with_context[9][:1000])

Written by: Sherry Bilsing-Graham &amp; Ellen Plummer Transcribed by: <a href="mailto:webmaster@thecfsi.com">Eric Aasen</a>
Written by: Dana Klein Borkow Transcribed by: <a href="mailto:webmaster@thecfsi.com">Eric Aasen</a>
<p class="spoiler">Scene: 
 Written by: Scott Silveri 
Phoebe: Oh hey you guys, I couldn't get a reservation for the night of my birthday, so we have to do dinner Thursday night instead. 
Joey: Thursday? But that's Halloween. 
Phoebe: So? 
Joey: So spooky, that's all. 
Ross: So, so, is Mike coming to dinner? 
Phoebe: No! It's my first birthday with a boyfriend, and he has to work. Ugh, I get mad at him, but I think it's a little to soon to show my true colors. 
Rachel: Pheebs, I would make a reservation for five, because one of us has to stay home and watch Emma. Which one of us should go to dinner? 
Phoebe: Oh, Rachel! 
Ross: Actually, um, I was thinking maybe both of us could go. 
Phoebe: Oh, yay! 
Ross: Thanks, I'll put a lot of extra thought into your gift. 
Pho

In [None]:
# Display the collected English text: {season: {CHARACTER: text}}.
all_scripts

{1: {'CHANDLER': 'So does he have a hump? A hump and a hairpiece? Sounds like a date to me. Alright, so I\'m back in high school, I\'m standing in the middle of the cafeteria, and I realise I am totally naked. Then I look down, and I realise there\'s a phone... there. That\'s right. All of a sudden, the phone starts to ring. And it turns out it\'s my mother, which is very weird, because- she never calls me! Cookie? Sometimes I wish I was a lesbian... Did I say that out loud? And I just want a million dollars! Ooh, she should not be wearing those pants. Please don\'t do that again, it\'s a horrible sound. Ooh, this is a Dear Diary moment. I\'m sorry, I didn\'t catch your name. Paul, was it? Yes, and we\'re very excited about it. I have no idea. All finished! Oh, man. Stay out of my freezer! That is amazing. Hi, Paul, is it? All right, kids, I gotta get to work. If I don\'t input those numbers,... it doesn\'t make much of a difference... \'Look, Gippetto, I\'m a real live boy.\' You\'re 

## Translation of scripts

In [None]:
# Russian translation of every character's text, per season:
# {season: {CHARACTER: translated text}}.
all_scripts_ru = {i + 1: {} for i in range(SEASON_COUNT)}

# Iterate over the seasons actually present in all_scripts rather than over a
# hand-written range, so no season is skipped by an off-by-one bound; seasons
# that are still empty placeholders simply stay empty.
for season, script_by_character in all_scripts.items():
    all_scripts_ru[season] = {
        character: translate_text("ru", text)
        for character, text in script_by_character.items()
    }

In [None]:
# Display the translated text: {season: {CHARACTER: text}}.
all_scripts_ru

{1: {'CHANDLER': 'Так есть ли у него горбинка? Горбинка и шиньон? Для меня это похоже на свидание. Хорошо, я вернулся в старшую школу, стою посреди кафетерия и понимаю, что полностью голый. Затем я смотрю вниз и понимаю, что там телефон ... там. Верно. Внезапно телефон начинает звонить. Оказывается, это моя мама, что очень странно, потому что она мне никогда не звонит! Cookie? Иногда мне хочется быть лесбиянкой ... Я сказал это вслух? А мне просто нужен миллион долларов! Ох, ей не следует носить эти штаны. Пожалуйста, не делай этого снова, это ужасный звук. Ох, это момент Дорогого дневника. Извините, я не расслышал ваше имя. Пол, это было? Да, и мы очень рады этому. Не имею представления. Все закончено! О чувак. Держись подальше от моей морозильной камеры! Это удивительно. Привет, Пол, не так ли? Ладно, дети, мне нужно работать. Если я не ввожу эти числа ... это не имеет большого значения ... «Послушай, Гиппетто, я настоящий живой мальчик». Ты прав, мне очень жаль. «Когда-то я был дере

## Downloading results

In [None]:
%%bash

# Create one output directory per main character under english/ and russian/.
# -p creates missing parent directories and does not fail if they already exist.
for value in CHANDLER JOEY MONICA PHOEBE ROSS RACHEL
do
     mkdir -p english/$value russian/$value
done

# Directory for the files written to english/full_scripts/<season>.txt later on.
mkdir -p english/full_scripts

In [None]:
!ls english/ -l

total 28
drwxr-xr-x 2 root root 4096 Jul  6 14:18 CHANDLER
drwxr-xr-x 2 root root 4096 Jul  6 14:18 full_scripts
drwxr-xr-x 2 root root 4096 Jul  6 14:18 JOEY
drwxr-xr-x 2 root root 4096 Jul  6 14:18 MONICA
drwxr-xr-x 2 root root 4096 Jul  6 14:18 PHOEBE
drwxr-xr-x 2 root root 4096 Jul  6 14:18 RACHEL
drwxr-xr-x 2 root root 4096 Jul  6 14:18 ROSS


In [None]:
!ls russian/ -l

total 24
drwxr-xr-x 2 root root 4096 Jul  6 14:18 CHANDLER
drwxr-xr-x 2 root root 4096 Jul  6 14:18 JOEY
drwxr-xr-x 2 root root 4096 Jul  6 14:18 MONICA
drwxr-xr-x 2 root root 4096 Jul  6 14:18 PHOEBE
drwxr-xr-x 2 root root 4096 Jul  6 14:18 RACHEL
drwxr-xr-x 2 root root 4096 Jul  6 14:18 ROSS


In [None]:
# Write each character's English text to english/<CHARACTER>/<season>.txt.
# The target directories are expected to exist already (see the mkdir cell).
for season, script in all_scripts.items():
    for person, lines in script.items():
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding when the transcript contains non-ASCII characters.
        with open(f"english/{person}/{season}.txt", "w", encoding="utf-8") as text_file:
            text_file.write(lines)

In [None]:
# Write each character's translated text to russian/<CHARACTER>/<season>.txt.
# The target directories are expected to exist already (see the mkdir cell).
for season, script in all_scripts_ru.items():
    for person, lines in script.items():
        # Explicit UTF-8: the text is Cyrillic, and open() would otherwise use
        # the platform's default encoding, which may be unable to encode it.
        with open(f"russian/{person}/{season}.txt", "w", encoding="utf-8") as text_file:
            text_file.write(lines)

In [None]:
# Write each season's full transcript to english/full_scripts/<season>.txt.
for season, s_w_context in scripts_with_context.items():
    # Seasons that were never filled in still hold their {} placeholder;
    # only real transcript strings are written.
    if isinstance(s_w_context, str):
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding when the transcript contains non-ASCII characters.
        with open(f"english/full_scripts/{season}.txt", "w", encoding="utf-8") as text_file:
            text_file.write(s_w_context)

In [None]:
!ls english/CHANDLER/ -l

total 340
-rw-r--r-- 1 root root 30647 Jul  6 14:18 1.txt
-rw-r--r-- 1 root root 39899 Jul  6 14:18 2.txt
-rw-r--r-- 1 root root 44932 Jul  6 14:18 3.txt
-rw-r--r-- 1 root root 41339 Jul  6 14:18 4.txt
-rw-r--r-- 1 root root 40876 Jul  6 14:18 5.txt
-rw-r--r-- 1 root root 54481 Jul  6 14:18 6.txt
-rw-r--r-- 1 root root 37662 Jul  6 14:18 7.txt
-rw-r--r-- 1 root root  7073 Jul  6 14:18 8.txt
-rw-r--r-- 1 root root 36380 Jul  6 14:18 9.txt


In [None]:
!tar -czvf scripts.tar.gz english/ russian/

english/
english/RACHEL/
english/RACHEL/4.txt
english/RACHEL/5.txt
english/RACHEL/7.txt
english/RACHEL/8.txt
english/RACHEL/1.txt
english/RACHEL/3.txt
english/RACHEL/9.txt
english/RACHEL/2.txt
english/RACHEL/6.txt
english/full_scripts/
english/full_scripts/4.txt
english/full_scripts/5.txt
english/full_scripts/7.txt
english/full_scripts/8.txt
english/full_scripts/1.txt
english/full_scripts/3.txt
english/full_scripts/9.txt
english/full_scripts/10.txt
english/full_scripts/2.txt
english/full_scripts/6.txt
english/JOEY/
english/JOEY/4.txt
english/JOEY/5.txt
english/JOEY/7.txt
english/JOEY/8.txt
english/JOEY/1.txt
english/JOEY/3.txt
english/JOEY/9.txt
english/JOEY/2.txt
english/JOEY/6.txt
english/CHANDLER/
english/CHANDLER/4.txt
english/CHANDLER/5.txt
english/CHANDLER/7.txt
english/CHANDLER/8.txt
english/CHANDLER/1.txt
english/CHANDLER/3.txt
english/CHANDLER/9.txt
english/CHANDLER/2.txt
english/CHANDLER/6.txt
english/ROSS/
english/ROSS/4.txt
english/ROSS/5.txt
english/ROSS/7.txt
english/ROSS

In [None]:
from google.colab import files
# Hand the scripts.tar.gz archive to google.colab's files.download().
files.download('scripts.tar.gz') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>