What are the available fields?

There is no "description" or "plot summary" field for the movies.
So must one look at all the dialogs to infer the movie genre?

How can we do this?
Let's do this in a naive way first, then let's try to get creative.

---

Structure of the data:
- movies
    - conversations
        - dialogs
            - sentences
                - words

---

## Naive way:

- Average all word embeddings in a dialog to get dialog embeddings.
- Average all dialog embeddings to get movie plot embedding.

Thought process:
  - We get a sentence embedding by averaging the word embeddings
  - In the same way, can we get "scene" embedding by averaging all the sentence embeddings in a scene?
  - Then averaging all scene embeddings should give us movie embedding.
  - We don't have scenes, do we? If not, we can skip scenes. But will averaging all dialog embeddings directly still give us a movie embedding?

Problems:
- The conversations might mostly include "everyday lines" that do not relate to the plot.
- For example, "You must learn how to lie", "they have to!" do not say anything about the plot.

## Another method:

- Find all named entities (of some relevant category).
- Then get the dialogs with named entities only.
- Assumption: dialogs with named entities will contain plot-related information.
-

---

Extracting all information about a movie.


In [1]:
from os import listdir
from os.path import join
from typing import List, Tuple, Dict, Optional
import pandas as pd
from pydantic import BaseModel

In [2]:

class Movie(BaseModel):
    """A movie plus the characters and conversations attached to it later."""

    id: str
    title: str
    release_year: int
    imdb_rating: float
    num_votes: int
    genres: List[str] = []
    # Filled in by update_movie_characters: character id -> Character.
    characters: dict = {}
    # Filled in by update_movie_dialogs: Conversation objects in file order.
    conversations: list = []


class Character(BaseModel):
    """A character of a movie, identified by its ``id``."""

    name: str
    id: str
    gender: Optional[str]  # m|f|None
    credit_position: Optional[int]
    movie: Movie

    def __eq__(self, other):
        """Return True when ``other`` is a Character with the same ``id``.

        Only the id is compared; all other fields are ignored. For operands
        that are not Character instances ``NotImplemented`` is returned so
        that Python falls back to its default comparison instead of raising
        AttributeError (or matching an unrelated object that happens to
        expose an equal ``id``).
        """
        if not isinstance(other, Character):
            return NotImplemented
        return self.id == other.id


class Dialog(BaseModel):
    """A single spoken line: who says it, to whom, and the text."""

    id: str
    speaker: Character
    # Not passed when get_all_dialogs builds the object; assigned afterwards
    # by update_movie_dialogs.
    listener: Optional[Character]
    # The text of the line.
    dialog: str


class Conversation(BaseModel):
    """An exchange between two characters, as an ordered list of dialogs."""

    # The two participants, in the order they appear in the conversations file.
    characters: Tuple[Character, Character]
    dialogs: List[Dialog]

In [3]:
def update_movie_characters(movies: Dict[str, Movie], characters_path: str):
    """Read the characters file and register every character on its movie.

    Args:
        movies: Mapping of movie id to ``Movie``; each movie's ``characters``
            dict is updated in place (character id -> ``Character``).
        characters_path: Path of the characters metadata file, read as
            ISO-8859-1 with six " +++$+++ "-separated fields per line.

    Raises:
        AssertionError: If a line references a movie id missing from ``movies``.
    """
    with open(characters_path, "r", encoding="ISO-8859-1") as handle:
        for raw_line in handle:
            # sample: u0 +++$+++ BIANCA +++$+++ m0 +++$+++ 10 things i hate about you +++$+++ f +++$+++ 4
            fields = raw_line.strip().split(" +++$+++ ")
            character_id, name, movie_id, movie_title, gender, position = fields

            # A "?" in the gender or position column is stored as None.
            if gender == "?":
                gender = None
            if position == "?":
                position = None

            assert movie_id in movies
            owner = movies[movie_id]

            owner.characters[character_id] = Character(
                name=name,
                id=character_id,
                gender=gender,  # m|f|None
                credit_position=position,
                movie=owner,
            )


In [4]:

def get_all_dialogs(movies: Dict[str, Movie], lines_path: str) -> Dict[str, Dialog]:
    """Build a ``Dialog`` for every line of the lines file, keyed by line id.

    Args:
        movies: Mapping of movie id to ``Movie`` whose ``characters`` are
            already populated; used to resolve each line's speaker.
        lines_path: Path of the lines file, read as ISO-8859-1.

    Returns:
        Mapping of dialog id to ``Dialog`` (speaker set, listener not set).

    Raises:
        AssertionError: If a line references an unknown movie or character.
    """
    dialogs_by_id = {}
    with open(lines_path, "r", encoding="ISO-8859-1") as handle:
        for raw_line in handle:
            # Sample: L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
            # Note: the text field may be empty, in which case the trailing
            #       ' +++$+++ ' separator loses its final space. Split on the
            #       bare marker and strip each field instead.
            fields = [part.strip() for part in raw_line.strip().split("+++$+++")]
            line_id, character_id, movie_id, character_name, text = fields

            assert movie_id in movies
            cast = movies[movie_id].characters

            assert character_id in cast

            dialogs_by_id[line_id] = Dialog(
                id=line_id, speaker=cast[character_id], dialog=text
            )
    return dialogs_by_id


In [5]:

def update_movie_dialogs(
    movies: Dict[str, Movie], conversations_path: str, lines_path: str
):
    """Attach conversations to their movies and set each dialog's listener.

    Args:
        movies: Mapping of movie id to ``Movie`` with characters populated;
            each movie's ``conversations`` list is appended to in place.
        conversations_path: Path of the conversations file (ISO-8859-1),
            four " +++$+++ "-separated fields per line.
        lines_path: Path of the lines file, forwarded to ``get_all_dialogs``.

    Raises:
        AssertionError: If a movie, character or dialog id cannot be resolved.
    """
    dialog_index: Dict[str, Dialog] = get_all_dialogs(movies, lines_path)

    with open(conversations_path, "r", encoding="ISO-8859-1") as handle:
        for raw_line in handle:
            # Sample: u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
            fields = raw_line.strip().split(" +++$+++ ")
            first_id, second_id, movie_id, id_list = fields

            # Turn the textual list "['L198', 'L199']" into plain ids.
            line_ids = id_list.strip("][").replace("'", "").split(", ")

            assert movie_id in movies
            movie = movies[movie_id]
            cast = movie.characters

            assert first_id in cast
            assert second_id in cast

            first = cast[first_id]
            second = cast[second_id]

            spoken = []
            for line_id in line_ids:
                assert line_id in dialog_index
                line = dialog_index[line_id]
                # The listener is whichever participant is not the speaker.
                if second == line.speaker:
                    line.listener = first
                else:
                    line.listener = second
                spoken.append(line)

            movie.conversations.append(
                Conversation(characters=(first, second), dialogs=spoken)
            )


In [6]:

def _preprocess_year(year: str) -> int:
    if "/I" in year:
        year = year.replace("/I", "")
    year = int(year)
    return year


def get_movies_dict(movie_titles_metadata_path: str) -> Dict[str, Movie]:
    """Parse the movie-titles metadata file into ``Movie`` objects.

    Every line must contain six " +++$+++ "-separated fields: movie id,
    title, release year, rating, vote count and a bracketed, quoted,
    comma-separated genre list.

    Args:
        movie_titles_metadata_path: Path of the metadata file (ISO-8859-1).

    Returns:
        Mapping of movie id to the ``Movie`` built from that line, with
        empty ``characters`` and ``conversations``.
    """
    movies_dict = {}
    with open(movie_titles_metadata_path, "r", encoding="ISO-8859-1") as movies_file:
        for movie in movies_file:
            movie = movie.strip().split(" +++$+++ ")
            movie_id, title, year, rating, votes, genres = movie
            # Replace list representation with list of strings. Splitting an
            # empty list ("[]") yields [""], so empty names are dropped and a
            # movie without genres ends up with [] rather than [""].
            genres = [
                genre
                for genre in genres.strip("][").replace("'", "").split(", ")
                if genre
            ]
            year = _preprocess_year(year)
            rating = float(rating)

            movie = Movie(
                id=movie_id,
                title=title,
                release_year=year,
                imdb_rating=rating,
                num_votes=votes,
                genres=genres,
            )

            movies_dict[movie_id] = movie

    return movies_dict


In [7]:

def get_data(path: str):
    """Load the whole corpus found in directory ``path``.

    Movies are read first, then characters are attached to them, then the
    conversations (with their dialogs) are attached.

    Args:
        path: Directory holding the four corpus text files.

    Returns:
        Mapping of movie id to a fully populated ``Movie``.

    Raises:
        AssertionError: If the titles or characters metadata file is missing
            from ``path``.
    """
    titles_name = "movie_titles_metadata.txt"
    characters_name = "movie_characters_metadata.txt"

    present = listdir(path)
    assert titles_name in present
    assert characters_name in present

    # Note: loading the titles file through pandas.read_table was considered,
    # but it will likely use more memory than the line-by-line parser.
    movies_dict = get_movies_dict(join(path, titles_name))

    # For each movie: (a) attach all characters, (b) attach all conversations.
    update_movie_characters(movies_dict, join(path, characters_name))
    update_movie_dialogs(
        movies_dict,
        join(path, "movie_conversations.txt"),
        join(path, "movie_lines.txt"),
    )

    return movies_dict


# NOTE: hard-coded absolute path; adjust to wherever the corpus files live.
data_path = "/Users/akhil/code/lexical_lab/companies/ginger/data/cornell"
# x maps movie id -> Movie (the return value of get_data).
x = get_data(data_path)

In [8]:
# Peek at the first five movie ids.
list(x.keys())[:5]

['m0', 'm1', 'm2', 'm3', 'm4']

In [9]:
# Pick the movie with id 'm0' for inspection.
m0 = x['m0']

In [10]:
# Peek at the first five character ids registered on that movie.
list(m0.characters.keys())[:5]

['u0', 'u1', 'u2', 'u3', 'u4']

In [11]:
# Take the first five conversations and show the text of the very first dialog.
conversations = m0.conversations[:5]
conversations[0].dialogs[0].dialog

'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.'