In [19]:
# Every entry doubles as a file stem: it is substituted into the
# "Wikipedia Plots/..." and "Plot Significant Words\..." paths used below.
# Franchise films live one folder down, hence the "<franchise>\<title>" form.
_STANDALONE_TITLES = [
    "Avatar",
    "BlinkTwice",
    "ChildrenOfMen",
    "CitizenKane",
    "DjangoUnchained",
    "GetOut",
    "Inception",
    "KnivesOut",
    "Mandy",
    "Parasite",
    "SnowPiercer",
    "TheGreatestShowman",
    "TheMartian",
    "TheMummy",
    "TheRevenant",
    "TheRitual",
]

# Franchise folder -> titles inside it (dict order fixes the order in `movies`).
_FRANCHISE_TITLES = {
    "Lord of the Rings": [
        "FellowshipOfTheRing",
        "ReturnOfTheKing",
        "TheTwoTowers",
    ],
    "Pirates of the Caribbean": [
        "AtWorldsEnd",
        "CurseOfTheBlackPearl",
        "DeadMansChest",
        "DeadMenTellNoTales",
        "OnStrangerTides",
    ],
    "Star Wars": [
        "StarWarsANewHope",
        "StarWarsEmpireStrikesBack",
        "StarWarsReturnOfTheJedi",
    ],
    "The Godfather": [
        "GodfatherPart1",
        "GodfatherPart2",
        "GodfatherPart3",
    ],
}

movies = _STANDALONE_TITLES + [
    f"{franchise}\\{title}"
    for franchise, titles in _FRANCHISE_TITLES.items()
    for title in titles
]


In [20]:

def get_movie_plot(movie_name: str) -> str:
    """Read a movie's plot file and return its text as one line.

    The file is looked up at ``Wikipedia Plots/<movie_name>.txt`` relative to
    the current working directory. Whitespace-only lines are dropped and every
    remaining line is stripped of leading/trailing whitespace.

    Lines are joined with a single space. Joining them with nothing in between
    would glue the last word of one line to the first word of the next
    (``"...the end.Next day..."``) before the text reaches the tokenizer.

    Args:
        movie_name: Entry from ``movies``; used verbatim as the file stem.

    Returns:
        The plot text, or ``""`` when the file holds no non-blank lines.

    Raises:
        OSError: If the plot file cannot be opened.
    """
    plotPath = fr"Wikipedia Plots/{movie_name}.txt"
    with open(plotPath) as f:
        # A whitespace-only line strips down to "", so filtering on the
        # stripped text skips exactly the lines that are blank.
        stripped_lines = (line.strip() for line in f)
        kept_lines = [text for text in stripped_lines if text]
    return " ".join(kept_lines)

In [21]:
import spacy

# Loaded spaCy pipelines keyed by model name, so a model is loaded only once
# per kernel session instead of once per call.
_NLP_PIPELINES = {}


def _get_nlp(model_name: str = "en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _NLP_PIPELINES:
        _NLP_PIPELINES[model_name] = spacy.load(model_name)
    return _NLP_PIPELINES[model_name]


def get_significant_word_counts(words: str, desired_pos: list[str]) -> dict[str, int]:
    """Count the lemmas of the tokens in ``words`` that have a wanted part of speech.

    The text is processed with the ``en_core_web_sm`` spaCy pipeline. The
    pipeline is cached after the first call, because loading it on every call
    repeats the same expensive work once per movie.

    Args:
        words: Text to analyse.
        desired_pos: ``token.pos_`` values to keep (the notebook passes
            ``["NOUN", "VERB"]``); tokens with any other tag are ignored.

    Returns:
        Mapping of lemma to number of occurrences, in first-seen order.
    """
    token_counts: dict[str, int] = {}

    for token in _get_nlp()(words):
        if token.pos_ not in desired_pos:
            continue
        # Count by lemma so inflected forms of a word share a single entry.
        lemma = token.lemma_
        token_counts[lemma] = token_counts.get(lemma, 0) + 1

    return token_counts

In [22]:
def get_movie_significant_words_to_file(movie):
    """Extract a movie's noun/verb lemma counts and save them to disk.

    The plot is read with ``get_movie_plot`` and counted with
    ``get_significant_word_counts`` (keeping "NOUN" and "VERB" tokens). The
    result is written to ``Plot Significant Words\\<movie>.txt`` as one
    ``lemma,count`` line per lemma, replacing any previous content.

    Args:
        movie: Entry from ``movies``; used verbatim as the file stem.
    """
    lemma_counts = get_significant_word_counts(get_movie_plot(movie), ["NOUN", "VERB"])
    out_path = f"Plot Significant Words\\{movie}.txt"

    # Format all rows first so the file is only opened once the data is ready.
    rows = [f"{lemma},{occurrences}\n" for lemma, occurrences in lemma_counts.items()]

    with open(out_path, "w+") as out_file:
        out_file.writelines(rows)

In [None]:
# Flip to True to rebuild every "Plot Significant Words" file from the plot
# texts. It stays off by default: the rebuild runs spaCy over every plot, and
# the later cells only need the files that are already on disk.
REGENERATE_SIGNIFICANT_WORD_FILES = False

if REGENERATE_SIGNIFICANT_WORD_FILES:
    for movie in movies:
        get_movie_significant_words_to_file(movie)

In [23]:
def retrieve_movie_significant_words_from_file(movie):
    """Load the ``word,count`` file saved for a movie into a dict.

    Reads ``Plot Significant Words\\<movie>.txt``, where each line has the
    form ``word,count``.

    * Blank lines are skipped rather than crashing on ``int("")``.
    * Each line is split on its last comma, so a word that itself contains a
      comma is read back whole instead of being cut at its first comma.

    Args:
        movie: Entry from ``movies``; used verbatim as the file stem.

    Returns:
        Mapping of word to its integer count, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line has no comma or a non-integer count.
    """
    wordsPath = f"Plot Significant Words\\{movie}.txt"

    token_counts = {}

    with open(wordsPath) as f:
        for line_number, line in enumerate(f, start=1):
            entry = line.strip()
            if not entry:
                continue
            word, separator, count_text = entry.rpartition(",")
            if not separator:
                raise ValueError(
                    f"{wordsPath}, line {line_number}: expected 'word,count', got {entry!r}"
                )
            token_counts[word] = int(count_text)

    return token_counts

In [24]:
def define_master_word_list():
    """Build the union of every movie's noun/verb lemmas and save it to disk.

    For each entry in ``movies`` the plot is read with ``get_movie_plot`` and
    analysed with ``get_significant_word_counts`` (keeping "NOUN" and "VERB").
    The distinct lemmas are printed and then written to
    ``master_word_list.txt``, one per line, replacing any previous content.

    The words are written in sorted order. Set-iteration order for strings can
    differ between interpreter runs, and the position of a word in this file
    is the position of its weight in every movie vector, so an unsorted dump
    would reshuffle the vectors each time the list is regenerated.
    """
    master_word_list = set()

    for movie in movies:
        plot = get_movie_plot(movie)
        word_counts = get_significant_word_counts(plot, ["NOUN", "VERB"])
        # Iterating a dict yields its keys, i.e. the lemmas.
        master_word_list |= set(word_counts)

    print(master_word_list)
    ordered_words = sorted(master_word_list)

    with open("master_word_list.txt", "w") as f:
        f.writelines(f"{word}\n" for word in ordered_words)

In [25]:
def retrieve_master_word_list():
    """Load ``master_word_list.txt`` as a list of words in file order.

    Each line of the file becomes one entry, with surrounding whitespace
    (including the newline) stripped.

    Returns:
        The words in the order they appear in the file.
    """
    with open("master_word_list.txt") as word_file:
        return [entry.strip() for entry in word_file]

In [26]:
# Load the saved master word list instead of rebuilding it. The position of a
# word in `master` is the position of its weight in every movie vector, and
# regenerating the file may change the word order, so the rebuild call below
# is kept disabled.
#define_master_word_list()
master = retrieve_master_word_list()    # always retrieve the list in the same order, don't redefine it

In [27]:
def get_movie_word_weight_vector(movie, master):
    """Return a movie's word counts as a normalised vector aligned with ``master``.

    The counts come from ``retrieve_movie_significant_words_from_file``.
    Element ``i`` is the movie's count for ``master[i]`` (0 when the movie
    does not contain that word) divided by the total count of all master
    words found in the movie, so a non-empty result sums to 1. Words in the
    movie's file that are not in ``master`` are ignored.

    If the movie shares no words with ``master`` the total is zero. An
    all-zero vector is returned in that case instead of raising
    ``ZeroDivisionError``.

    Args:
        movie: Entry from ``movies``.
        master: Ordered sequence of words that defines the vector positions.

    Returns:
        A list of floats with one entry per word in ``master``.
    """
    counts = retrieve_movie_significant_words_from_file(movie)
    raw_counts = [counts.get(word, 0) for word in master]
    total_count = float(sum(raw_counts))

    if total_count == 0:
        # Nothing to normalise by: no master word occurs in this movie.
        return [0.0] * len(raw_counts)

    return [count / total_count for count in raw_counts]

In [28]:
# Build one normalised word-weight vector per movie, keyed by the movie's
# entry in `movies`. Every vector has one element per word in `master`.
vectors = {}
for movie in movies:
    print(movie)  # progress marker: names the movie about to be processed
    vec = get_movie_word_weight_vector(movie, master)
    vectors[movie] = vec
# NOTE(review): this prints every weight of every vector, which produces a very
# large cell output; consider printing only a summary (e.g. len(vectors)).
print(vectors)

Avatar
BlinkTwice
ChildrenOfMen
CitizenKane
DjangoUnchained
GetOut
Inception
KnivesOut
Mandy
Parasite
SnowPiercer
TheGreatestShowman
TheMartian
TheMummy
TheRevenant
TheRitual
Lord of the Rings\FellowshipOfTheRing
Lord of the Rings\ReturnOfTheKing
Lord of the Rings\TheTwoTowers
Pirates of the Caribbean\AtWorldsEnd
Pirates of the Caribbean\CurseOfTheBlackPearl
Pirates of the Caribbean\DeadMansChest
Pirates of the Caribbean\DeadMenTellNoTales
Pirates of the Caribbean\OnStrangerTides
Star Wars\StarWarsANewHope
Star Wars\StarWarsEmpireStrikesBack
Star Wars\StarWarsReturnOfTheJedi
The Godfather\GodfatherPart1
The Godfather\GodfatherPart2
The Godfather\GodfatherPart3
{'Avatar': [0.0, 0.0, 0.005319148936170213, 0.0, 0.005319148936170213, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005319148936170213, 0.0, 0.0, 0.0, 0.0, 0.0