In [47]:
import pickle

with open("Data\\movies_by_id.bin", "rb") as f:
    movies_by_id = pickle.load(f)
with open("Data\\genres_by_id.bin", "rb") as f:
    genres_by_id = pickle.load(f)
with open("Data\\plots_by_id.bin", "rb") as f:
    plots_by_id = pickle.load(f)

ids = list(plots_by_id.keys())

In [88]:
genres_by_id['0120382']

['Comedy', 'Drama']

In [41]:
genres_set = set()
genre_counts = {}

for id, genres in genres_by_id.items():
    for genre in genres:
        genres_set.add(genre)
        if genre not in genre_counts:
            genre_counts[genre] = 0
        genre_counts[genre] += 1
    
genre_master_list = list(genres_set)

In [76]:
genre_master_list

['Crime',
 'Action',
 'Western',
 'Romance',
 'Music',
 'Family',
 'Drama',
 'Fantasy',
 'Animation',
 'Comedy',
 'Sport',
 'Musical',
 'Film-Noir',
 'Adventure',
 'Sci-Fi',
 'Horror',
 'Mystery',
 'War',
 'Thriller',
 'Biography',
 'History']

In [77]:
genre_master_list

with open("Data\\genre_master_list.bin", "wb+") as f:
    pickle.dump(genre_master_list, f)

In [43]:
movie_count = len(ids)

genre_probabilities = {}

for genre, count in genre_counts.items():
    genre_probabilities[genre] = count / movie_count

genre_probabilities

{'Drama': 0.7228915662650602,
 'Crime': 0.20983935742971888,
 'Action': 0.1897590361445783,
 'Adventure': 0.19578313253012047,
 'Biography': 0.10943775100401607,
 'History': 0.055220883534136546,
 'Sci-Fi': 0.06726907630522089,
 'Romance': 0.12550200803212852,
 'Western': 0.020080321285140562,
 'Fantasy': 0.06626506024096386,
 'Comedy': 0.23393574297188754,
 'Thriller': 0.13654618473895583,
 'Animation': 0.0823293172690763,
 'Family': 0.05622489959839357,
 'War': 0.05120481927710843,
 'Mystery': 0.09839357429718876,
 'Music': 0.035140562248995984,
 'Horror': 0.0321285140562249,
 'Musical': 0.01706827309236948,
 'Film-Noir': 0.019076305220883535,
 'Sport': 0.019076305220883535}

In [44]:
import spacy

word_classes = ["NOUN", "VERB", "ADJ"]

nlp = spacy.load("en_core_web_sm")
keywords_set = set()
keywords_by_id = {}

for id, plot in plots_by_id.items():
    doc = nlp(plot)
    keywords_by_id[id] = set()

    for token in doc:
        if token.pos_ not in word_classes:
            continue
        lem = token.lemma_
        keywords_by_id[id].add(lem)
        keywords_set.add(lem)


In [48]:
keywords_master_list = list(keywords_set)

with open("Data\\keywords_master_list.bin", "wb+") as f:
    pickle.dump(keywords_master_list, f)

In [49]:
keyword_counts = {}

for keyword in keywords_master_list:
    keyword_counts[keyword] = 0
    for id, movie_keywords in keywords_by_id.items():
        if keyword in movie_keywords:
            keyword_counts[keyword] += 1

In [50]:
len(keyword_counts)

18249

In [51]:
trimmed_keyword_counts = {}
threshold = 20

for keyword, count in keyword_counts.items():
    if count >= threshold:
        trimmed_keyword_counts[keyword] = count

trimmed_keyword_list = list(trimmed_keyword_counts.keys())

In [52]:
len(trimmed_keyword_list)

2510

In [None]:
trimmed_keyword_list

In [54]:
idx_by_genre = {}

for i, genre in enumerate(genre_master_list):
    idx_by_genre[genre] = i

idx_by_keyword = {}

for i, keyword in enumerate(trimmed_keyword_list):
    idx_by_keyword[keyword] = i

In [25]:
import numpy as np

lg = len(genre_master_list)
genre_vectors_by_id = {}

for id, genres in genres_by_id.items():
    vec = np.zeros(lg)
    
    for genre in genres:
        idx = idx_by_genre[genre]
        vec[idx] = 1
    
    genre_vectors_by_id[id] = vec

kg = len(trimmed_keyword_list)
keyword_vectors_by_id = {}

for id, keywords in keywords_by_id.items():
    vec = np.zeros(kg)

    for keyword in keywords:
        if keyword not in trimmed_keyword_list:
            continue

        idx = idx_by_keyword[keyword]
        vec[idx] = 1

    keyword_vectors_by_id[id] = vec


In [33]:
len(ids)

996

In [36]:
len(genre_vectors_by_id)

996

In [None]:
plots_by_id

In [55]:
matrix = np.zeros((lg, kg))

for id in ids:
    g = genre_vectors_by_id[id]
    k = keyword_vectors_by_id[id]
    for idx, gen in enumerate(g):
        if gen == 1:
            matrix[idx] += k

In [57]:
transpose = matrix.T

word_counts_per_genre = {}

for keyword in trimmed_keyword_list:
    idx = idx_by_keyword[keyword]
    word_counts_per_genre[keyword] = transpose[idx]

In [69]:
word_counts_per_genre

smoothing = 0.05
genre_probabilities_given_keyword = {}

for word, counts in word_counts_per_genre.items():
    occurrences = trimmed_keyword_counts[word]
    probability = (counts + smoothing) / (occurrences + smoothing * counts.shape[0])
    genre_probabilities_given_keyword[word] = probability

genre_probabilities_given_keyword

{'thug': array([0.48945616, 0.31187569, 0.02330744, 0.06770255, 0.02330744,
        0.00110988, 0.64483907, 0.00110988, 0.06770255, 0.15649279,
        0.02330744, 0.02330744, 0.02330744, 0.13429523, 0.02330744,
        0.00110988, 0.22308546, 0.00110988, 0.40066593, 0.00110988,
        0.02330744]),
 'force': array([0.21420476, 0.29228057, 0.02531167, 0.06812744, 0.02531167,
        0.04042312, 0.67006674, 0.06309029, 0.07820174, 0.19405616,
        0.0127188 , 0.00768165, 0.0127188 , 0.26709482, 0.09583176,
        0.02783025, 0.09583176, 0.07316459, 0.17138899, 0.12857323,
        0.07820174]),
 'render': array([0.18669131, 0.33456562, 0.00184843, 0.03881701, 0.00184843,
        0.11275416, 0.40850277, 0.07578558, 0.18669131, 0.18669131,
        0.00184843, 0.03881701, 0.07578558, 0.3715342 , 0.18669131,
        0.03881701, 0.22365989, 0.00184843, 0.22365989, 0.03881701,
        0.03881701]),
 'clear': array([0.29553841, 0.17477357, 0.04729956, 0.12780946, 0.01375377,
        0.0405

In [70]:
genre_probabilities_vector = np.zeros(lg)

for genre, probability in genre_probabilities.items():
    idx = idx_by_genre[genre]
    genre_probabilities_vector[idx] = probability

genre_probabilities_vector


array([0.20983936, 0.18975904, 0.02008032, 0.12550201, 0.03514056,
       0.0562249 , 0.72289157, 0.06626506, 0.08232932, 0.23393574,
       0.01907631, 0.01706827, 0.01907631, 0.19578313, 0.06726908,
       0.03212851, 0.09839357, 0.05120482, 0.13654618, 0.10943775,
       0.05522088])

In [73]:
scaled_word_probabilities = {}

for word, probabilities in genre_probabilities_given_keyword.items():
    scaled_word_probabilities[word] = probabilities / genre_probabilities_vector

In [74]:
scaled_word_probabilities

{'thug': array([2.33252792, 1.6435354 , 1.16071032, 0.53945394, 0.66326304,
        0.01973997, 0.89202738, 0.01674907, 0.82233832, 0.66895629,
        1.22180034, 1.36554156, 1.22180034, 0.6859387 , 0.34648069,
        0.03454495, 2.26727672, 0.02167526, 2.9342887 , 0.01014164,
        0.42207648]),
 'force': array([1.02080355, 1.54027221, 1.26052134, 0.54283944, 0.72029791,
        0.71895407, 0.92692566, 0.95208984, 0.94986501, 0.82952764,
        0.66673294, 0.45005445, 0.66673294, 1.36423818, 1.42460347,
        0.86621647, 0.97396359, 1.42886138, 1.25517234, 1.17485261,
        1.41616238]),
 'render': array([0.88968683, 1.76310771, 0.09205176, 0.3092939 , 0.052601  ,
        2.00541326, 0.5650955 , 1.14367333, 2.26761643, 0.79804527,
        0.09689659, 2.27421985, 3.97276   , 1.89768235, 2.77529175,
        1.2081793 , 2.27311479, 0.03609873, 1.63797978, 0.35469484,
        0.70294068]),
 'clear': array([1.40840314, 0.92102895, 2.35551828, 1.01838578, 0.39139311,
        0.7219

In [78]:
def mean(vector):
    s = sum(vector)
    n = vector.shape[0]
    return s / n

def mad(vector):
    m = mean(vector)
    n = vector.shape[0]
    total_deviation = 0

    for i in range(n):
        deviation = abs(m - vector[i])
        total_deviation += deviation

    return total_deviation / n

In [89]:
featured_embeddings = {}
feature_threshold = 0.5

for word, probabilities in scaled_word_probabilities.items():
    m = mad(scaled_word_probabilities[word])
    if m > feature_threshold:
        featured_embeddings[word] = probabilities
    else:
        print(f"-{word}")

len(featured_embeddings)

-force
-clear
-change
-year
-drop
-lack
-advice
-available
-vacation
-burst
-launch
-transfer
-separate
-building
-card
-contact
-stand
-tired
-market
-man
-teach
-possible
-confuse
-local
-provide
-child
-grow
-happen
-Most
-trial
-son
-joke
-flight
-race
-desk
-birth
-personal
-thing
-huge
-comfort
-moment
-beautiful
-successful
-family
-crush
-service
-manage
-occur
-sister
-wonder
-sentence
-full
-fix
-worry
-further
-person
-panic
-foot
-smoke
-react
-morning
-bizarre
-drink
-funeral
-hour
-visit
-assemble
-treat
-retrieve
-dead
-radio
-right
-short
-overhear
-table
-hearing
-vow
-roof
-poor
-driver
-cause
-trick
-bathroom
-effect
-charge
-alone
-wish
-upcoming
-shift
-hard
-acquire
-break
-pain
-silent
-tree
-boy
-impress
-doubt
-swim
-body
-relate
-dozen
-future
-legal
-lose
-various
-behavior
-engage
-escape
-complain
-land
-attempt
-suffer
-remain
-nickname
-unknown
-open
-accident
-amount
-angry
-ball
-endure
-bond
-school
-screen
-case
-seek
-express
-gain
-influence
-declin

1385

In [90]:
with open("Data\\genre_probabilities_vector.bin", "wb+") as f:
    pickle.dump(genre_probabilities_vector, f)
with open("Data\\word_embeddings.bin", "wb+") as f:
    pickle.dump(scaled_word_probabilities, f)
with open("Data\\feature_embeddings_50.bin", "wb+") as f:
    pickle.dump(featured_embeddings, f)