In [3]:
import pandas as pd
import json
import re

In [37]:
df = pd.read_csv('movies_data_400.csv')
df = df.drop('Unnamed: 0', axis=1)

In [38]:
def get_actors_and_movies(df):
    
    df['cast_id'] = df['cast'].apply(lambda x: re.findall("(?<={').*?(?=':)", str(x[1:-1])))
    
    actor_dict = {}

    def add_to_dict(x):
        movie = x['movie_id']
        for i in x['cast_id']:
            if i not in actor_dict.keys():
                actor_dict[i] = [movie]
            else:
                actor_dict[i].append(movie)

    df.apply(lambda x: add_to_dict(x), axis=1)
    
    return actor_dict

actors_dictionary = get_actors_and_movies(df)

In [137]:
def sum_movies_per_actor(df, names=True):
    
    if names:
        names = 'summary_wiki'
    if not names:
        names = 'movie_plot_no_names'
    
    df['cast_id'] = df['cast'].apply(lambda x: re.findall("(?<={').*?(?=':)", str(x[1:-1])))
    
    actor_dict = {}

    def add_to_dict(x):
        movie = x[names][2:-2]
        #removing empty plots
        if x[names] not in  ["['NoText']", 'NoSearch']:
            for i in x['cast_id']:
                if i not in actor_dict.keys():
                    actor_dict[i] = movie
                else:
                    actor_dict[i] += ' ' + movie

    df.apply(lambda x: add_to_dict(x), axis=1)
    
    return actor_dict

actors_movie_sum = sum_movies_per_actor(df)

In [123]:
#read all movies from directory
names = []
for i in range(400, 6500, 400):
    names.append('movies_data_{}.csv'.format(i))
names += ['movie_data_end.csv', 'movie_data_total.csv']
df = pd.read_csv(names[0])
for name in names[1:]:
    df_t = pd.read_csv(name)
    df = df.append(df_t)

In [124]:
df = df.drop_duplicates(subset=['movie_id', 'movie_name'], keep='first')

Remove names from text

In [142]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
import spacy

nlp = spacy.load('en_core_web_sm')
stop_words = stopwords.words("english")

#filter out names through nerc
def remove_names(text):
    document = nlp(text)
    ents = [e.text for e in document.ents if e.label_ != 'PERSON']
    return " ".join([item.text for item in document if item.text not in ents])

df['movie_plot_no_names'] = df['summary_wiki'].apply(lambda x: remove_names(x))

# actors_df['movie_plot_no_names'] = actors_df['movie_plot'].apply(lambda x: remove_names(x))

In [143]:
actors_dictionary = get_actors_and_movies(df)
actors_movie_sum_True = sum_movies_per_actor(df, True)

actors_data = []
for i in actors_movie_sum.keys():
    actors_data.append([i, actors_movie_sum[i]])

actors_df = pd.DataFrame(actors_data, columns=['actor_id', 'movie_plot'])

In [144]:
actors_df.to_csv('actors_df.csv')

Try to train a Doc2Vec model on grouped text by actor

In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
actors_df = pd.read_csv('actors_df.csv')

In [155]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(actors_df['movie_plot'])]

In [156]:
max_epochs = 50 #was 100 before
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =0) #changing dm to dm=0 will make it use BoW approach)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    model.alpha -= 0.0002
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")



iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
Model Saved


In [166]:
model= Doc2Vec.load("d2v.model")

#using deadpool(1) as test case
test_data = word_tokenize("""In Nazi-occupied Poland, in 1944, young Erik Lehnsherr is separated from his parents upon entering the Auschwitz concentration camp. While trying to reach them, he causes a set of metal gates to bend towards him as the result of his mutant ability to create magnetic fields and control metal manifesting, only to be knocked out by the guards.In the not too distant future, U.S. Senator Robert Kelly attempts to pass a "Mutant Registration Act" in Congress, which would force mutants to publicly reveal their identities and abilities. Present are Lehnsherr, now going by the name "Magneto", and his telepathic colleague Professor Charles Xavier. Seeing Lehnsherr in attendance, Xavier becomes concerned with how he will respond to the Registration Act.

Meanwhile, in Meridian, Mississippi, 17-year-old Marie D'Ancanto accidentally puts her boyfriend into a coma upon kissing him as the result of her mutant ability to absorb the powers and life force of others. She runs away from home and adopts the name Rogue. In Alberta, she meets Logan, also known as Wolverine, a mutant who possesses superhuman healing abilities and metal "claws" that protrude from between his knuckles. While on the road together, they are attacked by a minion of Magneto's, Sabretooth, until two of Xavier's teachers – Cyclops and Storm – arrive and save them. Wolverine and Rogue are brought to Xavier's mansion and school for mutants in Westchester County, New York. Xavier tells Logan that Magneto appears to have taken an interest in Wolverine and asks him to stay while Xavier's mutants, the X-Men, investigate the matter. Meanwhile, Rogue enrolls in the school.

Senator Kelly is abducted by two more of Magneto's minions, Toad and Mystique, and is brought to their hideout on the uncharted island of Genosha. There, Magneto uses Kelly as a test subject for a machine powered by his magnetic abilities that generates a field of radiation, inducing mutation in normal humans. Kelly later escapes by taking advantage of his newfound mutation. Rogue visits Wolverine during the night while he is having a nightmare; however, he accidentally stabs her once startled, but Rogue is able to absorb his healing ability to recover, which other students witness, having arrived to help. She is later convinced by Mystique, who disguises herself as Rogue's crush Bobby Drake, that Xavier is angry with her and she should leave the school. Xavier uses his mutant-locating machine Cerebro to find Rogue at a train station, and the X-Men go to retrieve her. Meanwhile, Mystique enters Cerebro and sabotages it.

Having left ahead of Storm and Cyclops, Wolverine finds Rogue on a train and convinces her to return to the school. Before they can leave, Magneto arrives, knocks out Wolverine and subdues Rogue, revealing it was Rogue who he wants rather than Wolverine. Although Xavier attempts to stop Magneto by mentally controlling Sabretooth, he is forced to release his hold on Sabretooth when Magneto threatens the police who have converged on the train station, allowing Magneto's Brotherhood to escape with Rogue. Kelly arrives at Xavier's school, and Xavier reads his mind to learn about Magneto's machine. Realizing the strain of powering it nearly killed Magneto, the group deduces he intends to transfer his powers to Rogue and use her to power it at the cost of her life. Kelly's body rejects his mutation, and his body dissolves into liquid. Xavier attempts to locate Rogue using Cerebro, but Mystique's sabotage incapacitates him, and he falls into a coma. Fellow telekinetic and telepath Jean Grey fixes Cerebro and uses it, learning that Magneto plans to place his mutation-inducing machine on Liberty Island and use it to "mutate" the world leaders meeting at a summit on nearby Ellis Island. The X-Men scale the Statue of Liberty, battling and overpowering the Brotherhood while Magneto transfers his powers to Rogue and activates the mutating machine. As Wolverine confronts and distracts Magneto, Cyclops blasts him away, allowing Wolverine to destroy the machine. He transfers his powers to Rogue and his healing abilities rejuvenate her, while incapacitating himself.

Professor Xavier and Wolverine recover from their comas. The group also learns that Mystique escaped the island battle and is impersonating Senator Kelly. Xavier gives Wolverine a lead to his past at an abandoned military installation in Canada. Magneto is imprisoned in a complex constructed of plastic and is visited by Xavier, and Magneto warns him he intends to escape one day and continue the fight.
""".lower())


# find all similair movie plots
similar_actor = model.docvecs.most_similar(positive=[model.infer_vector(test_data)],topn=50)
for doc, perc in similar_actor:
    if 'X-Men' not in actors_dictionary[actors_df.iloc[int(doc)]['actor_id']]:
        print('actor id:',actors_df.iloc[int(doc)]['actor_id'] ,'\nmovies played in:', actors_dictionary[actors_df.iloc[int(doc)]['actor_id']], '   \nSimilarity:', perc , '\n')
#     print('summary: ' , actors_df.iloc[int(doc)]['info_json']['Plot'], '\n')

actor id: nm0835758 
movies played in: ['X-Men 2']    
Similarity: 0.9422240853309631 

actor id: nm4446467 
movies played in: ['Scouts Guide to the Zombie Apocalypse', 'Dark Places', 'The Yellow Birds', 'Ready Player One', 'X-Men: Apocalypse']    
Similarity: 0.9267494678497314 

actor id: nm7093076 
movies played in: ['Deadly Class', 'To All the Boys: Always and Forever, Lara Jean', 'Patriots Day', 'Alita: Battle Angel', 'To All the Boys: P.S. I Still Love You', "To All the Boys I've Loved Before", 'X-Men: Apocalypse']    
Similarity: 0.921867847442627 

actor id: nm1546300 
movies played in: ['Rock of Ages', 'Scream Queens', 'Before I Fall', 'Terminator: Dark Fate']    
Similarity: 0.9080660343170166 

actor id: nm10761387 
movies played in: ['Terminator: Dark Fate']    
Similarity: 0.9045665264129639 

actor id: nm6353378 
movies played in: ['Terminator: Dark Fate']    
Similarity: 0.904315710067749 

actor id: nm0721376 
movies played in: ['Running with the Devil', 'Terminator: Da