In [1]:
import pandas as pd
metadata = pd.read_csv('movie_metadata.csv')
metadata[['movie_title', 'actor_1_name','actor_2_name','actor_3_name','director_name', 'plot_keywords', 'genres']].head(5)

Unnamed: 0,movie_title,actor_1_name,actor_2_name,actor_3_name,director_name,plot_keywords,genres
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,James Cameron,avatar|future|marine|native|paraplegic,Action|Adventure|Fantasy|Sci-Fi
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski,goddess|marriage ceremony|marriage proposal|pi...,Action|Adventure|Fantasy
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes,bomb|espionage|sequel|spy|terrorist,Action|Adventure|Thriller
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan,deception|imprisonment|lawlessness|police offi...,Action|Thriller
4,Star Wars: Episode VII - The Force Awakens ...,Doug Walker,Rob Walker,,Doug Walker,,Documentary


#### Filtering Data

In [2]:
#Combine actors in a cast
metadata['cast'] = metadata['actor_1_name'].str.cat(metadata[['actor_2_name', 'actor_3_name']], sep='|', na_rep='')
metadata[['movie_title', 'cast','director_name', 'plot_keywords', 'genres']].head(5)

Unnamed: 0,movie_title,cast,director_name,plot_keywords,genres
0,Avatar,CCH Pounder|Joel David Moore|Wes Studi,James Cameron,avatar|future|marine|native|paraplegic,Action|Adventure|Fantasy|Sci-Fi
1,Pirates of the Caribbean: At World's End,Johnny Depp|Orlando Bloom|Jack Davenport,Gore Verbinski,goddess|marriage ceremony|marriage proposal|pi...,Action|Adventure|Fantasy
2,Spectre,Christoph Waltz|Rory Kinnear|Stephanie Sigman,Sam Mendes,bomb|espionage|sequel|spy|terrorist,Action|Adventure|Thriller
3,The Dark Knight Rises,Tom Hardy|Christian Bale|Joseph Gordon-Levitt,Christopher Nolan,deception|imprisonment|lawlessness|police offi...,Action|Thriller
4,Star Wars: Episode VII - The Force Awakens ...,Doug Walker|Rob Walker|,Doug Walker,,Documentary


In [3]:
#Converting many values in a list
def converToList(feature):
    values = []
    for ind in metadata.index:
        current_list = str(metadata[feature][ind]).split('|')
        normalized_list = [x for x in current_list if pd.isnull(x) == False and x != '']
        values.append(normalized_list)
    return values


In [4]:
features = ['cast', 'plot_keywords', 'genres']
for feature in features:
    metadata[feature] = converToList(feature)

In [5]:
metadata[['movie_title', 'cast','director_name', 'plot_keywords', 'genres']].head(5)

Unnamed: 0,movie_title,cast,director_name,plot_keywords,genres
0,Avatar,"[CCH Pounder, Joel David Moore, Wes Studi]",James Cameron,"[avatar, future, marine, native, paraplegic]","[Action, Adventure, Fantasy, Sci-Fi]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Jack Davenport]",Gore Verbinski,"[goddess, marriage ceremony, marriage proposal...","[Action, Adventure, Fantasy]"
2,Spectre,"[Christoph Waltz, Rory Kinnear, Stephanie Sigman]",Sam Mendes,"[bomb, espionage, sequel, spy, terrorist]","[Action, Adventure, Thriller]"
3,The Dark Knight Rises,"[Tom Hardy, Christian Bale, Joseph Gordon-Levitt]",Christopher Nolan,"[deception, imprisonment, lawlessness, police ...","[Action, Thriller]"
4,Star Wars: Episode VII - The Force Awakens ...,"[Doug Walker, Rob Walker]",Doug Walker,[nan],[Documentary]


#### Cleaning Data

In [6]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [7]:
features = ['cast', 'plot_keywords', 'director_name', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

In [8]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
metadata[['movie_title', 'cast','director_name', 'plot_keywords', 'genres']].head(50)

Unnamed: 0,movie_title,cast,director_name,plot_keywords,genres
0,Avatar,"[cchpounder, joeldavidmoore, wesstudi]",jamescameron,"[avatar, future, marine, native, paraplegic]","[action, adventure, fantasy, sci-fi]"
1,Pirates of the Caribbean: At World's End,"[johnnydepp, orlandobloom, jackdavenport]",goreverbinski,"[goddess, marriageceremony, marriageproposal, ...","[action, adventure, fantasy]"
2,Spectre,"[christophwaltz, rorykinnear, stephaniesigman]",sammendes,"[bomb, espionage, sequel, spy, terrorist]","[action, adventure, thriller]"
3,The Dark Knight Rises,"[tomhardy, christianbale, josephgordon-levitt]",christophernolan,"[deception, imprisonment, lawlessness, policeo...","[action, thriller]"
4,Star Wars: Episode VII - The Force Awakens ...,"[dougwalker, robwalker]",dougwalker,[nan],[documentary]
5,John Carter,"[darylsabara, samanthamorton, pollywalker]",andrewstanton,"[alien, americancivilwar, malenipple, mars, pr...","[action, adventure, sci-fi]"
6,Spider-Man 3,"[j.k.simmons, jamesfranco, kirstendunst]",samraimi,"[sandman, spiderman, symbiote, venom, villain]","[action, adventure, romance]"
7,Tangled,"[bradgarrett, donnamurphy, m.c.gainey]",nathangreno,"[17thcentury, basedonfairytale, disney, flower...","[adventure, animation, comedy, family, fantasy..."
8,Avengers: Age of Ultron,"[chrishemsworth, robertdowneyjr., scarlettjoha...",josswhedon,"[artificialintelligence, basedoncomicbook, cap...","[action, adventure, sci-fi]"
9,Harry Potter and the Half-Blood Prince,"[alanrickman, danielradcliffe, rupertgrint]",davidyates,"[blood, book, love, potion, professor]","[adventure, family, fantasy, mystery]"


#### Creating Soup Field

In [9]:
def create_soup(x):
    return ' '.join(x['plot_keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director_name']
    #return ' '.join(x['plot_keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director_name'] + ' ' + ' '.join(x['genres'])

In [10]:
# Create a new soup feature, falta eliminar el nan del registro 4
metadata['soup'] = metadata.apply(create_soup, axis=1)
metadata[['movie_title','soup']].head(50)

Unnamed: 0,movie_title,soup
0,Avatar,avatar future marine native paraplegic cchpoun...
1,Pirates of the Caribbean: At World's End,goddess marriageceremony marriageproposal pira...
2,Spectre,bomb espionage sequel spy terrorist christophw...
3,The Dark Knight Rises,deception imprisonment lawlessness policeoffic...
4,Star Wars: Episode VII - The Force Awakens ...,nan dougwalker robwalker dougwalker
5,John Carter,alien americancivilwar malenipple mars princes...
6,Spider-Man 3,sandman spiderman symbiote venom villain j.k.s...
7,Tangled,17thcentury basedonfairytale disney flower tow...
8,Avengers: Age of Ultron,artificialintelligence basedoncomicbook captai...
9,Harry Potter and the Half-Blood Prince,blood book love potion professor alanrickman d...


In [11]:
# Entry movie
print(metadata[['soup']].head(5)['soup'][0])

avatar future marine native paraplegic cchpounder joeldavidmoore wesstudi jamescameron


In [12]:
# Aplying trim to the titles
df_obj = metadata.select_dtypes(['object'])
metadata[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

In [13]:
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['movie_title'])

#### Recomendation Algorithm

###### Training Algorithm

In [14]:
m1 = indices['The Amazing Spider-Man']
m2 = indices['Avengers: Age of Ultron']
m3 = indices['Captain America: Civil War']
m4 = indices['Iron Man 3']

m5 = indices['Toy Story 3']
m6 = indices['Cars 2']


val1 = metadata.iloc[m1]['soup']
val2 = metadata.iloc[m2]['soup']
val3 = metadata.iloc[m3]['soup']
val4 = metadata.iloc[m4]['soup']

val5 = metadata.iloc[m5]['soup']
val6 = metadata.iloc[m6]['soup']

In [15]:
like = [val1, val2, val3, val4]
unlike = [val5, val6]

###### Probabilidad de Like y Unlike

In [16]:
# Probabilities
total_sentences = len(like) + len(unlike)

p_like = len(like)/total_sentences;
p_unlike = len(unlike)/total_sentences;

###### Probabilidad de cada palabra en cada clasificación

In [17]:
# Frecuencias (Bag of words)
def create_table_freq(corpus):
    freq = {}
    for sent in corpus:
        tokens = sent.split(' ')
        for token in tokens:
            if token not in freq.keys():
                freq[token] = 1
            else:
                freq[token] += 1
    return freq

In [18]:
freq_like = create_table_freq(like)
freq_unlike = create_table_freq(unlike)

print(freq_like)
print(freq_unlike)

{'lizard': 1, 'outcast': 1, 'spider': 1, 'spiderman': 1, 'teenager': 1, 'emmastone': 1, 'andrewgarfield': 1, 'chriszylka': 1, 'marcwebb': 1, 'artificialintelligence': 1, 'basedoncomicbook': 2, 'captainamerica': 1, 'marvelcinematicuniverse': 2, 'superhero': 2, 'chrishemsworth': 1, 'robertdowneyjr.': 3, 'scarlettjohansson': 2, 'josswhedon': 1, 'knife': 1, 'returningcharacterkilledoff': 1, 'chrisevans': 1, 'anthonyrusso': 1, 'armor': 1, 'explosion': 1, 'humanbomb': 1, 'missileattack': 1, 'terrorist': 1, 'jonfavreau': 1, 'doncheadle': 1, 'shaneblack': 1}
{'college': 1, 'daycare': 1, 'escape': 1, 'teddybear': 1, 'toy': 1, 'tomhanks': 1, 'johnratzenberger': 1, 'donrickles': 1, 'leeunkrich': 1, 'bestfriend': 1, 'carrace': 1, 'conspiracy': 1, 'gadgetcar': 1, 'spy': 1, 'joemantegna': 1, 'thomaskretschmann': 1, 'eddieizzard': 1, 'johnlasseter': 1}


In [19]:
# Obteniendo probabilidades de cada palabra
def count_words(corpus):
    freq = 0
    for sent in corpus:
        freq += len(sent.split(' '))
    return freq

In [20]:
total_like = count_words(like)
total_unlike = count_words(unlike)

print(f'like: {total_like}\nunlike: {total_unlike}')

like: 36
unlike: 18


In [21]:
# Transform to Cpt's
def create_cpt(freq, total):
    cpt_eqv = {}
    for k,v in freq.items():
        probability = v/total
        cpt_eqv[k] = probability
    return cpt_eqv

In [22]:
cpt_like = create_cpt(freq_like, total_like)
cpt_unlike = create_cpt(freq_unlike, total_unlike)
print(cpt_like)
print(cpt_unlike)

{'lizard': 0.027777777777777776, 'outcast': 0.027777777777777776, 'spider': 0.027777777777777776, 'spiderman': 0.027777777777777776, 'teenager': 0.027777777777777776, 'emmastone': 0.027777777777777776, 'andrewgarfield': 0.027777777777777776, 'chriszylka': 0.027777777777777776, 'marcwebb': 0.027777777777777776, 'artificialintelligence': 0.027777777777777776, 'basedoncomicbook': 0.05555555555555555, 'captainamerica': 0.027777777777777776, 'marvelcinematicuniverse': 0.05555555555555555, 'superhero': 0.05555555555555555, 'chrishemsworth': 0.027777777777777776, 'robertdowneyjr.': 0.08333333333333333, 'scarlettjohansson': 0.05555555555555555, 'josswhedon': 0.027777777777777776, 'knife': 0.027777777777777776, 'returningcharacterkilledoff': 0.027777777777777776, 'chrisevans': 0.027777777777777776, 'anthonyrusso': 0.027777777777777776, 'armor': 0.027777777777777776, 'explosion': 0.027777777777777776, 'humanbomb': 0.027777777777777776, 'missileattack': 0.027777777777777776, 'terrorist': 0.027777

###### Recomendation

In [23]:
def calcLike(texto, alpha):
    p_cadena_like = 1
    p_cadena_unlike = 1
    oracion = texto.split(' ')
    for i in range(len(oracion)):
        if oracion[i] not in cpt_like:
            laplace1 = alpha / ((total_like+total_unlike)+(2*alpha))
            p_cadena_like *=  laplace1
        else:
            p_cadena_like *=  cpt_like[oracion[i]]
        if oracion[i] not in cpt_unlike:
            laplace2 = alpha / ((total_like+total_unlike)+(2*alpha))
            p_cadena_unlike *=  laplace2
        else:
            p_cadena_unlike *=  cpt_unlike[oracion[i]]
    p_final = (p_cadena_like * p_like) / ((p_cadena_like * p_like) + (p_cadena_unlike * p_unlike))
    return p_final

In [24]:
#Prueba de uno por uno
test = indices['X-Men: Apocalypse']
test_val = metadata.iloc[test]['soup']
#test_val
calcLike(test_val, 1)


0.8615384615384615