# Pre-processing

In [27]:
import pandas
import numpy
import math
import re
from ast import literal_eval

In [28]:
def special_split(string):
    return re.split('[^a-zA-Z]', string)

### Preparing Movies Dataset

In [9]:
mov_df = pandas.read_csv('./Tutorial Datasets/Movies Dataset.csv', usecols=[5, 6, 22, 23, 8, 3, 9], low_memory=False)
key_df = pandas.read_csv('./Tutorial Datasets/Keywords Dataset.csv', low_memory=False)
cre_df = pandas.read_csv('./Tutorial Datasets/Credits Dataset.csv', low_memory=False)

mov_df = mov_df.drop([19730, 29503, 35587])

mov_df['id'] = mov_df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cre_df['id'] = cre_df['id'].astype('int')

mov_df = pandas.merge(mov_df, key_df, how='inner', on='id')
mov_df = pandas.merge(mov_df, cre_df, how='inner', on='id')

for feature in ['cast', 'crew', 'keywords', 'genres']:
    mov_df[feature] = mov_df[feature].apply(literal_eval)

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return numpy.nan

def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        if len(names) > 3:
            names = names[:3]
        return names

    return []

mov_df['director'] = mov_df['crew'].apply(get_director)

for feature in ['cast', 'keywords', 'genres']:
    mov_df[feature] = mov_df[feature].apply(get_list)

del mov_df['id']
mov_df = mov_df[['imdb_id', 'vote_average', 'vote_count', 'original_title', 'overview', 'genres', 'keywords', 'director', 'cast']]
mov_df.columns = ['id', 'Vote Average', 'Vote Count', 'Title', 'Description', 'Genres', 'Keywords', 'Director', 'Cast']
mov_df.set_index('id', drop=True, inplace=True)

mov_df.to_csv('./Movies Dataset.csv')

In [10]:
mov_df.head()

Unnamed: 0_level_0,Vote Average,Vote Count,Title,Description,Genres,Keywords,Director,Cast
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0114709,7.7,5415.0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[jealousy, toy, boy]",John Lasseter,"[Tom Hanks, Tim Allen, Don Rickles]"
tt0113497,6.9,2413.0,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...",Joe Johnston,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]"
tt0113228,6.5,92.0,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[fishing, best friend, duringcreditsstinger]",Howard Deutch,"[Walter Matthau, Jack Lemmon, Ann-Margret]"
tt0114885,6.1,34.0,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...",Forest Whitaker,"[Whitney Houston, Angela Bassett, Loretta Devine]"
tt0113041,5.7,173.0,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[baby, midlife crisis, confidence]",Charles Shyer,"[Steve Martin, Diane Keaton, Martin Short]"


### Preparing User Dataset

In [11]:
user_df = pandas.read_csv('./IMDB Datasets/User Dataset.csv', usecols=[0, 1], encoding='latin1', low_memory=False)
user_df.columns = ['id', 'User Rating']
user_df.set_index('id', drop=True, inplace=True)

user_df.to_csv('./User Dataset.csv')

### Importing Datasets

In [55]:
movies_df = pandas.read_csv('./Movies Dataset.csv', encoding='latin1', low_memory=False)
user_df = pandas.read_csv('./User Dataset.csv', encoding='latin1', low_memory=False)

In [56]:
movies_df.Genres = movies_df.Genres.apply(literal_eval)
movies_df.Keywords = movies_df.Keywords.apply(literal_eval)
movies_df.Cast = movies_df.Cast.apply(literal_eval)

movies_df = movies_df.copy().loc[movies_df['Vote Count'] >= movies_df['Vote Count'].quantile(0.99)]

movies_df.dropna(subset=['Vote Average'], axis=0, inplace=True)

# 2 Feature Extraction

## 2.1 Feature Cleanup

In [57]:
stop_words = ["", "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

def stop_words_cleanup(x):
    if(isinstance(x, str)):
        description = [word for word in special_split(x) if word not in stop_words]
        return description
    return []

In [58]:
movies_df['Description'] = movies_df['Description'].apply(stop_words_cleanup)

In [59]:
movies_df.shape

(467, 9)

In [60]:
def create_bow(x):
    bow = x['Description'] + x['Genres'] + [x['Director']] + x['Keywords'] + x['Cast']
    return list(set(bow))

In [61]:
movies_df['Bag of Words'] = movies_df.apply(create_bow, axis=1)

movies_df = movies_df[['id', 'Vote Average', 'Vote Count', 'Bag of Words']]
movies_df.set_index('id', drop=True, inplace=True)

## 2.2 Feature Selection

### Index de Gini

$$\textrm{Gini}(w) = 1 - \sum_{i=1}^{t} p_{i} (w)^{2} $$

In [None]:
def gini_index(dataframe, min_value=0.0, max_value=10.0, step=1.0):
    if(['Vote Average', 'Features'] not in dataframe.columns):
        raise ValueError("Invalid Dataframe. Dataframes need to have a column 'Features'.")
    else:
        gini = dict()
        
        for index, row in dataframe.iterrows():
            for word in word_split(row['Features']):
                if word not in gini:
                    ocurrence_count = dict()
                    total_ocurrence_count = 0
                    
                    for index, row in dataframe.iterrows():
                        if word in row['Features']:
                            if row['Vote Average'] >= min_value and row['Vote Average'] <= max_value:
                                total_ocurrence_count += 1
                                
                                interval_key = min_value
                                
                                while(interval_key + step < row['Vote Average']):
                                    interval_key += step
                                
                                if interval_key in ocurrence_count:
                                    ocurrence_count[interval_key] += 1
                                else:
                                    ocurrence_count[interval_key] = 1
                    
                    ocurrence_fractions = [value/total_ocurrence_count for value in ocurrence_count.values()]
                    
                    gini[word] = 1
                    
                    for fraction in ocurrence_fractions:
                        gini[word] -= math.pow(fraction, 2)
                        
        return gini

### Entropia

$$\textrm{Entropia}(w) = - \sum_{i=1}^{t} p(i) \cdot \textrm{log} (p(i))$$

In [None]:
def entropy(dataframe, min_value=0.0, max_value=10.0, step=1.0):
    if(['Vote Average', 'Features'] not in dataframe.columns):
        raise ValueError("Invalid Dataframe. Dataframes need to have a column 'Features'.")
    else:
        entropy = dict()
        
        for index, row in dataframe.iterrows():
            for word in word_split(row['Features']):
                if word not in entropy:
                    ocurrence_count = dict()
                    total_ocurrence_count = 0
                    
                    for index, row in dataframe.iterrows():
                        if word in row['Features']:
                            if row['Vote Average'] >= min_value and row['Vote Average'] <= max_value:
                                total_ocurrence_count += 1
                                
                                interval_key = min_value
                                
                                while(interval_key + step < row['Vote Average']):
                                    interval_key += step
                                
                                if interval_key in ocurrence_count:
                                    ocurrence_count[interval_key] += 1
                                else:
                                    ocurrence_count[interval_key] = 1
                    
                    ocurrence_fractions = [value/total_ocurrence_count for value in ocurrence_count.values()]
                    
                    entropy[word] = 0
                    
                    for fraction in ocurrence_fractions:
                        entropy[word] -= fraction * math.log(fraction)
                        
        return entropy

### Desvio Normalizado

$$\textrm{Dev}(w) = \frac{\mid\mu^{+}(w) - \mu^{-}(w)\mid}{\sigma}$$

In [37]:
def normalized_deviation(dataframe):
    norm_dev = dict()

    for index_1, row_1 in dataframe.iterrows():
        for word in row_1['Bag of Words']:
            val = [0, 0, 0, 0, 0]
                
            if word not in norm_dev:
                for index_2, row_2 in dataframe.iterrows():
                    if word in row_2['Bag of Words']:
                        val[0] = val[0] + row_2['Vote Average']
                        val[1] = val[1] + 1
                    else:
                        val[2] = val[2] + row_2['Vote Average']
                        val[3] = val[3] + 1
                    val[4] += math.pow(row_2['Vote Average'], 2)

                mean_p = val[0] / val[1]
                mean_n = val[2] / val[3]
                mean = (val[0] + val[2]) / (val[1] + val[3])
                standard_deviation = math.sqrt((val[4] - 2 * mean * (val[0] + val[2]) + (val[1] + val[3]) * math.pow(mean, 2)) / (val[1] + val[3]))
                
                norm_dev[word] = abs(mean_p - mean_n) / standard_deviation
                
                print(word, norm_dev[word], end='\n', sep=': ')
    return norm_dev

In [62]:
norms = normalized_deviation(movies_df)

print(norms)

But: 0.12229014938522725
Family: 0.14300999819676602
happily: 0.14534043941051253
eventually: 0.7066784400350536
separate: 0.9387199509455326
plots: 0.9387199509455326
Buzz: 0.7213460230432811
Animation: 0.14290664353758553
put: 0.4765656912308712
live: 0.06769310850534667
Led: 0.9387199509455326
circumstances: 0.9387199509455326
Andy: 1.0446566333530312
Tim Allen: 0.7213460230432811
jealousy: 0.9448023696341655
Comedy: 0.0818472488891882
Tom Hanks: 0.4213466562122194
differences: 0.9387199509455326
scene: 0.6770619961258089
toy: 0.31239203274792876
owner: 0.24582252360200713
duo: 0.7333638796320782
place: 0.23729013385437434
aside: 0.5430395714693697
Don Rickles: 0.9387199509455326
heart: 0.12090728934806372
Lightyear: 0.675605948822312
toys: 0.7213460230432811
boy: 0.27680777012626206
John Lasseter: 0.1459682598831295
Woody: 0.7213460230432811
onto: 1.0070218922046856
brings: 0.17925301445608982
room: 0.5031792022636531
learns: 0.19580618506344707
Afraid: 0.9387199509455326
losing: 0

anxious: 0.8064380507972028
anything: 0.11268350531016953
grandchildren: 0.8064380507972028
exotic island: 0.1996550876916393
paleontology: 0.11979231529538366
drawn: 0.9478188149803338
calm: 0.675605948822312
investors: 0.8064380507972028
creates: 0.013828429084537614
featuring: 0.8064380507972028
Jeff Goldblum: 0.8525743773056638
line: 0.41055108849338534
dna: 0.2971865960585831
invites: 0.8064380507972028
prehistoric: 0.2342217269510831
amusing: 0.8064380507972028
park: 0.07576646147122068
entrepreneur: 0.4767563827928869
security: 0.44553105103976887
experience: 0.4767563827928869
two: 0.16459671840901768
wealthy: 0.01277406205756483
systems: 0.01277406205756483
Steven Spielberg: 0.31442817305858345
eager: 0.8064380507972028
day: 0.08934054378065526
Laura Dern: 0.8064380507972028
Nazis: 0.7783785967693556
Liam Neeson: 0.3228071346754605
Jewish: 1.562742534475235
Oskar: 1.7324113518355106
worked: 1.4710042129399912
II: 0.8241230957708269
hero: 0.055415910640968465
thousand: 0.252902

riddle: 0.22634052728866216
legendary: 0.23574594643232863
regime: 0.9387199509455326
Covenant: 0.38492507000128934
Paul Freeman: 0.9387199509455326
Nazi: 0.8984821307131602
tweed: 0.9387199509455326
sends: 0.2933895265554555
space marine: 1.073305080881161
answers: 0.17925301445608982
colonists: 0.9387199509455326
Michael Biehn: 0.7418891374987874
return: 0.25330677921514677
crew: 0.0505021350195206
formers: 0.9387199509455326
terra: 0.9387199509455326
They: 0.41135783462097414
marines: 0.9387199509455326
found: 0.23887009019136338
enlist: 0.8744555148517477
Horror: 0.027742665556486176
salvage: 0.9387199509455326
later: 0.3939959443714568
investigate: 0.22016404041943113
Sigourney Weaver: 0.7118090876234382
search: 0.31206374553453403
lost: 0.6009081002190206
species: 0.8081723261752605
James Remar: 0.9387199509455326
lifepod: 0.9387199509455326
company: 0.11268350531016919
contact: 0.8934765172142304
Ripley: 0.9387199509455326
colonial: 0.9387199509455326
extraterrestrial technology

Titanic: 0.6741561506488741
class: 0.17925301445608982
whole: 0.012857010512483504
Frances Fisher: 0.6741561506488741
Rose: 0.4767563827928869
Buell: 0.6741561506488741
Rossi: 0.6741561506488741
drink: 1.0710018510938626
large: 0.8116633513423254
producers: 1.0710018510938626
bowl: 1.0710018510938626
involving: 0.544209915373387
Jeff Bridges: 0.23422172695108512
Russians: 0.675605948822312
wants: 0.2892323168818267
Joel Coen: 1.0070218922046856
strange: 0.35976772527377576
sums: 1.0710018510938626
John Goodman: 0.2530259668856824
millionaire: 1.0710018510938626
Dude: 1.0710018510938626
white russian: 1.0710018510938626
slacker: 0.41135783462097414
bowling: 1.0710018510938626
dude: 0.9407387035282102
nihilists: 1.0710018510938626
series: 0.15930639134288058
wheelchair: 0.8984821307131702
Lebowski: 1.0710018510938626
errant: 1.0710018510938626
toes: 1.0710018510938626
mistaken: 1.0710018510938626
bound: 0.6146097255267052
money: 0.8099140768782268
ferrets: 1.0710018510938626
dragged: 0.1

truly: 0.4095923503522156
McWhiggen: 0.4095923503522156
devices: 0.675605948822312
belong: 0.4095923503522156
Al: 0.4095923503522156
heads: 0.012801592363723836
mount: 0.4767563827928816
prosecution: 0.4095923503522156
Barn: 0.4095923503522156
Joan Cusack: 0.4095923503522156
Cowboy: 0.4095923503522156
Things: 0.11979231529538366
Toy: 0.4095923503522156
guard: 0.2785057538685562
Coffey: 1.6001294516871785
possesses: 1.6001294516871785
David Morse: 1.6001294516871785
stave: 1.6001294516871785
cellblock: 1.6001294516871785
black people: 1.6001294516871785
Edgecomb: 1.6001294516871785
miraculous: 0.5442099153733847
southern usa: 1.6001294516871785
recognizes: 1.6001294516871785
ailments: 1.6001294516871785
row: 1.6001294516871785
condemned: 1.6001294516871785
Paul: 1.6001294516871785
Michael Clarke Duncan: 1.6001294516871785
desperately: 0.41047319411640826
Southern: 1.6001294516871785
heal: 1.6001294516871785
gift: 0.5442099153733847
execution: 1.6001294516871785
took: 0.27850575386855736

civil: 0.11979231529538366
Padm: 0.7809447509827612
invasion: 1.285276186754121
Under: 0.47675638279288807
Senator: 0.7809447509827612
Naboo: 0.7809447509827612
Hayden Christensen: 0.31864188132480875
brink: 0.7894148675877573
Queen: 0.5996919636691135
Obi: 0.31864188132480875
assassination: 0.7002704489773234
beginning: 0.22016404041942583
apprentice: 0.31864188132480875
Ten: 0.7809447509827612
Wars: 0.31864188132480875
twenty: 0.7809447509827612
assigned: 0.5837746360307069
Kenobi: 0.31864188132480875
leadership: 0.7809447509827612
Amidala: 0.7809447509827612
Republic: 0.31864188132480875
Wan: 0.31864188132480875
darker: 0.7809447509827612
investigation: 0.41055108849337973
made: 0.14500510078506726
Clone: 0.31864188132480875
renegade: 0.7809447509827612
Dooku: 0.7809447509827612
Separatist: 0.7809447509827612
many: 0.32001830413830795
account: 0.5453853147586413
want: 0.07576646147122287
Franka Potente: 0.34419000543993844
amnesia: 0.8339870768929784
Wounded: 0.4095923503522156
Jaso

Russian: 0.5530115861490341
purchase: 0.27731045020388456
classified: 0.07905725073404025
run: 0.4941663418309334
Paul Greengrass: 0.012829241591290562
revolves: 0.6741561506488741
threatened: 0.012801592363723836
pub: 0.6093227601458377
strenuous: 0.6741561506488741
routine: 0.27850575386855475
record collection: 0.6741561506488741
ordinary: 0.23422172695108512
cheese: 0.6741561506488741
Edgar Wright: 0.4999258884559142
Simon Pegg: 0.27910727817496844
Londoners: 0.6741561506488741
supremely: 0.6741561506488741
flower: 0.6741561506488741
snack: 0.6741561506488741
Shaun: 0.6741561506488741
uneventful: 0.6741561506488741
Kate Ashfield: 0.6741561506488741
Nick Frost: 0.6093227601458377
trying: 0.05745710921144341
Danny Glover: 0.27731045020388456
choices: 0.4674809022339089
Obsessed: 0.27731045020388456
die: 0.27850575386855736
morally: 0.27731045020388456
based on short film: 0.27731045020388456
Monica Potter: 0.27731045020388456
sadist: 0.27731045020388456
James Wan: 0.4113578346209753


Jake Cherry: 0.7809447509827495
Attila: 0.7809447509827495
stirs: 0.18607550397185654
President: 0.655698239672104
rex: 0.7809447509827495
Chaos: 0.7809447509827495
Shawn Levy: 0.6500578247071823
Larry: 0.7809447509827495
Roosevelt: 0.7809447509827495
awakening: 0.7809447509827495
Daley: 0.7809447509827495
curse: 0.12005048838869246
Carla Gugino: 1.04775695676603
Tyrannosaurus: 0.7809447509827495
Hun: 0.7809447509827495
reigns: 0.7809447509827495
Density: 0.9387199509455326
loses: 0.675605948822312
heavily: 0.9387199509455326
cards: 0.9387199509455326
expensive: 0.9387199509455326
device: 0.21162362808698992
Bone: 0.9387199509455326
Gabriele Muccino: 0.9387199509455326
steady: 0.9387199509455326
invests: 0.9387199509455326
pay: 0.2779068167634642
san francisco: 0.9387199509455326
Christopher: 1.0070218922046856
product: 0.9387199509455326
homeless person: 0.9387199509455326
receive: 1.0070218922046856
Gardner: 0.9387199509455326
marginally: 0.9387199509455326
sell: 0.9387199509455326
m

clad: 0.7809447509827495
Wesley: 0.7809447509827495
Gibson: 0.7809447509827495
secret society: 0.7809447509827495
Timur Bekmambetov: 0.7809447509827495
sexpot: 0.7809447509827495
belonged: 0.7163410133836566
guild: 0.5837746360307069
leather: 0.7809447509827495
Doormat: 0.7809447509827495
recently: 0.7809447509827495
knew: 1.7768720322072526
miss: 1.0455085512794091
idea: 0.18993770003361285
weary: 0.28673354956534763
stick: 1.0455085512794091
antihero: 1.2466065227954541
habits: 1.0455085512794091
Jason Bateman: 0.012829241591290562
lifesaving: 1.0455085512794091
inflicted: 1.0455085512794091
Hancock: 1.0455085512794091
Charlize Theron: 0.7400268652333057
imprisoning: 1.0455085512794091
purpose: 1.0455085512794091
employ: 1.0455085512794091
PR: 1.0455085512794091
successful: 0.05374026755462846
Peter Berg: 1.0455085512794091
heroics: 1.0455085512794091
public: 0.6514588113983624
image: 1.0455085512794091
slip: 1.0455085512794091
alternate: 0.012746649907226055
Matthew Goode: 0.6756059

nature: 0.053509126618909414
opposition: 1.3100723515760677
commissioned: 1.3100723515760677
weapons: 1.1828671344080488
knife: 1.3100723515760677
Church: 0.47432270372847335
specialist: 0.5837746360307081
enthusiast: 1.3100723515760677
scout: 1.3100723515760677
mercenaries: 0.8489073907366074
Jason Statham: 0.9171629729031959
Sylvester Stallone: 1.2466065227954553
martial: 1.3100723515760677
Barney: 1.3100723515760677
Dolph Lundgren: 1.2466065227954553
sniper: 0.5628907575634078
locale: 1.3100723515760677
Lee: 0.6514588113983624
heavy: 1.3100723515760677
Scott: 0.04611399616425088
underage girlfriend: 0.27731045020388456
critically: 0.7163410133836674
Malley: 0.27731045020388456
Flowers: 0.27731045020388456
delivery: 0.34419000543993844
award: 0.11979231529538366
hipster: 0.27731045020388456
acclaimed: 0.27731045020388456
adaptation: 0.17925301445608854
Bryan: 0.6957428383158346
exes: 0.27731045020388456
cartoonist: 0.27731045020388456
whipping: 0.27731045020388456
Canadian: 0.5442099

suggesting: 1.177790451427739
Autobot: 1.177790451427739
Prime: 1.9094384095602035
centers: 1.177790451427739
Ken Jeong: 1.177790451427739
grind: 0.7809447509827495
bad boss: 0.7809447509827495
conspiracy of murder: 0.7809447509827495
devise: 0.7809447509827495
advice: 0.7809447509827495
permanently: 0.7809447509827495
employee: 0.2779068167634642
option: 0.7809447509827495
intolerable: 0.7809447509827495
rid: 0.6500578247071823
respective: 0.7809447509827495
bosses: 0.7809447509827495
convoluted: 0.5837746360307069
Dale: 0.7809447509827495
benefit: 0.7809447509827495
drinks: 0.7809447509827495
dubious: 0.7809447509827495
foolproof: 0.7809447509827495
tolerable: 0.7809447509827495
Kurt: 0.7809447509827495
dust: 0.7809447509827495
hustling: 0.7809447509827495
Quitting: 0.7809447509827495
Seth Gordon: 0.7809447509827495
Nick: 0.41313860446781836
Charlie Day: 0.5837746360307069
employers: 0.7809447509827495
objects: 1.2032837512421923
wizards: 1.2032837512421923
frog: 1.2032837512421923
J

K: 0.9132266511310928
told: 1.0942990805731008
aliens: 1.8431552208837294
Josh: 0.9132266511310928
perplexes: 0.9132266511310928
inexplicable: 0.9132266511310928
reveal: 0.8858591318786229
seen: 0.05350912661891248
Agents: 0.9132266511310928
Tommy: 0.9132266511310928
fairy tale: 0.7826242020601426
obtain: 1.5746361518727487
Huntsman: 1.5746361518727487
decade: 1.5746361518727487
marries: 1.5746361518727487
performs: 1.5746361518727487
Forest: 1.5746361518727487
Snow: 0.5186067306459538
Evil: 1.5746361518727487
Almost: 1.5746361518727487
Rupert Sanders: 1.5117392775013558
coup: 1.5746361518727487
explorers: 0.16433451530617094
mankind: 1.2680440901446055
corners: 0.9132266511310928
Noomi Rapace: 0.9132266511310928
Emma Thompson: 0.38409905053776233
custom: 0.38409905053776233
seeks: 0.14565367311614177
granted: 0.38409905053776233
MacGuffin: 0.38409905053776233
turmoil: 0.4104731941164138
surly: 0.38409905053776233
enormous: 0.4767563827928869
rida: 0.38409905053776233
Fergus: 0.3840990

necessarily: 0.012746649907226055
Monsters: 0.5837746360307081
weren: 0.012746649907226055
Dan Scanlon: 0.012746649907226055
monstrous: 0.38409905053776233
legions: 0.38409905053776233
began: 0.38409905053776233
Jaeger: 0.38409905053776233
minds: 0.34419000543993844
Hunnam: 0.38409905053776233
Kikuchi: 0.38409905053776233
pilots: 0.38409905053776233
proving: 0.38409905053776233
Rinko: 0.38409905053776233
verge: 0.38409905053776233
Idris Elba: 0.38409905053776233
teamed: 0.38409905053776233
Kaiju: 0.38409905053776233
defenseless: 0.38409905053776233
simultaneously: 0.38409905053776233
relentless: 0.38409905053776233
drive: 0.38409905053776233
mounting: 0.38409905053776233
untested: 0.38409905053776233
giant robot: 0.38409905053776233
Jaegers: 0.38409905053776233
Charlie Hunnam: 0.38409905053776233
giant monster: 0.38409905053776233
obsolete: 0.38409905053776233
consume: 0.38409905053776233
neural: 0.38409905053776233
defending: 0.38409905053776233
Gerry: 0.38409905053776233
populations:

Chlo: 0.14502855005555473
reawakened: 0.14502855005555473
stacked: 0.14502855005555473
else: 0.18607550397186026
Denzel: 0.14502855005555473
ultra: 0.4767563827928869
brutalize: 0.14502855005555473
imposed: 0.14502855005555473
Moretz: 0.14502855005555473
Teri: 0.14502855005555473
idly: 0.14502855005555473
McCall: 0.14502855005555473
nowhere: 0.14502855005555473
plays: 0.14502855005555473
Armed: 0.23422172695108512
dedicated: 0.14502855005555473
skies: 0.8064380507972028
favorite: 0.8064380507972028
Train: 0.8064380507972028
thrilling: 0.8064380507972028
fantastical: 0.8064380507972028
Snotlout: 0.8064380507972028
How: 0.8064380507972028
unmapped: 0.8064380507972028
challenging: 0.8064380507972028
inseparable: 0.8064380507972028
charting: 0.8064380507972028
Dean DeBlois: 0.8064380507972028
territories: 0.8064380507972028
Astrid: 0.8064380507972028
Your: 0.8064380507972028
hundreds: 0.8064380507972028
dragons: 0.8064380507972028
races: 0.8064380507972028
comeback: 0.5418742505005455
Broa

Anastasia: 2.368327552762703
enigmatic: 1.7105888435308028
prominent: 2.368327552762703
pain: 2.368327552762703
sexually: 2.368327552762703
Jamie Dornan: 1.8431552208837294
proclivities: 2.368327552762703
plunges: 2.368327552762703
sick: 2.368327552762703
pleasure: 2.368327552762703
sexual: 2.368327552762703
Steele: 1.8431552208837294
perversion: 2.368327552762703
spanking: 2.368327552762703
Ana: 1.8431552208837294
Jennifer Ehle: 2.368327552762703
Dakota Johnson: 1.8431552208837294
Christian: 1.8431552208837294
headlong: 2.368327552762703
Sam Taylor-Johnson: 2.368327552762703
Jennifer Jason Leigh: 0.8064380507972028
Kurt Russell: 0.8064380507972028
hunters: 0.8064380507972028
Bounty: 0.8064380507972028
deception: 0.8064380507972028
wyoming: 0.8064380507972028
blizzard: 0.8064380507972028
raging: 0.8064380507972028
Three: 0.38409905053776233
Rodrigo Santoro: 0.38409905053776233
Aires: 0.38409905053776233
Nicky: 0.38409905053776233
fatale: 0.38409905053776233
Buenos: 0.38409905053776233


In [91]:
len(norms)

7334

In [98]:
values = sorted(norms.values())
cutoff_low = values[3300]
cutoff_hi =  values[3800]

bag_of_words = [word for word in norms if (norms[word] >= cutoff_low and norms[word] <= cutoff_hi)]
print(bag_of_words)



### 3.1 Classificação por Vizinhos mais Próximos

Este classificador é uma das técnicas mais simples, e também mais fácil de ser implementada.

Primeiro é necessário definir uma função de similaridade, para avaliar a proximidade de dois itens. O mais comum é usar a função cosseno mostrada abaixo:

$$\textrm{Cosseno}(\overline{X}, \overline{Y}) = \frac{\sum_{i=1}^{d} x_i \cdot y_i}{\sqrt{\sum_{i=1}^{d} x_i^2} \cdot \sqrt{\sum_{i=1}^{d} y_i^2}}$$

Onde $\overline{X} = (x_1 \dots x_d)$ e $\overline{Y} = (y_1 \dots y_d)$ são dois conjuntos que contém as frequências das palavras no __Bag of Words__ em cada item. Notando que essas frequências devem estar devidamente pesadas e normalizadas através de algum método tal como o _Vectorização tf-idf_, _Index de Gini_, _Entropia_ etc.

Para realizar a predição, é necessário dois conjuntos de itens. O primeiro, um conjunto $D_L$, que contém os dados de filmes que o usuário já avaliou, e um conjunto $D_U$, que contém os dados de filmes cuja avaliação do usuário é desconhecida. É necessário que ambos $D_L$ e $D_U$ contenham as frequências de cada palavra em cada item, e que $D_L$ contenha as avaliações do usuário para cada item.

Subsequentemente, para cada item em $D_U$, é calculado o valor da proximidade para cada elemento em $D_L$, sendo que quanto maior o valor mais próximo os dois itens são.

#### Pré-processamento

In [77]:
def generate_neighbor_dldu(dataframe, bag_of_words):
    dldu_df = pandas.DataFrame(0, index=dataframe.index, columns=bag_of_words, dtype=float)
    
    for keyword in bag_of_words:
        idf = 0
        
        for index, row in dataframe.iterrows():
            if keyword in row['Bag of Words']:
                idf += 1
        
        idf = idf / len(dataframe.index)
    
        for index, row in dataframe.iterrows():
            tf = row['Bag of Words'].count(keyword) / len(row['Bag of Words'])
            dldu_df.loc[index][keyword] = tf * idf
    
    return dldu_df

#### Algoritmo

In [115]:
def neighbor(dudl_df, dlr_df):
    neighbor_df = pandas.DataFrame(0, index=dudl_df.index, columns=dlr_df.index, dtype=float)
    
    for index_l, row_l in dudl_df.iterrows():
        for index_u, row_u in dudl_df.iterrows():  
            sum_products = 0
            sum_squares_1 = 0
            sum_squares_2 = 0
    
            for column in dudl_df.columns:
                sum_products += row_u[column] * row_l[column]
                sum_squares_1 += math.pow(row_u[column], 2)
                sum_squares_2 += math.pow(row_l[column], 2)
            
            neighbor_df.loc[index_u][index_l] = sum_products / math.sqrt(sum_squares_1) / math.sqrt(sum_squares_2)
    
    return neighbor_df

def neighbor_recommend(ne_df, dlr_df):
    rec_df = pandas.DataFrame(0, index=ne_df.index, columns=["Predicted Rating"], dtype=float)
    
    for index, row in ne_df.iterrows():
        neighbors = [row[column] for column in ne_df.columns]
        neighbors = sorted(range(len(neighbors)), key=lambda i: neighbors[i])[-10:]
        rec_df[index]['Predicted Rating'] = sum(neighbors)/len(neighbors)
    
    return rec_df

In [99]:
dldu_df = generate_neighbor_dldu(movies_df, bag_of_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


dldu_df.sort_values('discover', ascending=False)

In [None]:
neighbor_df = neighbor(dldu_df, user_df)

  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [None]:
recs_df = neighbor_recommend(neighbor_df, user_df)

### Classificador de Bayes

#### Pré-processamento

In [None]:
def generate_bayes_ratings_dl(dataframe):
    if ['Vote Average', 'Vote Count'] not in dataframe.columns:
        raise ValueError("Invalid Dataframe")
    
    dlr_df = pandas.Dataframe(0, index=dataframe.index, columns=['Rating'], dtype=int)
    
    for index, row in dataframe.iterrows():
        if row['Vote Average'] >= 7.0:
            dlr_df.iloc[index]['Rating'] = 1
    
    return dlr_df

def generate_bayes_dl(dataframe, bag_of_words):
    if ['id', 'Features'] not in dataframe.columns:
        raise ValueError("Invalid Dataframe")
    
    dl_df = pandas.Dataframe(False, index=dataframe.index, columns=bag_of_words, dtype=bool)
    
    for index, row in dataframe.iterrows():
        for keyword in bag_of_words:
            if keyword in row['Features']:
                dl_df.iloc[index][keyword] = True
    
    return dl_df

def generate_bayes_du(dataframe, bag_of_words):
    if ['id', 'Features'] not in dataframe.columns:
        raise ValueError("Invalid Dataframe")
    
    du_df = pandas.Dataframe(False, index=dataframe.index, columns=bag_of_words, dtype=bool)
    
    for index, row in dataframe.iterrows():
        for keyword in bag_of_words:
            if keyword in row['Features']:
                du_df.iloc[index][keyword] = True
    
    return du_df