In [53]:
import pandas as pd
from math import sqrt
import numpy as np

In [54]:
movies_df = pd.read_csv('Netflix_Dataset_Movie.csv')
Ratings_df = pd.read_csv('Netflix_Dataset_Rating.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Movie_ID  17770 non-null  int64 
 1   Year      17770 non-null  int64 
 2   Name      17770 non-null  object
dtypes: int64(2), object(1)
memory usage: 416.6+ KB
None


In [55]:
userInput = [{'Name':'Dragonheart', 'Rating':5},
             {'Name':'Sea of Love', 'Rating':1},
             {'Name':'The Color of Money', 'Rating':1},
             {'Name':'Superstar', 'Rating':5},
             {'Name':'Scandal', 'Rating':4.5},
             {'Name':"National Lampoon's Van Wilder", 'Rating':1}]

inputMovies = pd.DataFrame(userInput)
print(inputMovies)


                            Name  Rating
0                    Dragonheart     5.0
1                    Sea of Love     1.0
2             The Color of Money     1.0
3                      Superstar     5.0
4                        Scandal     4.5
5  National Lampoon's Van Wilder     1.0


In [56]:

inputId = movies_df[movies_df['Name'].isin(inputMovies['Name'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('Year', 1)
inputMovies = inputMovies[['Movie_ID','Name','Rating']]
print(inputMovies)

   Movie_ID                           Name  Rating
0        58                    Dragonheart     5.0
1       110                        Scandal     4.5
2       599                    Sea of Love     1.0
3      1502                      Superstar     5.0
4      1509  National Lampoon's Van Wilder     1.0
5      2139             The Color of Money     1.0


  inputMovies = inputMovies.drop('Year', 1)


In [57]:
userSubset = Ratings_df[Ratings_df['Movie_ID'].isin(inputMovies['Movie_ID'].tolist())]
print(userSubset.groupby('Movie_ID').count())

          User_ID  Rating
Movie_ID                 
58          16116   16116
110          1863    1863
599          8928    8928
1502         8736    8736
1509        32975   32975
2139        20496   20496


In [58]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['User_ID'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


[(16272,          User_ID  Rating  Movie_ID
188089     16272       4        58
263599     16272       3       110
2310897    16272       3       599
5386753    16272       1      1502
5414831    16272       4      1509
7971646    16272       3      2139), (64765,          User_ID  Rating  Movie_ID
188600     64765       2        58
263643     64765       2       110
2311126    64765       4       599
5386984    64765       2      1502
5415562    64765       1      1509
7972174    64765       4      2139), (151004,          User_ID  Rating  Movie_ID
192823    151004       4        58
264164    151004       3       110
2313496   151004       4       599
5389269   151004       1      1502
5424346   151004       5      1509
7977645   151004       3      2139), (303948,          User_ID  Rating  Movie_ID
179034    303948       4        58
262593    303948       3       110
2305911   303948       4       599
5381809   303948       4      1502
5396274   303948       4      1509
7960232   3039

In [59]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='Movie_ID')
    inputMovies = inputMovies.sort_values(by='Movie_ID')

    #Get the N for the formula
    nRatings = len(group)

    # #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['Movie_ID'].isin(group['Movie_ID'].tolist())]

    # #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['Rating'].tolist()
   
    # #Let's also put the current user group reviews in a list format
    tempGroupList = group['Rating'].tolist()
   
    
    # #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [60]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['User_ID'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())


   similarityIndex  User_ID
0        -0.346518    16272
1        -0.445532    64765
2        -0.544089   151004
3        -0.368048   303948
4        -0.607097   305344


In [61]:

topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())



    similarityIndex  User_ID
54         0.666667    44434
70         0.645497   166041
85         0.612372   290916
96         0.612372   394895
82         0.612372   267378


In [62]:

topUsersRating=topUsers.merge(Ratings_df, left_on='User_ID', right_on='User_ID', how='inner')
print(topUsersRating.head(100))

    similarityIndex  User_ID  Rating  Movie_ID
0          0.666667    44434       1        26
1          0.666667    44434       2        30
2          0.666667    44434       2        46
3          0.666667    44434       2        58
4          0.666667    44434       2        78
..              ...      ...     ...       ...
95         0.666667    44434       2      1236
96         0.666667    44434       3      1255
97         0.666667    44434       1      1289
98         0.666667    44434       4      1300
99         0.666667    44434       2      1305

[100 rows x 4 columns]


In [63]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['Rating']
print(topUsersRating.head())


   similarityIndex  User_ID  Rating  Movie_ID  weightedRating
0         0.666667    44434       1        26        0.666667
1         0.666667    44434       2        30        1.333333
2         0.666667    44434       2        46        1.333333
3         0.666667    44434       2        58        1.333333
4         0.666667    44434       2        78        1.333333


In [64]:

#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('Movie_ID').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())


          sum_similarityIndex  sum_weightedRating
Movie_ID                                         
3                    0.292617            0.710645
8                    1.895848            4.929304
16                   2.121550            7.623800
17                   1.062291            4.215310
18                   4.957048           18.969511


In [65]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['Movie_ID'] = tempTopUsersRating.index

print(recommendation_df.head(10))

          weighted average recommendation score  Movie_ID
Movie_ID                                                 
3                                      2.428589         3
8                                      2.600053         8
16                                     3.593505        16
17                                     3.968133        17
18                                     3.826775        18
26                                     2.283243        26
28                                     4.021103        28
30                                     3.980418        30
32                                     4.316373        32
33                                     1.907859        33


In [66]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)

print(recommendation_df)


          weighted average recommendation score  Movie_ID
Movie_ID                                                 
2423                                   8.133141      2423
3529                                   7.844890      3529
1481                                   7.518619      1481
1476                                   5.186384      1476
3456                                   5.049442      3456
...                                         ...       ...
1604                                  -1.249197      1604
2889                                  -1.249197      2889
4387                                  -1.837474      4387
2944                                  -3.185202      2944
762                                  -17.570654       762

[1350 rows x 2 columns]


In [67]:
recommended_movie=movies_df.loc[movies_df['Movie_ID'].isin(recommendation_df['Movie_ID'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.Movie_ID.isin(userSubset['Movie_ID'])]

print(recommended_movie)

recommended_movie = recommended_movie["Name"].tolist()
print(len(recommended_movie))

      Movie_ID  Year                        Name
2            3  1997                   Character
7            8  2004  What the #$*! Do We Know!?
15          16  1996                   Screamers
16          17  2005                   7 Seconds
17          18  1994            Immortal Beloved
...        ...   ...                         ...
4487      4488  2000                 Wonder Boys
4489      4490  2004                   Ned Kelly
4491      4492  2004                  Club Dread
4492      4493  2003           Ju-on: The Grudge
4495      4496  1993       Farewell My Concubine

[1344 rows x 3 columns]
1344


In [68]:
moviess_df = pd.read_csv('movies_metadata.csv')
print(moviess_df["genres"][8][21:24])

Act


  moviess_df = pd.read_csv('movies_metadata.csv')


In [69]:
action = []
comedy = []
horror = []
drama = []
family = []
adventure = []

In [70]:
for i in range (len(recommended_movie) - 1):
    a = (moviess_df[moviess_df["original_title"] == (recommended_movie[i])]["genres"].to_string())
    b = (a.find('name'))

    if ((a[b+8:b+11])== "Fam"):
        family.append(recommended_movie[i])
    if ((a[b+8:b+11])== "Adv"):
        adventure.append(recommended_movie[i])
    if ((a[b+8:b+11])== "Hor"):
        horror.append(recommended_movie[i])
    if ((a[b+8:b+11])== "Com"):
        comedy.append(recommended_movie[i])
    if ((a[b+8:b+11])== "Act"):
        action.append(recommended_movie[i])

counted = {"action" : len(action),
"comedy" : len(comedy),
"horror" : len(horror),
"drama" : len(drama),
"adventure" : len(adventure),
"family" : len(family)}

print(counted["adventure"])

56


In [71]:
counted = {"action" : len(action),
"comedy" : len(comedy),
"horror" : len(horror),
"drama" : len(drama),
"adventure" : len(adventure),
"family" : len(family)}

In [72]:
maksimumaa = [counted["action"],counted["horror"],counted["drama"],counted["adventure"],counted["family"],counted["comedy"]]

maximize = max(maksimumaa)
minimize = min(maksimumaa)

for x in counted:
  if maximize == counted[x]:
    print("You mostly will enjoy",x,"genre")
  if minimize == counted[x]:
    print("You mostly will not enjoy",x,"genre")

You mostly will enjoy comedy genre
You mostly will not enjoy drama genre


In [73]:
x = 0
print("This is adventure genre reccomendation built for you")
print('\n')
print("No","Title")
for i in adventure:
    
    print(x,i)
    x += 1

This is adventure genre reccomendation built for you


No Title
0 The Great Race
1 Fatal Beauty
2 The Hunchback of Notre Dame II
3 Captain Blood
4 The 10th Kingdom
5 Rabbit-Proof Fence
6 Journey to the Center of the Earth
7 The Hitchhiker's Guide to the Galaxy
8 Robin Hood: Prince of Thieves
9 Rob Roy
10 Logan's Run
11 Back to the Future Part III
12 The Mummy
13 The Shadow
14 Cloak & Dagger
15 Nicholas Nickleby
16 Mutiny on the Bounty
17 For Your Eyes Only
18 Journey to the Center of the Earth
19 Walking Tall
20 Hook
21 Monkeybone
22 Clash of the Titans
23 The Life Aquatic with Steve Zissou
24 Young Black Stallion
25 Earthsea
26 Lethal Weapon
27 Pirates of the Caribbean: The Curse of the Black Pearl
28 Final Fantasy: The Spirits Within
29 VeggieTales: Dave and the Giant Pickle
30 Mystery Men
31 The Ghost and the Darkness
32 The Hitchhiker's Guide to the Galaxy
33 Spice World
34 Sullivan's Travels
35 The Emperor's New Groove
36 The Rundown
37 Red Sonja
38 Pocahontas
39 Where Eagles Dare

In [74]:
x = 0
print("This is drama genre reccomendation built for you")
print('\n')
print("No","Title")
for i in drama:
    
    print(x,i)
    x += 1

This is drama genre reccomendation built for you


No Title


In [75]:
x = 0
print("This is comedy genre reccomendation built for you")
print('\n')
print("No","Title")
for i in comedy:
    
    print(x,i)
    x += 1

This is comedy genre reccomendation built for you


No Title
0 The Love Letter
1 Richard Pryor: Live on the Sunset Strip
2 Death to Smoochy
3 Airplane II: The Sequel
4 Home Movie
5 The Cookout
6 Gross Anatomy
7 Woman of the Year
8 A Night at the Opera
9 Parenthood
10 Bridget Jones's Diary
11 Jack
12 Ed Wood
13 High Fidelity
14 Chasing Amy
15 The Taming of the Shrew
16 Life or Something Like It
17 Spun
18 The Trouble with Angels
19 Party Monster
20 Igby Goes Down
21 The Unsinkable Molly Brown
22 Happiness
23 Untamed Heart
24 Girls Just Want to Have Fun
25 Scorched
26 City Lights
27 Monsoon Wedding
28 Two Can Play That Game
29 Mumford
30 Drop Dead Fred
31 My Family
32 Stir Crazy
33 Buying the Cow
34 Police Academy 3: Back in Training
35 The Mighty
36 Major League
37 Curly Sue
38 Mr. Magoo
39 Ernest Goes to Jail
40 The Lonely Guy
41 Mean Girls
42 Clerks
43 The Mouse That Roared
44 Dead Man on Campus
45 Bill Cosby: Himself
46 Serial Mom
47 The Lady Eve
48 Animal Crackers
49 Comedian
50 My 

In [76]:
x = 0
print("This is horror genre reccomendation built for you")
print('\n')
print("No","Title")
for i in horror:
    
    print(x,i)
    x += 1

This is horror genre reccomendation built for you


No Title
0 Screamers
1 Dead Birds
2 Halloween 5: The Revenge of Michael Myers
3 The Last House on the Left
4 Hellbound: Hellraiser II
5 Jaws
6 The Lawnmower Man
7 The Faculty
8 Creepshow
9 13 Ghosts
10 Silver Bullet
11 Freddy vs. Jason
12 Valentine
13 Poltergeist
14 The Serpent and the Rainbow
15 The Dead Zone
16 The Rage: Carrie 2
17 Rosemary's Baby
18 Fright Night
19 Bram Stoker's Dracula
20 Invasion of the Body Snatchers
21 The Omen
22 Scream 3
23 Cujo
24 Ghost Ship
25 From Hell
26 House of the Dead
27 The Hunger
28 The Addams Family
29 A Nightmare on Elm Street 3: Dream Warriors
30 The Blob
31 28 Days Later
32 Once Bitten
33 Halloween II
34 Saw
35 The Seventh Sign
36 The Others
37 I Still Know What You Did Last Summer
38 April Fool's Day
39 Bride of Chucky
40 A Nightmare on Elm Street


In [77]:
x = 0
print("This is action genre reccomendation built for you")
print('\n')
print("No","Title")
for i in action:
    
    print(x,i)
    x += 1

This is action genre reccomendation built for you


No Title
0 7 Seconds
1 Never Die Alone
2 Justice League
3 Jade
4 Congo
5 Rambo: First Blood Part II
6 The Last Shot
7 Taking Lives
8 Impostor
9 The Final Countdown
10 Taxi
11 The Hebrew Hammer
12 The Pacifier
13 Tremors 4: The Legend Begins
14 Out for Justice
15 The Replacement Killers
16 Nightbreed
17 Kill Bill: Vol. 2
18 Rush Hour 2
19 Three Days of the Condor
20 The Alamo
21 Hudson Hawk
22 First Knight
23 Jimmy Neutron: Boy Genius
24 The In-Laws
25 Speed
26 Armageddon
27 Unleashed
28 The Last Dragon
29 The Quest
30 Last Man Standing
31 The Poseidon Adventure
32 White Squall
33 Deep Cover
34 Blankman
35 I Spy
36 The Cowboys
37 Rising Sun
38 Outrageous Fortune
39 Training Day
40 Man on Fire
41 S.W.A.T.
42 The Limey
43 No Way Out
44 The Recruit
45 Belly of the Beast
46 Mo' Money
47 Shoot to Kill
48 Dungeons & Dragons
49 Trapped
50 The In-Laws
51 The Three Musketeers
52 The Warriors
53 Beverly Hills Cop
54 Beverly Hills Cop III
55 Deat

In [78]:
x = 0
print("This is family genre reccomendation built for you")
print('\n')
print("No","Title")
for i in family:
    
    print(x,i)
    x += 1

This is family genre reccomendation built for you


No Title
0 Jingle All the Way
1 Stuart Little 2
2 Born Free
3 Free Willy
4 VeggieTales: The Ballad of Little Joe
5 The Adventures of Elmo in Grouchland
6 D.A.R.Y.L.
7 Flight of the Navigator
8 Time Bandits
