# KNN intro -- Frank Kane

April 19, 2021

- Kane is the other ML guy whose course I bought in 2020 alongside Portella
- Digging into this because... maybe neat to see from another angle.

## About KNN
- Supervised learning 

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [13]:
r_cols = ['user_id', 'movie_id', 'rating']

df = pd.read_csv('data/u.data', sep='\t', names = r_cols, usecols = range(3))

In [6]:
df.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [8]:
df.describe()

Unnamed: 0,user_id,movie_id,rating
count,100003.0,100003.0,100003.0
mean,462.470876,425.520914,3.529864
std,266.622454,330.797791,1.125704
min,0.0,1.0,1.0
25%,254.0,175.0,3.0
50%,447.0,322.0,4.0
75%,682.0,631.0,4.0
max,943.0,1682.0,5.0


In [17]:
df.info

<bound method DataFrame.info of         user_id  movie_id  rating
0             0        50       5
1             0       172       5
2             0       133       1
3           196       242       3
4           186       302       3
...         ...       ...     ...
99998       880       476       3
99999       716       204       5
100000      276      1090       1
100001       13       225       2
100002       12       203       3

[100003 rows x 3 columns]>

In [19]:
len(df.movie_id.unique())

1682

In [21]:
int(100_000/1682)

59

#### Sum up
- 100,000 rows
- 1682 unique movies rated ... so avg of 59 ratings per movie.

Let's aggregate data for each movie.

## 1. Group by movie id
- Take the size and mean of the set of ratings for each unique movie id

In [22]:
df2 = df.groupby('movie_id').agg({'rating': [np.size, np.mean]})
df2.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [24]:
df2.describe()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
count,1682.0,1682.0
mean,59.454816,3.076037
std,80.390832,0.781654
min,1.0,1.0
25%,6.0,2.6596
50%,27.0,3.161528
75%,80.0,3.653428
max,584.0,5.0


## 2. Standardize the data
- 1 = highest rating, 0 = lowest rating

In [26]:
df3 = pd.DataFrame(df2['rating']['size'])
df3.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,452
2,131
3,90
4,209
5,86


In [34]:
func = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))

df3 = df3.apply(func)

df3.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


## 3. Get genre info and process it

In [44]:
dict = {}

with open(r'data/u.item') as f:
    temp = ''
          
    for line in f:
        fields = line.rstrip('\n').split('|')
        ID = int(fields[0])
        name = fields[1]
        genres = fields[5:25]
        genres = map(int, genres)
        dict[ID] = (name, np.array(list(genres)), df3.loc[ID].get('size'), df2.loc[ID].rating.get('mean'))

In [46]:
dict[1]

('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 0.7735849056603774,
 3.8783185840707963)

#### 0 indicates not part of that genre, 1 = part of that genre
- Genres themselves are not that important
- Just want to compare movies

## 4. Compute the 'distance' between two movies based on similarity of their genre and popularity

In [47]:
from scipy import spatial

In [58]:
def get_distance(a, b):
    genresA = a[1]
    genresB = b[1]
    
    genreDistance = spatial.distance.cosine(genresA, genresB)
    
    popularityA = a[2]
    popularityB = b[2]
    popularityDistance = abs(popularityA - popularityB)
    
    return genreDistance + popularityDistance

In [56]:
for i in range(1,30):
    print(i, dict[i][0])

1 Toy Story (1995)
2 GoldenEye (1995)
3 Four Rooms (1995)
4 Get Shorty (1995)
5 Copycat (1995)
6 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
7 Twelve Monkeys (1995)
8 Babe (1995)
9 Dead Man Walking (1995)
10 Richard III (1995)
11 Seven (Se7en) (1995)
12 Usual Suspects, The (1995)
13 Mighty Aphrodite (1995)
14 Postino, Il (1994)
15 Mr. Holland's Opus (1995)
16 French Twist (Gazon maudit) (1995)
17 From Dusk Till Dawn (1996)
18 White Balloon, The (1995)
19 Antonia's Line (1995)
20 Angels and Insects (1995)
21 Muppet Treasure Island (1996)
22 Braveheart (1995)
23 Taxi Driver (1976)
24 Rumble in the Bronx (1995)
25 Birdcage, The (1996)
26 Brothers McMullen, The (1995)
27 Bad Boys (1995)
28 Apollo 13 (1995)
29 Batman Forever (1995)


#### Let's compare Braveheart and Taxi Driver -- should be similar
- Lower number = closer in distance

In [59]:
get_distance(dict[22], dict[23])

0.789007284150202

#### Now let's do Babe and Toy Story

In [60]:
get_distance(dict[1], dict[8])

0.7329902801600916

Hmmm, closer...

#### Now let's do Taxi Driver and Babe

In [61]:
get_distance(dict[23], dict[8])

0.6552165465858797

Okay... that makes no sense.

In [64]:
import operator

def getNeighbors(movieID, K):
    distances = []
    for movie in dict:
        if (movie != movieID):
            dist = get_distance(dict[movieID], dict[movie])
            distances.append((movie, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(K):
        neighbors.append(distances[x][0])
    return neighbors

In [73]:
K = 10
avgRating = 0

neighbors = getNeighbors(1, K)

In [74]:
for neighbor in neighbors:
    avgRating += dict[neighbor][3]
    print (dict[neighbor][0] + " " + str(dict[neighbor][3]))
    
avgRating /= K

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463


## I didn't find this very helpful... the pre-processing maybe...