In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import os
import gc
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import sklearn

# Evaluating Data Directory

In [2]:
for file in os.listdir('dataset/ml-25m/'):
    print(file, os.path.getsize('dataset/ml-25m/'+file)/(1024*1024),'MB')

.ipynb_checkpoints 0.0 MB
genome-scores.csv 415.00487995147705 MB
genome-tags.csv 0.017264366149902344 MB
links.csv 1.3051776885986328 MB
movies.csv 2.8973569869995117 MB
ratings.csv 646.8400831222534 MB
README.txt 0.009975433349609375 MB
tags.csv 37.01241683959961 MB


# Data Import && Exploration

In [3]:
scores_df = pd.read_csv('dataset/ml-25m/genome-scores.csv')
tagged_df = pd.read_csv('dataset/ml-25m/tags.csv')
tags_df = pd.read_csv('dataset/ml-25m/genome-tags.csv')
links_df = pd.read_csv('dataset/ml-25m/links.csv')
movie_df = pd.read_csv('dataset/ml-25m/movies.csv')
ratings_df = pd.read_csv('dataset/ml-25m/ratings.csv')

## Genome-Scores DF

In [4]:
print(scores_df.info())
print(scores_df.agg('nunique'))
scores_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15584448 entries, 0 to 15584447
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 356.7 MB
None
movieId      13816
tagId         1128
relevance     4000
dtype: int64


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.06250
3,1,4,0.07575
4,1,5,0.14075
...,...,...,...
15584443,206499,1124,0.11000
15584444,206499,1125,0.04850
15584445,206499,1126,0.01325
15584446,206499,1127,0.14025


In [5]:
pd.DataFrame([[scores_df.movieId.nunique(), ratings_df.movieId.nunique(), scores_df.tagId.nunique(), ratings_df.userId.nunique(), ratings_df.rating.count()]],
             columns=['tagged_movies', 'rated_movies', 'num_tags', 'num_reviewers', 'num_reviews'])


Unnamed: 0,tagged_movies,rated_movies,num_tags,num_reviewers,num_reviews
0,13816,59047,1128,162541,25000095


In [6]:
print(tagged_df.info())
print(tagged_df.agg('nunique'))
tagged_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093360 entries, 0 to 1093359
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   userId     1093360 non-null  int64 
 1   movieId    1093360 non-null  int64 
 2   tag        1093344 non-null  object
 3   timestamp  1093360 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 33.4+ MB
None
userId        14592
movieId       45251
tag           73050
timestamp    907730
dtype: int64


Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [7]:
print(tags_df.info())
print(tags_df.agg('nunique'))
tags_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int64 
 1   tag     1128 non-null   object
dtypes: int64(1), object(1)
memory usage: 17.8+ KB
None
tagId    1128
tag      1128
dtype: int64


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [8]:
print(movie_df.info())
print(movie_df.agg('nunique'))
movie_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB
None
movieId    62423
title      62325
genres      1639
dtype: int64


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
print(links_df.info())
print(links_df.agg('nunique'))
links_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  62423 non-null  int64  
 1   imdbId   62423 non-null  int64  
 2   tmdbId   62316 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.4 MB
None
movieId    62423
imdbId     62423
tmdbId     62281
dtype: int64


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [10]:
print(ratings_df.info())
print(ratings_df.agg('nunique'))
ratings_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB
None
userId         162541
movieId         59047
rating             10
timestamp    20115267
dtype: int64


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


# Collaborative Filtering
<h4>Item Based</h4>
Here we use groupby to establish a bare bones collaborative filtering across the large dataset. <br> 
Methods used will include KNN, with a cosine metric.

In [30]:
mean_rating = ratings_df.groupby('movieId')[['rating']].mean()
lowest_rated = mean_rating['rating'].idxmin()
highest_rated = mean_rating['rating'].idxmax()
print(f'lowest rated movie is\n{movie_df.query(f"movieId == {lowest_rated}")}')
print(f'It has {len(ratings_df.query(f"movieId == {lowest_rated}"))} ratings')
print(f'Highest rated movie is\n{movie_df.query(f"movieId == {highest_rated}")}')
print(f'It has {len(ratings_df.query(f"movieId == {highest_rated}"))} ratings')

lowest rated movie is
      movieId            title genres
5693     5805  Besotted (2001)  Drama
It has 2 ratings
Highest rated movie is
      movieId                                              title       genres
9416    27914  Hijacking Catastrophe: 9/11, Fear & the Sellin...  Documentary
It has 1 ratings


In [50]:
stats = ratings_df.groupby('movieId')[['rating']].agg(['count', 'mean'])
stats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,count,mean
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,57309,3.893708
2,24228,3.251527
3,11804,3.142028
4,2523,2.853547
5,11714,3.058434


In [51]:
stats.columns = stats.columns.droplevel()
stats.head()

Unnamed: 0_level_0,count,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,57309,3.893708
2,24228,3.251527
3,11804,3.142028
4,2523,2.853547
5,11714,3.058434


In [127]:
from scipy.sparse import csr_matrix

def create_matrix(df):
    n_user = len(df.userId.unique())
    n_movie = len(df.movieId.unique())
    user_map = dict(zip(np.unique(df.userId), list(range(n_user))))
    movie_map = dict(zip(np.unique(df.movieId), list(range(n_movie))))
#     inverted maps
    user_imap = dict(zip(list(range(n_user)), np.unique(df.userId)))
    movie_imap = dict(zip(list(range(n_movie)), np.unique(df.movieId)))
    
    user_idx = [user_map[i] for i in df.userId]
    movie_idx = [movie_map[i] for i in df.movieId]
    
    X = csr_matrix((df['rating'], (movie_idx, user_idx)), shape=(n_movie, n_user))
    return X, user_map, movie_map, user_imap, movie_imap

X, user_map, movie_map, user_imap, movie_imap = create_matrix(ratings_df)

162541


In [129]:
len(user_map)
user_map[162541]

162540

In [77]:
len(movie_map)
movie_imap[59046]
movie_map[209171]

59046

In [136]:
from sklearn.neighbors import NearestNeighbors
def find_similar_items(movie_id, X, k, metric='cosine', show=False):
    neighbors = []
    movie_idx = movie_map[movie_id]
    movie_vec = X[movie_idx]
    k+=1
    kNN = NearestNeighbors(n_neighbors=k, algorithm='brute', metric=metric)
    kNN.fit(X)
    movie_vec = movie_vec.reshape(1,-1)
    neighbor = kNN.kneighbors(movie_vec, return_distance=False)
    for i in range(0, k):
        n = neighbor.item(i)
        neighbors.append(movie_imap[n])
    neighbors.pop(0)
    return neighbors

movie_titles = dict(zip(movie_df['movieId'], movie_df['title']))
movie_id = 5500
similar_ids = find_similar_items(movie_id, X, k=5)
movie_title = movie_titles[movie_id]

In [137]:
for i in similar_ids:
    print(movie_titles[i])

Naked Gun: From the Files of Police Squad!, The (1988)
Hot Shots! (1991)
Naked Gun 2 1/2: The Smell of Fear, The (1991)
Dirty Rotten Scoundrels (1988)
Airplane! (1980)


## Finding Recommendations per User

In [135]:
movie_df.query(f'title.str.contains("Top Secret", case=False)')

Unnamed: 0,movieId,title,genres
5392,5500,Top Secret! (1984),Comedy
44509,168372,Top Secret Affair (1957),Comedy
55048,190987,The Top Secret : Murder in Mind (2016),Thriller
55625,192351,Top Secret Rosies: The Female 'Computers' of W...,Documentary|War


# IDK

In [14]:
from sklearn import datasets, svm
iris = datasets.load_iris()
# Params are the parameters for the KNeighborsRegressor
params = {'n_neighbors':[5,10], 'weights':['distance']}
from sklearn import neighbors
KN = neighbors.KNeighborsRegressor()
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(KN, params)
clf.fit(iris.data, iris.target)

In [15]:
from sklearn import neighbors
NC = neighbors.NearestCentroid()
p = {'metric':['euclidean', 'cosine']}
clf = GridSearchCV(NC, p)

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(ratings_df,  test_size=0.2)
clf.fit(X_train, y_train)

ValueError: not enough values to unpack (expected 4, got 2)

In [50]:
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_n_neighbors',
 'param_weights',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

## IMDB Rating System
$(WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C$
<br>
<h4>Where:</h4>
<ul>
    <li>R = average for the movie (mean) = (Rating)</li>
    <li>v = number of votes for the movie = (votes)</li>
    <li>m = minimum votes required to be listed in the Top 250 (currently 25000)</li>
    <li>C = the mean vote across the whole report (currently 7.0)</li>
</ul>

# Feature Creation

## Bad Tags
Alot of movies have objectively bad/wrong tags assigned to them. By limiting tags to only the most accurate, we eliminate bad information and reduce the cost of running.<br>
<ul>
    <li><u>scores_df</u> - movieId|tagId|relevance -- 15584448 rows × 3 columns</li>
    <li><u>tagged_df</u> - userId|movieId|tag|timestamp -- 1093360 rows × 4 columns</li>
    <li><u>tags_df</u> -   tagId|tag -- 1128 rows × 2 columns</li>
    <li><u>links_df</u> -  movieId|imdbId|tmdbId< -- 62423 rows × 3 columns/li>
    <li><u>movie_df</u> -  movieId|title|genres -- 62423 rows × 3 columns</li>
    <li><u>ratings_df</u>- userId|movieId|rating|timestamp -- 25000095 rows × 4 columns</li>
</ul>

In [10]:
# scores_df.merge(tags_df, on='tagId')
tagged_df.drop(tagged_df.query('tag != tag').index, inplace=True)
null_tags = tagged_df.merge(tags_df, on='tag', how='left').query('tagId != tagId')
tagged_df.drop(tagged_df.query(f'tag in {list(null_tags.tag.unique())}').index, inplace=True)
df = tagged_df.merge(tags_df, on='tag', how='left')

In [11]:
#Garbage Collection
del [tagged_df, tags_df]
gc.collect()

3

In [12]:
df

Unnamed: 0,userId,movieId,tag,timestamp,tagId
0,3,260,classic,1439472355,215
1,3,260,sci-fi,1439472256,887
2,4,1732,dark comedy,1573943598,286
3,4,1732,great dialogue,1573943604,467
4,4,7569,so bad it's good,1573943455,934
...,...,...,...,...,...
501950,162462,260,scifi,1427470029,890
501951,162462,260,space,1427470029,942
501952,162501,112556,crime,1421990253,268
501953,162534,189169,comedy,1527518175,230


In [27]:
# Drop movieId -> tagId relationships where relevance confidence is below 50%
scores_df.drop(scores_df.query('relevance < .5').index, inplace=True)
scores_df.merge(df.drop(['userId', 'timestamp'], axis=1),
                on=['movieId', 'tagId'], how='inner', copy=False).merge(movie_df, on='movieId', how='inner', copy=False).drop_duplicates()

Unnamed: 0,movieId,tagId,relevance,tag,title,genres
0,1,19,0.66250,action,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,29,0.89375,adventure,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
15,1,63,0.94725,animated,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
30,1,64,0.98425,animation,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
102,1,170,0.55075,buddy movie,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
396058,205383,268,0.67050,crime,El Camino: A Breaking Bad Movie (2019),Crime|Drama|Thriller
396059,205383,323,0.70625,drama,El Camino: A Breaking Bad Movie (2019),Crime|Drama|Thriller
396060,205425,230,0.81975,comedy,Dave Chappelle: Sticks & Stones (2019),Comedy
396062,205425,963,0.91825,stand-up comedy,Dave Chappelle: Sticks & Stones (2019),Comedy


In [60]:
ratings_df.groupby('movieId').rating.agg(['mean', 'count']).reset_index().query('count == 10')

Unnamed: 0,movieId,mean,count
394,399,2.25,10
1731,1815,2.80,10
2528,2619,2.25,10
3548,3647,2.25,10
4204,4309,2.35,10
...,...,...,...
57346,203928,3.50,10
57564,204496,3.00,10
57718,204868,3.20,10
58112,205992,2.25,10


In [48]:
movie_df.merge(ratings_df.groupby('movieId').rating.agg('mean').to_frame('avg_rating').reset_index(), on='movieId').sort_values(by='avg_rating', ascending=False)

Unnamed: 0,movieId,title,genres,avg_rating
29523,136782,The Girl is in Trouble (2015),Thriller,5.0
49654,186119,A Gift Horse (2015),Children,5.0
29643,137032,The Perfect Neighbor (2005),Drama|Thriller,5.0
49041,184643,Relentless (2018),Thriller,5.0
29646,137038,The Perfect Wife (2001),Drama|Thriller,5.0
...,...,...,...,...
5693,5805,Besotted (2001),Drama,0.5
55757,199922,Bon Bini Holland 2 (2018),Comedy,0.5
53387,194608,The Black Book (2018),Drama|Romance,0.5
58517,207153,Buffering (2011),Comedy,0.5


In [50]:
df[df['movieId'] == 260].tag.value_counts().head(10)

sci-fi          648
space           293
classic         259
action          164
adventure       152
fantasy         145
space opera     137
cult classic     76
aliens           53
scifi            53
Name: tag, dtype: int64

In [42]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


Relevancy was predetermined by a ML algorithm which we do not have access to, so we will be removing tags without a relevance or tagid.

In [99]:
tags_df

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
...,...,...
1123,1124,writing
1124,1125,wuxia
1125,1126,wwii
1126,1127,zombie


In [65]:
scores_df.merge(tags_df, on='tagId', how='left')

Unnamed: 0,movieId,tagId,relevance,tag
0,1,1,0.02875,007
1,1,2,0.02375,007 (series)
2,1,3,0.06250,18th century
3,1,4,0.07575,1920s
4,1,5,0.14075,1930s
...,...,...,...,...
15584443,206499,1124,0.11000,writing
15584444,206499,1125,0.04850,wuxia
15584445,206499,1126,0.01325,wwii
15584446,206499,1127,0.14025,zombie


In [25]:
# Keep only most relevant tags


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [19]:
#remove tags that no longer have an assigned movie
