<a href="https://colab.research.google.com/github/AbdullahiMo938/movie-recommendation/blob/main/movierecomend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# https://files.grouplens.org/datasets/movielens/ml-25m.zip
from google.colab import files
uploaded = files.upload()

Saving movies.csv to movies.csv


In [5]:
import pandas as pd
movies  = pd.read_csv("movies.csv")

In [6]:
movies


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [7]:

import re
# to get rid of special characters which will make searching for the movie name difficult (such as '('))
def clean_title(title):
      return re.sub("[^a-zA-Z0-9 ]", "", title)

In [8]:

movies["clean_title"] = movies["title"].apply(clean_title)

In [9]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [12]:
# creating a search engine that would essentially change the names into numbers and would make a search engine taht would change the name to numbers in likleness
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))

tfidf = vectorizer.fit_transform(movies["clean_title"])


In [23]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec,tfidf).flatten()
    indices = np.argpartition(similarity,-5)[-5:]
    results = movies.iloc[indices][::-1]
    return results



In [24]:
# Import the cool widgets for interactivity
import ipywidgets as widgets
from IPython.display import display_html, clear_output

# Create a text box for users to type in their favorite movie
movie_input = widgets.Text(
    description="Movie Title",  # Label the text box
    disabled=False  # Keep it enabled for typing
)

# Create a space to show off the search results
movie_list = widgets.Output()

# Define a function to run when someone types in the box
def on_type(data):
    with movie_list:
        clear_output()  # Clear out the old results to keep it tidy
        title = data["new"]  # Get the movie title someone just typed
        if len(title) > 5:
            display(search(title))  # If it's long enough, show some movie suggestions

# Connect the typing function to the text box
movie_input.observe(on_type, names='value')

# Show the text box and the results space
display(movie_input, movie_list)


Text(value='', description='Movie Title')

Output()

In [26]:
from google.colab import files
uploaded = files.upload()

Saving ratings.csv to ratings.csv


In [27]:
ratings = pd.read_csv("ratings.csv")

In [28]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [29]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [31]:
movie_id = 1

In [33]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 5)]["userId"].unique()


In [74]:
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530])

In [76]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]



In [75]:
similar_users_recs

Series([], dtype: float64)

In [70]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs1[similar_user_recs > .10]

In [77]:
similar_user_recs

5101           1
5105          34
5111         110
5114         150
5127         260
            ... 
24998388    3706
24998389    3735
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 912084, dtype: int64

In [119]:

all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [120]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
620,3,52950,4.5,1566089429
820,3,106782,4.5,1439473659
882,3,136449,5.0,1484753762
2416,12,52952,4.5,1209130342
2769,13,59126,4.5,1238633789
...,...,...,...,...
24998510,162532,87232,5.0,1378258098
24999503,162536,105844,4.5,1572259526
24999737,162538,56367,4.5,1438781838
24999762,162538,93988,4.5,1438785545


In [123]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [124]:
all_users_recs


72998     0.226748
106782    0.184023
56367     0.159609
87232     0.091047
97921     0.076626
            ...   
106770    0.000032
106078    0.000032
103974    0.000032
133533    0.000032
130800    0.000032
Name: movieId, Length: 1097, dtype: float64

In [125]:
rec_percentages = pd.concat([similar_user_recs,  all_users_rec], axis =1)
rec_percentages.columns = ["similar","all"]


In [126]:
rec_percentages

Unnamed: 0,similar,all
5101,1,0.002340
5105,34,0.009677
5111,110,0.001391
5114,150,0.002815
5127,260,0.002024
...,...,...
24998388,3706,
24998389,3735,
24998391,3763,
24998392,4187,


In [97]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [98]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)


In [99]:
rec_percentages

Unnamed: 0,similar,all,score
208615,201588,0.000032,6.374414e+09
127234,183869,0.000032,5.814122e+09
173513,179491,0.000032,5.675685e+09
173501,177593,0.000032,5.615668e+09
208567,168250,0.000032,5.320233e+09
...,...,...,...
24998388,3706,,
24998389,3735,,
24998391,3763,,
24998392,4187,,


In [127]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,movieId,title,genres,clean_title
4995,1,0.00234,5101,Richard Pryor Here and Now (1983),Comedy|Documentary,Richard Pryor Here and Now 1983
4999,34,0.009677,5105,Don't Look Now (1973),Drama|Horror|Thriller,Dont Look Now 1973
5005,110,0.001391,5111,"Good Son, The (1993)",Drama|Thriller,Good Son The 1993
5008,150,0.002815,5114,"Bad and the Beautiful, The (1952)",Drama,Bad and the Beautiful The 1952
5021,260,0.002024,5127,Dragonfly (2002),Drama|Fantasy|Mystery|Romance|Thriller,Dragonfly 2002
5048,497,9.5e-05,5154,"Arena, The (a.k.a. Naked Warriors) (1974)",Action|Adventure,Arena The aka Naked Warriors 1974
5059,590,0.001771,5165,Zombie (a.k.a. Zombie 2: The Dead Are Among Us...,Horror,Zombie aka Zombie 2 The Dead Are Among Us Zomb...
5080,1101,9.5e-05,5186,Honeysuckle Rose (a.k.a. On the Road Again) (1...,Drama|Romance,Honeysuckle Rose aka On the Road Again 1980
5084,1196,0.000379,5190,Inside Moves (1980),Drama,Inside Moves 1980
5086,1210,0.000506,5193,"Jazz Singer, The (1980)",Musical,Jazz Singer The 1980


In [129]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [131]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [132]:
#end of code