In [118]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [119]:
# Load the movie catalogue; later cells read its movieId, title and genres columns.
df = pd.read_csv("movies.csv")

In [120]:
import re

def clean_title(title):
    """Return ``title`` with every character removed except ASCII letters, digits and spaces.

    Parameters
    ----------
    title : str
        Raw movie title, e.g. ``"Toy, The (1982)"``.

    Returns
    -------
    str
        The cleaned title, e.g. ``"Toy The 1982"``.
    """
    # The class must be "A-Z", not "A-z": the range "A-z" also covers the ASCII
    # punctuation that sits between "Z" and "a" ([ \ ] ^ _ `), which would
    # otherwise survive the cleaning step.
    return re.sub(r"[^a-zA-Z0-9 ]", "", title)

In [121]:
# Store a cleaned copy of every title; the TF-IDF index is built from this column.
df["clean_title"] = df["title"].apply(clean_title)

In [122]:
# Fit a TF-IDF model on the cleaned titles using single words and word pairs
# (ngram_range=(1, 2)), and keep the resulting title matrix for searching.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(df["clean_title"])

In [144]:
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles are most similar to ``title``.

    The query is cleaned with ``clean_title``, projected with the fitted
    ``vectorizer`` and compared against the ``tfidf`` title matrix by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text title query.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df``, best match first.
    """
    title = clean_title(title)
    user_query = vectorizer.transform([title])

    similarity = cosine_similarity(user_query, tfidf).flatten()

    # np.argpartition only guarantees that the k largest values land in the
    # last k slots, not that they are ordered among themselves, and it raises
    # when k exceeds the array length. A stable descending argsort gives a real
    # ranking (ties keep catalogue order) and tolerates short inputs.
    top_indices = np.argsort(-similarity, kind="stable")[:top_n]
    return df.iloc[top_indices]

In [145]:
# Try the title search with a one-word query.
search("toy")

Unnamed: 0,movieId,title,genres,clean_title
3595,4929,"Toy, The (1982)",Comedy,Toy The 1982
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
7355,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
4089,5843,Toy Soldiers (1991),Action|Drama,Toy Soldiers 1991


In [146]:
# Load the ratings table; later cells use its userId, movieId and rating columns.
ratings = pd.read_csv("ratings.csv")

In [147]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [148]:
# Users who rated movie 1 at least 4: the audience treated as "similar".
similar_users = ratings.loc[
    (ratings["movieId"] == 1) & (ratings["rating"] >= 4), "userId"
].unique()

In [149]:
# Every movieId those similar users rated at least 4 (one entry per rating row).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] >= 4), "movieId"
]

In [150]:
# Fraction of the similar users who rated each movie at least 4.
# Note: this rebinds similar_user_recs from raw movieIds to per-movie shares,
# so the cell is not safe to re-run on its own.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs

1         1.000000
318       0.585034
356       0.551020
260       0.530612
296       0.503401
            ...   
187595    0.006803
182639    0.006803
69140     0.006803
179119    0.006803
161634    0.006803
Name: movieId, Length: 4309, dtype: float64

In [151]:
##similar_user_recs = similar_user_recs.reset_index()
##similar_user_recs.columns = ['movieId', 'recommendation_score']
##similar_user_recs

In [152]:
##similar_user_recs.index

In [153]:
# Ratings of at least 4, from any user, for the movies the similar users liked.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] >= 4)
]

In [154]:
# Inspect the filtered ratings (rating >= 4, movies present in similar_user_recs).
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047


In [155]:
# Fraction of the distinct users in all_users who rated each movie at least 4.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [156]:
# Inspect the per-movie share computed over all users.
all_user_recs

318       0.449918
356       0.408867
296       0.400657
593       0.369458
2571      0.364532
            ...   
4475      0.001642
4478      0.001642
4488      0.001642
4583      0.001642
161634    0.001642
Name: movieId, Length: 4309, dtype: float64

In [157]:
# Put the two per-movie shares side by side, aligned on movieId.
rec_percentage = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentage.columns = ["similar", "all"]

In [158]:
# Inspect the side-by-side "similar" and "all" shares.
rec_percentage

Unnamed: 0,similar,all
1,1.000000,0.241379
318,0.585034,0.449918
356,0.551020,0.408867
260,0.530612,0.330049
296,0.503401,0.400657
...,...,...
187595,0.006803,0.004926
182639,0.006803,0.001642
69140,0.006803,0.001642
179119,0.006803,0.001642


In [159]:
# Score = how much more often the similar users liked a movie than users overall.
rec_percentage["score"] = rec_percentage["similar"] / rec_percentage["all"]

In [160]:
# Rank movies from the highest score to the lowest.
rec_percentage = rec_percentage.sort_values("score", ascending=False)

In [161]:
# Inspect the ranking after sorting by score.
rec_percentage

Unnamed: 0,similar,all,score
161634,0.006803,0.001642,4.142857
7061,0.006803,0.001642,4.142857
7053,0.006803,0.001642,4.142857
7219,0.006803,0.001642,4.142857
7217,0.006803,0.001642,4.142857
...,...,...,...
4367,0.006803,0.018062,0.376623
80906,0.006803,0.018062,0.376623
3285,0.006803,0.018062,0.376623
3863,0.006803,0.019704,0.345238


In [162]:
# Attach title and genres from the movie catalogue to the ten highest-scoring movies.
rec_percentage.head(10).merge(df, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
9359,0.006803,0.001642,4.142857,161634,Don't Breathe (2016),Thriller,Dont Breathe 2016
4739,0.006803,0.001642,4.142857,7061,Dark Victory (1939),Drama|Romance,Dark Victory 1939
4731,0.006803,0.001642,4.142857,7053,Roberta (1935),Comedy|Musical|Romance,Roberta 1935
4837,0.006803,0.001642,4.142857,7219,They Drive by Night (1940),Drama,They Drive by Night 1940
4835,0.006803,0.001642,4.142857,7217,Dark Passage (1947),Crime|Drama|Film-Noir|Romance|Thriller,Dark Passage 1947
4793,0.006803,0.001642,4.142857,7141,My Flesh and Blood (2003),Documentary,My Flesh and Blood 2003
4746,0.006803,0.001642,4.142857,7070,Red River (1948),Action|Adventure|Western,Red River 1948
4742,0.006803,0.001642,4.142857,7064,Beauty and the Beast (La belle et la bête) (1946),Drama|Fantasy,Beauty and the Beast La belle et la bte 1946
4740,0.006803,0.001642,4.142857,7062,Birdman of Alcatraz (1962),Drama,Birdman of Alcatraz 1962
4737,0.006803,0.001642,4.142857,7059,National Velvet (1944),Children|Drama,National Velvet 1944


In [170]:
def search(title, top_n=5):
    """Return the movies whose cleaned titles are most similar to ``title``.

    The query is cleaned with ``clean_title``, projected with the fitted
    ``vectorizer`` and compared against the ``tfidf`` title matrix by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text title query.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df``, best match first.
    """
    title = clean_title(title)
    user_query = vectorizer.transform([title])

    similarity = cosine_similarity(user_query, tfidf).flatten()

    # np.argpartition only guarantees that the k largest values land in the
    # last k slots, not that they are ordered among themselves, and it raises
    # when k exceeds the array length. A stable descending argsort gives a real
    # ranking (ties keep catalogue order) and tolerates short inputs.
    top_indices = np.argsort(-similarity, kind="stable")[:top_n]
    return df.iloc[top_indices]


def find_movies(movie_id, min_similar_share=0.10, top_n=10):
    """Recommend movies liked disproportionately by fans of ``movie_id``.

    "Liked" means a rating of at least 4. Users who liked ``movie_id`` form
    the similar audience. Each candidate movie is scored as the share of that
    audience who liked it divided by the share of all users (who liked any
    candidate) who liked it.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the seed movie.
    min_similar_share : float, default 0.10
        Candidates liked by this fraction of the similar audience or less are
        dropped before scoring.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when nobody rated ``movie_id`` at least 4.
    """
    # Apply the "liked" threshold once instead of repeating it in every filter.
    liked = ratings[ratings["rating"] >= 4]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # No audience to learn from; return an empty result explicitly rather
        # than relying on arithmetic over empty series.
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_recs = (
        liked.loc[liked["userId"].isin(similar_users), "movieId"].value_counts()
        / len(similar_users)
    )
    # Without a support threshold the ranking is dominated by movies that a
    # single similar user liked: their tiny overall share inflates the ratio.
    similar_user_recs = similar_user_recs[similar_user_recs > min_similar_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentage = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentage.columns = ["similar", "all"]
    rec_percentage["score"] = rec_percentage["similar"] / rec_percentage["all"]
    rec_percentage = rec_percentage.sort_values("score", ascending=False)

    top = rec_percentage.head(top_n).merge(df, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]
    

In [192]:
# Ask for a title, take the movieId of the first search hit, and recommend from it.
title = input("write movie name :    ")
results = search(title).iloc[0]["movieId"]
find_movies(results)

write movie name :    toy


Unnamed: 0,score,title,genres


In [191]:
# Try the title search with a two-word query.
search("toy story")

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
7355,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
3595,4929,"Toy, The (1982)",Comedy,Toy The 1982
4089,5843,Toy Soldiers (1991),Action|Drama,Toy Soldiers 1991
