In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Reading the files

In [2]:
# Directory that holds movies.csv and ratings.csv.
# Set the MOVIELENS_DATA_DIR environment variable to point somewhere else;
# the fallback is the location the notebook was originally written against.
DATA_DIR = Path(os.environ.get("MOVIELENS_DATA_DIR", r"C:\Users\Lenovo\Downloads"))

movies = pd.read_csv(DATA_DIR / "movies.csv")
ratings = pd.read_csv(DATA_DIR / "ratings.csv")

In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


# Loading data files
The data consists of 105339 ratings applied over 10329 movies.
The movies.csv dataset contains three columns:

movieId: the ID of the movie
title: the movie's title, followed by its release year in parentheses
genres: the movie's genres, separated by `|`
The ratings.csv dataset contains four columns:

userId: the ID of the user who rated the movie.
movieId: the ID of the movie
rating: the rating given by the user (from 0.5 to 5)
timestamp: the time at which the movie was rated.

In [5]:
movies.describe()

Unnamed: 0,movieId
count,10329.0
mean,31924.282893
std,37734.741149
min,1.0
25%,3240.0
50%,7088.0
75%,59900.0
max,149532.0


In [6]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


### From the above table we can conclude:

The average rating is about 3.5, and the minimum and maximum ratings are 0.5 and 5 respectively.
User IDs range from 1 to 668. Note that 149532 is the largest movieId, not the number of movies: movies.csv lists 10329 movies.

In [9]:
# Unique genre labels in order of first appearance. Each row of `genres`
# is a '|'-separated list, so split, flatten and de-duplicate.
# The result is kept as the string form of the list, as before.
gen = str(movies.genres.str.split('|').explode().unique().tolist())

# Movie titles with a trailing " (YYYY)" release year removed.
# The pattern is anchored to the end of the title, so titles that carry no
# year are left intact instead of losing their last seven characters.
# Also kept as the string form of the list.
movietitle = str(
    movies.title.str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True).tolist()
)

In [15]:
# One row per rating, enriched with the title and genres of the rated movie.
# A left join keeps every rating even when its movieId is absent from `movies`.
df = ratings.merge(movies, on='movieId', how='left')

In [16]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,1,24,1.5,1217895807,Powder (1995),Drama|Sci-Fi
2,1,32,4.0,1217896246,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,4.0,1217896556,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,4.0,1217896523,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [18]:
# Number of ratings received by each title (single 'rating' column, indexed by title).
newdf = df[['title', 'rating']].groupby('title').count()

In [19]:
# The 20 titles with the highest rating counts.
numrating = newdf.nlargest(n=20, columns='rating')

In [20]:
numrating

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Pulp Fiction (1994),325
Forrest Gump (1994),311
"Shawshank Redemption, The (1994)",308
Jurassic Park (1993),294
"Silence of the Lambs, The (1991)",290
Star Wars: Episode IV - A New Hope (1977),273
"Matrix, The (1999)",261
Terminator 2: Judgment Day (1991),253
Braveheart (1995),248
Schindler's List (1993),248


In [21]:
# Build a TF-IDF matrix with one row per movie and one column per genre.
# Genres are '|'-separated, so a token is any run of characters other than '|'.
# The default token pattern would split on every non-word character and turn
# "Sci-Fi", "Film-Noir" and "(no genres listed)" into several unrelated
# pseudo-genres, which distorts the similarity scores.
cv = TfidfVectorizer(token_pattern=r"[^|]+")
coded = cv.fit_transform(movies['genres'])

## Suppose a user wants to watch a movie similar to Toy Story (1995). We can recommend movies by computing the similarity between Toy Story and every other movie, so we first need to compute the similarity scores.

In [22]:
# Pairwise dot products between all rows of the TF-IDF matrix.
# With no second argument, linear_kernel compares the matrix against itself.
similarity = linear_kernel(coded)

In [23]:
# Titles by row of `movies`, used to turn result positions back into titles.
titles = movies['title']
# Reverse lookup: title -> index label of that movie's row in `movies`.
indices = pd.Series(movies.index, index=titles)

In [25]:
def reccomendation(title, n=20):
    """Return the titles of the movies most similar to ``title``.

    Uses the module-level ``indices`` (title -> row), ``similarity``
    (pairwise score matrix) and ``titles`` (row -> title).

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``.
    n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``n`` titles ordered by decreasing similarity; ties keep
        their original row order. The queried movie is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    ind = indices[title]
    # A title that occurs more than once yields a Series of rows; use the first.
    if isinstance(ind, pd.Series):
        ind = ind.iloc[0]
    ind = int(ind)

    # Drop the queried movie by position rather than assuming it sorts first:
    # movies with identical genres tie at the top score, and the stable sort
    # would otherwise let the query itself appear among its recommendations.
    simscore = [
        (position, score)
        for position, score in enumerate(similarity[ind])
        if position != ind
    ]
    simscore.sort(key=lambda pair: pair[1], reverse=True)

    movieindices = [position for position, _ in simscore[:n]]
    return titles.iloc[movieindices]

In [26]:
reccomendation('Toy Story (1995)')

1815                                           Antz (1998)
2496                                    Toy Story 2 (1999)
2967        Adventures of Rocky and Bullwinkle, The (2000)
3166                      Emperor's New Groove, The (2000)
3811                                 Monsters, Inc. (2001)
6617     DuckTales: The Movie - Treasure of the Lost La...
6997                                      Wild, The (2006)
7382                                Shrek the Third (2007)
7987                        Tale of Despereaux, The (2008)
9215     Asterix and the Vikings (Astérix et les Viking...
9732                                          Turbo (2013)
10052                                Boxtrolls, The (2014)
1595                            Black Cauldron, The (1985)
1675                         Lord of the Rings, The (1978)
2696                 We're Back! A Dinosaur's Story (1993)
3420                      Atlantis: The Lost Empire (2001)
3535                          Land Before Time, The (198

## In the above output we can see that we got the desired movie recommendations.

In [27]:
from pandasql import sqldf

In [30]:
sqldf("select distinct title from df where title like 'Toy%'")

Unnamed: 0,title
0,Toy Story (1995)
1,Toy Story 2 (1999)
2,Toys (1992)
3,Toy Story 3 (2010)
4,Toy Soldiers (1991)
5,"Toy, The (1982)"


In [86]:
def getmovie(name):
    """Return the distinct titles in ``df`` that contain ``name``.

    The search runs as a SQL ``LIKE '%name%'`` query through ``sqldf``
    against the notebook's global ``df``.

    Parameters
    ----------
    name : str
        Text to search for. ``%`` and ``_`` keep their SQL wildcard meaning.

    Returns
    -------
    The result of ``sqldf`` for the query (one ``title`` column).
    """
    # The query is assembled as text, so single quotes must be doubled;
    # otherwise a name such as "Schindler's" ends the SQL string literal
    # early and the query fails (or runs something unintended).
    escaped = str(name).replace("'", "''")
    qr = "select distinct title from df where title like '%{0}%'".format(escaped)
    return sqldf(qr, globals())

In [90]:
getmovie("Batman")

Unnamed: 0,title
0,Batman (1989)
1,Batman Begins (2005)
2,Batman: Year One (2011)
3,Batman Forever (1995)
4,Batman Returns (1992)
5,Batman & Robin (1997)
6,Batman: Mask of the Phantasm (1993)
7,Batman (1966)
8,Batman Beyond: Return of the Joker (2000)
9,Batman & Mr. Freeze: Subzero (1998)
