In [3]:
import pandas as pd
import numpy as np

Reading the Database

In [4]:
movies = pd.read_csv("/content/imdb_movies.csv")
movies.head()

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [5]:
movies.columns

Index(['names', 'date_x', 'score', 'genre', 'overview', 'crew', 'orig_title',
       'status', 'orig_lang', 'budget_x', 'revenue', 'country'],
      dtype='object')

In [6]:
movies.isnull().sum()

names          0
date_x         0
score          0
genre         85
overview       0
crew          56
orig_title     0
status         0
orig_lang      0
budget_x       0
revenue        0
country        0
dtype: int64

In [7]:
movies.shape

(10178, 12)

Dropping columns -> orig_title, status, budget_x

In [8]:
movies.drop(['status','orig_title','budget_x'],axis=1,inplace=True)

In [9]:
movies.shape

(10178, 9)

Dropping all the null value rows from the columns

In [10]:
for i in movies.columns:
  movies.dropna(subset=i, axis = 0, inplace = True)

In [11]:
movies.isnull().sum()

names        0
date_x       0
score        0
genre        0
overview     0
crew         0
orig_lang    0
revenue      0
country      0
dtype: int64

In [12]:
movies.shape

(10052, 9)

Checking for duplicate values

In [13]:
movies.duplicated().sum()

0

In [14]:
movies.head(1)

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_lang,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",English,271616668.0,AU


Converting the data into processing form

In [15]:
movies['crew'] = movies['crew'].apply(lambda x: x.split(","))

In [16]:
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x[0:5]])

In [17]:
movies['crew'] = movies['crew'].apply(lambda x: " ".join(x))

In [18]:
movies['date_x'] = movies['date_x'].apply(lambda x: x[-5:-1])

In [19]:
movies['genre'] = movies['genre'].apply(lambda x: x.replace(",",""))

In [20]:
movies.dtypes

names         object
date_x        object
score        float64
genre         object
overview      object
crew          object
orig_lang     object
revenue      float64
country       object
dtype: object

In [21]:
movies['score'] = movies['score'].astype('object')
movies['revenue'] = movies['revenue'].astype('object')

In [22]:
movies.columns

Index(['names', 'date_x', 'score', 'genre', 'overview', 'crew', 'orig_lang',
       'revenue', 'country'],
      dtype='object')

In [23]:
movies['genre'] = movies['genre'].apply(lambda x: x.replace('\xa0',' '))

In [25]:
movies['score'] = movies['score'].apply(lambda x: str(x))

In [26]:
movies['tags'] = movies['date_x']  + " " + movies['score'] + " " + movies['genre'] +" " + movies['overview'] +" " + movies['crew'] +" " + movies['orig_lang'] +" " +  movies['country']

In [27]:
movies['tags'][0]

'2023 73.0 Drama Action After dominating the boxing world, Adonis Creed has been thriving in both his career and family life. When a childhood friend and former boxing prodigy, Damien Anderson, resurfaces after serving a long sentence in prison, he is eager to prove that he deserves his shot in the ring. The face-off between former friends is more than just a fight. To settle the score, Adonis must put his future on the line to battle Damien — a fighter who has nothing to lose. MichaelB.Jordan AdonisCreed TessaThompson BiancaTaylor JonathanMajors  English AU'

In [28]:
movies ['tags'] = movies['tags'].apply(lambda x: x.lower())

In [29]:
new_movies = movies[['names','tags']]

In [30]:
new_movies.shape

(10052, 2)

In [31]:
from nltk.stem.porter import PorterStemmer
st = PorterStemmer()

In [32]:
def stem(txt):
  y = []
  for i in txt.split():
    y.append(st.stem(i))
  return " ".join(y)

In [33]:
new_movies['tags'] = new_movies['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags'] = new_movies['tags'].apply(stem)


In [34]:
new_movies['tags'][0]

'2023 73.0 drama action after domin the box world, adoni creed ha been thrive in both hi career and famili life. when a childhood friend and former box prodigy, damien anderson, resurfac after serv a long sentenc in prison, he is eager to prove that he deserv hi shot in the ring. the face-off between former friend is more than just a fight. to settl the score, adoni must put hi futur on the line to battl damien — a fighter who ha noth to lose. michaelb.jordan adoniscre tessathompson biancataylor jonathanmajor english au'

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 7000, stop_words='english')

In [36]:
vectors = cv.fit_transform(new_movies['tags']).toarray()

In [37]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [38]:
len(cv.get_feature_names_out())

7000

In [39]:
from sklearn.metrics.pairwise import cosine_similarity
cos_similarity = cosine_similarity(vectors)

In [40]:
cos_similarity[0]

array([1.        , 0.14149999, 0.09362574, ..., 0.04637389, 0.08712137,
       0.08712137])

In [41]:
def recommend(movie):
  ind = new_movies[new_movies['names'] == movie].index[0]
  similar = cos_similarity[ind]
  close_movies = sorted(list(enumerate(similar)),reverse=True, key=lambda x: x[1])[1:6]
  name = []
  for i in close_movies:
    name.append(new_movies['names'][i[0]])
  return name

In [42]:
recommend('Puss in Boots: The Last Wish')

['Die Hard',
 "Mother's Friend",
 'Separation',
 'Godzilla: Planet of the Monsters',
 'Me contro Te: Il film - Persi nel tempo']

In [43]:
import pickle
with open("Database.pkl","wb") as f:
  pickle.dump(new_movies,f)

with open("Similarity.pkl","wb") as f:
  pickle.dump(cos_similarity,f)