# Data Preprocessing

## Importing libraries and data

In [1]:
import numpy as np
import pandas as pd
import ast # it will help our 'genre' to be passed as a list of dictionary and not strings

In [2]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [3]:
# movies.head(1)

In [4]:
movies = movies.merge(credits, on = "title")
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

## Only keeping relevant attributes

***These are all string based***

```
Genres, Keywords, Title, Overview, Cast, Crew
```


In [5]:
movies = movies[["movie_id", "genres", "keywords", "title", "overview", "cast", "crew", "runtime", "vote_average", "vote_count", "release_date", "revenue"]]
# movies.info()

In [6]:
# movies.isnull().sum()

In [7]:
movies.dropna(inplace=True) # It will drop all rows with null values

In [8]:
# movies.duplicated().sum()

In [9]:
movies.iloc[0].keywords

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

## Only taking names from 'genres' and 'keywords'

[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]

to this

["Action", "Adventure", "Fantasy", "Science Fiction"]

In [10]:
def convert(obj): # takes 'object'
  L = []
  for i in ast.literal_eval(obj): # ast is used cauz doing things this way
                                  # apparantly convert obj to string but it convert them back to Dictionary
    L.append(i["name"])
  return L


In [11]:
movies['genres'] = movies['genres'].apply(convert) # It applies the function on it

Also for Keywords

In [12]:
movies['keywords'] = movies['keywords'].apply(convert) # It applies the function on it

## For 'cast' we will only use top 3


In [13]:
def convert3(obj):
  L = []
  count = 0
  for i in ast.literal_eval(obj):# ast is used cauz doing things this way
                                  # apparantly convert obj to string but it convert them back to Dictionary
    if count != 3:
      L.append(i["name"])
      count+=1
    else:
      break
  return L


In [14]:
movies['cast'] = movies['cast'].apply(convert3)

In [15]:
# movies.head()

## For 'crew' we will only be taking Director

In [16]:
def Director(obj):
  L = []
  for i in ast.literal_eval(obj):# ast is used cauz doing things this way
                                 # apparantly convert obj to string but it convert them back to Dictionary
    if i['job'] == 'Director':
      L.append(i['name'])
      break
  return L

In [17]:
movies['crew'] = movies['crew'].apply(Director)
movies.head()

Unnamed: 0,movie_id,genres,keywords,title,overview,cast,crew,runtime,vote_average,vote_count,release_date,revenue
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],162.0,7.2,11800,2009-12-10,2787965087
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],169.0,6.9,4500,2007-05-19,961000000
2,206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],148.0,6.3,4466,2015-10-26,880674609
3,49026,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],165.0,7.6,9106,2012-07-16,1084939099
4,49529,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],132.0,6.1,2124,2012-03-07,284139100


## Here we need the overview as list (Explained later why)

In [18]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())
# movies.head()

## Removing all the spaces between words.
Lets take an example, 'Tom Holland' is not same person as 'Tom Cruise' but our model will make a assumption that as 'Tom' is common these movies are correlated and hence similar.

To eliminate this possibility we will conjugate the names by replacing " " to ""(nothing)

In [19]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ", "") for i in x])
# movies.head()

## Making TAGS
This will represent all the attributes in one,
1. cast
2. crew
3. overview
4. keywords
5. genres

Thats because doing this we only will have one list of relavent words to our model.

The Explaination : Because of this we were converting the string (overview) in list.

In [20]:
movies['tags'] = movies['cast'] + movies['crew'] + movies['overview'] + movies['keywords'] + movies['genres']

In [21]:
movies.drop(['cast', 'crew', 'overview', 'keywords', 'genres'], axis = 1) # I didnt droped all the numerical
                                                                          # column cauz i will use them in future.

Unnamed: 0,movie_id,title,runtime,vote_average,vote_count,release_date,revenue,tags
0,19995,Avatar,162.0,7.2,11800,2009-12-10,2787965087,"[SamWorthington, ZoeSaldana, SigourneyWeaver, ..."
1,285,Pirates of the Caribbean: At World's End,169.0,6.9,4500,2007-05-19,961000000,"[JohnnyDepp, OrlandoBloom, KeiraKnightley, Gor..."
2,206647,Spectre,148.0,6.3,4466,2015-10-26,880674609,"[DanielCraig, ChristophWaltz, LéaSeydoux, SamM..."
3,49026,The Dark Knight Rises,165.0,7.6,9106,2012-07-16,1084939099,"[ChristianBale, MichaelCaine, GaryOldman, Chri..."
4,49529,John Carter,132.0,6.1,2124,2012-03-07,284139100,"[TaylorKitsch, LynnCollins, SamanthaMorton, An..."
...,...,...,...,...,...,...,...,...
4804,9367,El Mariachi,81.0,6.6,238,1992-09-04,2040920,"[CarlosGallardo, JaimedeHoyos, PeterMarquardt,..."
4805,72766,Newlyweds,85.0,5.9,5,2011-12-26,0,"[EdwardBurns, KerryBishé, MarshaDietlein, Edwa..."
4806,231617,"Signed, Sealed, Delivered",120.0,7.0,6,2013-10-13,0,"[EricMabius, KristinBooth, CrystalLowe, ScottS..."
4807,126186,Shanghai Calling,98.0,5.7,7,2012-05-03,0,"[DanielHenney, ElizaCoupe, BillPaxton, DanielH..."


In [22]:
movies['tags'] = movies['tags'].apply(lambda x:" ".join(x)) # using lambda function to
                                                            # convert the list into string again.

In [23]:
movies['tags'] = movies['tags'].apply(lambda x:x.lower()) # its recommended to conert
                                                          # all letters to lower letters
# movies['tags']

## Stemming
Its the process of eleminating words that have same essence but different form like act, acting or activity, activities.

These dont need to be taken accounted for twice.

In [24]:
import nltk
from nltk.stem.porter import PorterStemmer # These are the library that helps in this
ps = PorterStemmer()

In [25]:
def Stem(text):
  y = []
  for i in text.split(): # split the text in list form and then i iterrates them
    y.append(ps.stem(i)) # every word , 'i' is being checked by a method of PorterStemmer
                         # and then appended at the end of list 'y' we created

  return " ".join(y)     # then we return the y in string form

In [26]:
movies['tags'] = movies['tags'].apply(Stem) # appling Stem fumction

In [27]:
# print(movies['tags'])

# Vectorization by Bag of Words

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words='english') # here only this can remove
                                                                # all stop word in english language from our model
                                                                # and we are keeping our bag size 5000
vectors = cv.fit_transform(movies['tags']).toarray()

In [30]:
print(vectors)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [31]:
words = cv.get_feature_names_out()
print(words)

['000' '007' '10' ... 'zone' 'zoo' 'zooeydeschanel']


# Angle difference of each vector created
Now this case is way different than my training as i dont have any final data to label but just have to make a educated guess.

In my nlp models i just used 1/0 but here i have to give a definitive anwser and the correctness will be judged in real world.

So , in other words i dont have any way to check my models accuracy or any thing.

In [32]:
# print(vectors[0])
# print(vectors[0].max())

In [33]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)
print(similarity)

[[1.         0.08346223 0.0860309  ... 0.04499213 0.         0.        ]
 [0.08346223 1.         0.06063391 ... 0.02378257 0.         0.02615329]
 [0.0860309  0.06063391 1.         ... 0.02451452 0.         0.        ]
 ...
 [0.04499213 0.02378257 0.02451452 ... 1.         0.03962144 0.04229549]
 [0.         0.         0.         ... 0.03962144 1.         0.08714204]
 [0.         0.02615329 0.         ... 0.04229549 0.08714204 1.        ]]


In [39]:
def Recommend(movie):
  movie_index = movies[movies['title'] == movie].index[0]
  movies_list = sorted(list(enumerate(similarity[movie_index])), reverse = True, key= lambda x:x[1])[1:6]
  for i in movies_list:
    print(movies.iloc[i[0]].title)



In [40]:
print(movies_list)

[(1216, 0.28676966733820225), (2409, 0.26901379342448517), (3730, 0.2605130246476754), (507, 0.255608593705383), (539, 0.25038669783359574)]


Now lets find 5 closest movies of every movies

In [None]:
Recommend('Interstellar')