In [1]:
import pandas as pd

In [2]:
movies = pd.read_csv('data\\movies_dataset.csv').drop('Unnamed: 0', axis=1)

In [3]:
movies.head()

Unnamed: 0,id,title,genres,status,overview,popularity,original_language,vote_average,vote_count
0,862,Toy Story,"Animation,Comedy,Family",Released,"Led by Woody, Andy's toys live happily in his ...",21.946943,en,7.7,5415
1,8844,Jumanji,"Adventure,Fantasy,Family",Released,When siblings Judy and Peter discover an encha...,17.015539,en,6.9,2413
2,15602,Grumpier Old Men,"Romance,Comedy",Released,A family wedding reignites the ancient feud be...,11.7129,en,6.5,92
3,31357,Waiting to Exhale,"Comedy,Drama,Romance",Released,"Cheated on, mistreated and stepped on, the wom...",3.859495,en,6.1,34
4,11862,Father of the Bride Part II,Comedy,Released,Just when George Banks has recovered from his ...,8.387519,en,5.7,173


In [4]:
movies.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,43006.0,43006.0,43006.0,43006.0
mean,104584.0,3.070111,5.695954,115.983932
std,112344.0,6.13946,1.807456,504.445506
min,2.0,0.0,0.0,0.0
25%,25022.75,0.450422,5.1,4.0
50%,55734.5,1.234917,6.0,11.0
75%,143838.8,4.02194,6.8,37.0
max,2012929.0,547.488298,10.0,14075.0


In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43006 entries, 0 to 43005
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 43006 non-null  int64  
 1   title              43006 non-null  object 
 2   genres             43006 non-null  object 
 3   status             43006 non-null  object 
 4   overview           43006 non-null  object 
 5   popularity         43006 non-null  float64
 6   original_language  43006 non-null  object 
 7   vote_average       43006 non-null  float64
 8   vote_count         43006 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 3.0+ MB


<h2>Features Selection Part</h2>

In [6]:
movies = movies[['id', 'title', 'overview', 'genres']]

In [7]:
movies.sample(5)

Unnamed: 0,id,title,overview,genres
35051,33297,Her Name Is Sabine,"A sensitive portrait of Sabine Bonnaire, the a...",Documentary
17576,147105,The Enchanted World of Danny Kaye: The Emperor...,This is one of the 'Animagical' titles from th...,"Animation,Family"
20737,191231,Sol,A group of students are sent to a faraway plan...,Science Fiction
28255,37582,Sleeping Dogs,Recluse Smith (Sam Neill) is drawn into a revo...,"Action,Thriller"
35421,296225,Drei weiße Birken,Not thing,Comedy


In [8]:
movies['tags'] = movies['overview'] + ' ' + movies['genres']

In [9]:
df = movies[['id', 'title', 'tags']]

In [10]:
df

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...
43001,222848,Caged Heat 3000,It's the year 3000 AD. The world's most danger...
43002,30840,Robin Hood,"Yet another version of the classic epic, with ..."
43003,439050,Subdue,Rising and falling between a man and woman. Dr...
43004,111109,Century of Birthing,An artist struggles to finish his work while a...


<h2>Convert text to vector</h2>

<ol>
<p><h5>Techniques</h5></p>
<li>Bag of word</li>
<li>TF-IDF</li></ol>

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Create count vectorizer with max features=43006
cvt = CountVectorizer(max_features=10000, stop_words='english')

In [13]:
cvt

In [14]:
feature_vector = cvt.fit_transform(df['tags'].values.astype('U')).toarray()

In [15]:
feature_vector.shape

(43006, 10000)

<h2>Calculate to cosine</h2>

<p><strong>Target:</strong> To get the theta between two vector, and show the similarity between two movies</p>

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
similar = cosine_similarity(feature_vector)

In [18]:
similar

array([[1.        , 0.04174829, 0.04875149, ..., 0.0571662 , 0.        ,
        0.        ],
       [0.04174829, 1.        , 0.07784989, ..., 0.06085806, 0.0496904 ,
        0.        ],
       [0.04875149, 0.07784989, 1.        , ..., 0.07106691, 0.        ,
        0.        ],
       ...,
       [0.0571662 , 0.06085806, 0.07106691, ..., 1.        , 0.13608276,
        0.10540926],
       [0.        , 0.0496904 , 0.        , ..., 0.13608276, 1.        ,
        0.0860663 ],
       [0.        , 0.        , 0.        , ..., 0.10540926, 0.0860663 ,
        1.        ]])

In [19]:
similar.shape

(43006, 43006)

In [25]:
pd.DataFrame(similar)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42996,42997,42998,42999,43000,43001,43002,43003,43004,43005
0,1.000000,0.041748,0.048751,0.029854,0.026003,0.000000,0.036155,0.040423,0.000000,0.000000,...,0.028583,0.032125,0.00000,0.000000,0.000000,0.000000,0.000000,0.057166,0.000000,0.000000
1,0.041748,1.000000,0.077850,0.000000,0.000000,0.080322,0.000000,0.064550,0.175682,0.039841,...,0.030429,0.000000,0.00000,0.000000,0.000000,0.039148,0.000000,0.060858,0.049690,0.000000
2,0.048751,0.077850,1.000000,0.074227,0.064651,0.000000,0.089893,0.050252,0.000000,0.000000,...,0.035533,0.039936,0.00000,0.000000,0.000000,0.000000,0.034139,0.071067,0.000000,0.000000
3,0.029854,0.000000,0.074227,1.000000,0.039590,0.076584,0.110096,0.061546,0.000000,0.000000,...,0.043519,0.097823,0.00000,0.000000,0.000000,0.055989,0.083624,0.174078,0.071067,0.055048
4,0.026003,0.000000,0.064651,0.039590,1.000000,0.000000,0.095893,0.000000,0.065653,0.000000,...,0.037905,0.042601,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43001,0.000000,0.039148,0.000000,0.055989,0.000000,0.047167,0.000000,0.037905,0.030949,0.035093,...,0.080408,0.000000,0.00000,0.035093,0.000000,1.000000,0.025751,0.000000,0.000000,0.067806
43002,0.000000,0.000000,0.034139,0.083624,0.000000,0.070447,0.050637,0.056614,0.046225,0.052414,...,0.040032,0.044992,0.00000,0.000000,0.113228,0.025751,1.000000,0.080064,0.065372,0.101274
43003,0.057166,0.060858,0.071067,0.174078,0.000000,0.146647,0.000000,0.176777,0.000000,0.000000,...,0.166667,0.093659,0.00000,0.000000,0.000000,0.000000,0.080064,1.000000,0.136083,0.105409
43004,0.000000,0.049690,0.000000,0.071067,0.000000,0.059868,0.000000,0.048113,0.000000,0.089087,...,0.068041,0.076472,0.00000,0.000000,0.000000,0.000000,0.065372,0.136083,1.000000,0.086066


<h2>Using the distance between two movies to recommend the relative movies</h2>

In [20]:
distances = sorted(list(enumerate(similar[12400])), reverse=True, key=lambda x: x[1])

In [21]:
distances

[(12400, 1.0),
 (41885, 0.40422604172722154),
 (20296, 0.37573457465108967),
 (25706, 0.3666793988112845),
 (12834, 0.36380343755449945),
 (22168, 0.36380343755449945),
 (25705, 0.358853500273005),
 (25713, 0.35007002100700246),
 (30761, 0.35007002100700246),
 (2446, 0.3363363969981562),
 (9258, 0.32539568672798425),
 (34438, 0.32539568672798425),
 (37956, 0.32539568672798425),
 (37957, 0.32539568672798425),
 (41643, 0.32539568672798425),
 (14890, 0.32338083338177726),
 (18841, 0.3067859955389482),
 (27993, 0.3067859955389482),
 (33491, 0.3067859955389482),
 (7333, 0.3034330424545042),
 (19629, 0.30316953129541624),
 (22670, 0.30316953129541624),
 (41768, 0.29704426289300234),
 (42088, 0.29704426289300234),
 (3496, 0.29411764705882354),
 (14308, 0.29411764705882354),
 (25303, 0.29411764705882354),
 (27741, 0.2941176470588235),
 (3654, 0.29250896965085227),
 (8196, 0.29250896965085227),
 (13360, 0.29250896965085227),
 (42016, 0.29250896965085227),
 (25702, 0.2858309752375148),
 (40885, 

In [22]:
for idx, distance in distances[0:6]:
    print(df.iloc[idx].title)

Iron Man
The Wild World of Batwoman
Iron Man 3
Avatar 2
Fallout
Ultraman
