[Reference](https://medium.com/rahasak/recommendation-system-with-content-based-filtering-500231e31a60)

In [3]:
# NOTE(review): no cell visible in this notebook imports platypus (the later
# cells import tabulate instead) -- confirm this install is actually needed.
!pip install platypus

Collecting platypus
  Downloading Platypus-1.0.tar.gz (9.1 kB)
Collecting ply
  Downloading ply-3.11-py2.py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 2.5 MB/s 
[?25hBuilding wheels for collected packages: platypus
  Building wheel for platypus (setup.py) ... [?25l[?25hdone
  Created wheel for platypus: filename=Platypus-1.0-py3-none-any.whl size=12131 sha256=c1aa704c69a7df0a8a4d95d6a83e3da81fca13cc444214b8149e7fe7508f0a34
  Stored in directory: /root/.cache/pip/wheels/6c/d7/9b/814f078a14758933a87a5d450635b9646932a685838eff281a
Successfully built platypus
Installing collected packages: ply, platypus
Successfully installed platypus-1.0 ply-3.11


In [6]:
import pandas as pd
from ast import literal_eval
from tabulate import tabulate

# load the movie dataset directly from the CSV hosted on gitlab
url = 'https://gitlab.com/rahasak-labs/dot/-/raw/master/src/main/resources/movie.csv'
df = pd.read_csv(url)

# pandas reads the list-type columns (genres, cast, keywords) as plain strings,
# so parse each of them back into a real Python list
for list_column in ('genres', 'cast', 'keywords'):
    df[list_column] = df[list_column].apply(literal_eval)

# pretty print the first rows of the feature columns
preview = df[['title', 'cast', 'director', 'keywords', 'genres']].head(5)
print(tabulate(preview, headers='keys', tablefmt='psql'))

+----+------------------------------------------+--------------------------------------------------------+-------------------+---------------------------------------------+--------------------------------------------+
|    | title                                    | cast                                                   | director          | keywords                                    | genres                                     |
|----+------------------------------------------+--------------------------------------------------------+-------------------+---------------------------------------------+--------------------------------------------|
|  0 | Avatar                                   | ['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver'] | James Cameron     | ['culture clash', 'future', 'space war']    | ['Action', 'Adventure', 'Fantasy']         |
|  1 | Pirates of the Caribbean: At World's End | ['Johnny Depp', 'Orlando Bloom', 'Keira Knightley']    | Gore Verbinski    | [

# Clean Data


In [7]:
# convert all strings to lower case and remove the spaces
def clean_feature(x):
    """Normalise a feature value by removing spaces and lower-casing it.

    Parameters
    ----------
    x : list, str or anything else
        A list of strings (e.g. cast, keywords, genres), a single string
        (e.g. director), or any other value such as a missing entry.

    Returns
    -------
    list of str or str
        * list input  -> list of cleaned strings; items that are not strings
          (for example ``None`` or ``NaN`` inside the list) are dropped rather
          than raising ``AttributeError``.
        * str input   -> the cleaned string.
        * other input -> the empty string ``''``.
    """
    if isinstance(x, list):
        # only string items can be cleaned; skip anything else so that one bad
        # entry does not abort the whole column
        return [item.replace(" ", "").lower() for item in x if isinstance(item, str)]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # non-string scalars (e.g. NaN for a missing director) become an empty string
    return ''

# run clean_feature over every value of each feature column
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    cleaned_column = df[feature].apply(clean_feature)
    df[feature] = cleaned_column

# pretty print the first rows of the cleaned feature columns
from tabulate import tabulate
preview_columns = ['title', 'cast', 'director', 'keywords', 'genres']
print(tabulate(df[preview_columns].head(5), headers='keys', tablefmt='psql'))

+----+------------------------------------------+-----------------------------------------------------+------------------+-------------------------------------------+-------------------------------------------+
|    | title                                    | cast                                                | director         | keywords                                  | genres                                    |
|----+------------------------------------------+-----------------------------------------------------+------------------+-------------------------------------------+-------------------------------------------|
|  0 | Avatar                                   | ['samworthington', 'zoesaldana', 'sigourneyweaver'] | jamescameron     | ['cultureclash', 'future', 'spacewar']    | ['action', 'adventure', 'fantasy']        |
|  1 | Pirates of the Caribbean: At World's End | ['johnnydepp', 'orlandobloom', 'keiraknightley']    | goreverbinski    | ['ocean', 'drugabuse', 'exoticisl

# Add Feature Column


In [8]:
def add_feature_col(x):
    """Build the combined, space-separated feature string for one row.

    ``x`` is indexed by 'keywords', 'cast', 'director' and 'genres'. The three
    list fields are each joined with single spaces, and the four resulting
    pieces are then joined with single spaces in the order
    keywords, cast, director, genres.
    """
    pieces = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(pieces)
# apply add_feature_col row by row (axis=1) and store the result as a new column
feature_strings = df.apply(add_feature_col, axis=1)
df['features'] = feature_strings
df['features'].head()

0    cultureclash future spacewar samworthington zo...
1    ocean drugabuse exoticisland johnnydepp orland...
2    spy basedonnovel secretagent danielcraig chris...
3    dccomics crimefighter terrorist christianbale ...
4    basedonnovel mars medallion taylorkitsch lynnc...
Name: features, dtype: object

# Create Matrix


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# create count matrix and cosine similarity
# (one row per movie in count_matrix; cosine_sim[i][j] is the similarity
# between the feature strings of rows i and j)
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['features'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# reset index of the data frame and construct reverse mapping (title -> row position)
# drop=True keeps the old index from being inserted as an extra 'index' column,
# which also makes this cell safe to re-run (without it, repeated runs add
# 'index', then 'level_0', and then fail with ValueError)
df = df.reset_index(drop=True)
indices = pd.Series(df.index, index=df['title'])

# Find Recommendations


In [10]:
# function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Relies on the notebook-level ``indices`` (title -> row position) and ``df``
    (movie data frame with a 'title' column).

    Parameters
    ----------
    title : str
        Title of the movie to find recommendations for.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; ``cosine_sim[i]`` holds the scores of
        row ``i`` against every row.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``df['title']`` restricted to the ``top_n`` most similar rows, ordered
        from most to least similar. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # get the index of the movie that matches the title
    idx = indices[title]
    if hasattr(idx, 'iloc'):
        # duplicate titles make the lookup return a Series; use the first match
        idx = idx.iloc[0]

    # get the pairwise similarity scores of all *other* movies with that movie;
    # the movie itself is excluded explicitly rather than assumed to sort first,
    # because a tie with another movie could otherwise leave it in the results
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # sort the movies based on the similarity scores (stable: ties keep row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # get the row positions of the top_n most similar movies
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    # return the most similar movies from the data frame
    return df['title'].iloc[movie_indices]
  
  
# recommendations for the movie JFK
get_recommendations('JFK', cosine_sim)

884              Zero Dark Thirty
1528                     Criminal
647            World Trade Center
737     Jack Ryan: Shadow Recruit
2008        In the Valley of Elah
3172                The Contender
940                       Syriana
991                     Fair Game
1091                        Nixon
1187              Bridge of Spies
Name: title, dtype: object

In [11]:
# recommendations for the movie The Godfather
get_recommendations('The Godfather', cosine_sim)

867      The Godfather: Part III
2731      The Godfather: Part II
4638    Amidst the Devil's Wings
2649           The Son of No One
1525              Apocalypse Now
1018             The Cotton Club
1170     The Talented Mr. Ripley
1209               The Rainmaker
1394               Donnie Brasco
1850                    Scarface
Name: title, dtype: object

In [12]:
# recommendations for the movie Avatar
get_recommendations('Avatar',cosine_sim)

206                         Clash of the Titans
71        The Mummy: Tomb of the Dragon Emperor
786                           The Monkey King 2
103                   The Sorcerer's Apprentice
131                                     G-Force
215      Fantastic 4: Rise of the Silver Surfer
466                            The Time Machine
715                           The Scorpion King
1      Pirates of the Caribbean: At World's End
5                                  Spider-Man 3
Name: title, dtype: object