In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import mysql.connector
from mysql.connector import Error

In [2]:
# Connecting to the database
# Enter your own host name, user name, password, database name
mymoviesafrica = mysql.connector.connect(host="localhost", user="root", passwd="#####", database="mymoviesafrica")

In [3]:
# Loading the data
content = pd.read_sql("SELECT id, title, synopsis, genres, tags FROM content", mymoviesafrica)

In [4]:
# Information on the data
content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        101 non-null    int64 
 1   title     101 non-null    object
 2   synopsis  101 non-null    object
 3   genres    101 non-null    object
 4   tags      101 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.1+ KB


In [5]:
# Previewing the data
content.head()

Unnamed: 0,id,title,synopsis,genres,tags
0,1,NAIROBI HALF LIFE,"The Award-winning Story of a young, aspiring A...","[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Joseph Wairimu, Olwenya Maina, Nancy Wanjiku K..."
1,2,KATUTURA,"In this Township of Windhoek, ex-convict Dangi...","[""Action"",""Crime"",""Drama"",""Suspense""]","Chops Tshoopara, Obed Emvula, Gift Uzera, Odil..."
2,3,SOUL BOY,This is the story of 14 year-old Abila (Samson...,"[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Samson Odhiambo, Leila Dayan Opou, Krysteen Sa..."
3,4,KATI KATI,"A young woman - Kaleche (Nyokabi Gethaiga), wi...","[""Drama"",""Madaraka Day Weekend Movie Marathon ...","Nyokabi Gethaiga, Elsaphan Njora, Paul Ogola, ..."
4,5,SOMETHING NECESSARY,"Award-winning Director Judy Kibinge, tells the...","[""Crime"",""Drama"",""Family"",""Madaraka Day Weeken...","Hilda Jepkoech, Kipngeno Kirui Duncan, Carolyn..."


# Data Cleaning

Cleaning synopsis

In [76]:
# Sample synopsis to track cleaning process
content['synopsis'][3]

'A young woman - Kaleche (Nyokabi Gethaiga), with No Memory of her Life or Death, is Helped with Assimilation to the Afterlife by a Ghost called Thoma (Elsaphan Njora).'

In [77]:
# Removing actor's names from the synopsis which are in brackets. Their names are already in tags
content['synopsis'] = content['synopsis'].apply(lambda x : re.sub(r"\([^()]*\)", "", x))

In [78]:
# Sample synopsis to track cleaning process
content['synopsis'][3]

'A young woman - Kaleche , with No Memory of her Life or Death, is Helped with Assimilation to the Afterlife by a Ghost called Thoma .'

In [79]:
# Lowercasing and removing all punctuation marks
content['synopsis'] = content['synopsis'].apply(lambda x : str.lower(re.sub('[^\w\s]', '', x)))

In [80]:
# Sample synopsis to track cleaning process
content['synopsis'][3]

'a young woman  kaleche  with no memory of her life or death is helped with assimilation to the afterlife by a ghost called thoma '

In [81]:
# Removing stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [82]:
content['synopsis'] = content['synopsis'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [83]:
# Sample synopsis to track cleaning process
content['synopsis'][3]

'young woman kaleche memory life death helped assimilation afterlife ghost called thoma'

Cleaning genre

In [84]:
# Sample genres to track cleaning process
content['genres'][0]

'["#OwnForLife Offer: KES199 or $1.99 (Diaspora)","26% Off in our Birthday Month","Action","Adventure","Crime","Drama","Suspense"]'

In [85]:
# Removing the quotation marks
content['genres'] = content['genres'].apply(lambda x: x.replace('"', ''))

In [86]:
# Sample genres to track cleaning process
content['genres'][0]

'[#OwnForLife Offer: KES199 or $1.99 (Diaspora),26% Off in our Birthday Month,Action,Adventure,Crime,Drama,Suspense]'

In [87]:
# The brackets are interpretted as part of the string rather than as a list
content.genres[0][0]

'['

In [88]:
# Convert genres into a list of strings
content['genres'] = content['genres'].apply(lambda x:x[1:-1].split(','))

In [89]:
# Sample genres to track cleaning process
content.genres[0]

['#OwnForLife Offer: KES199 or $1.99 (Diaspora)',
 '26% Off in our Birthday Month',
 'Action',
 'Adventure',
 'Crime',
 'Drama',
 'Suspense']

In [90]:
# Filtering in the main genres
genres = ["Action", "Drama", "Romance", "Comedy", "Crime", "Family", "Adventure", "Thriller", "Suspense", "Supernatural",
"Political", "Activism", "True Story", "Chick Flick", "Sports", "Short & Sweet", "Feel Good", "Musical", "Animation", 
"History", "Super Hero", "Fantasy", "Feel Good", ]
content['genres'] = content['genres'].apply(lambda x :[i for i in x if i in genres])

In [91]:
# Sample genres to track cleaning process
content['genres'][0]

['Action', 'Adventure', 'Crime', 'Drama', 'Suspense']

In [92]:
# lowercasing the genres
content['genres'] = content['genres'].apply(lambda x : [str.lower(i.replace(" ", "")) for i in x])

In [93]:
# Sample genres to track cleaning process
content['genres'][0]

['action', 'adventure', 'crime', 'drama', 'suspense']

In [94]:
# Converting the list to strings for easier joining later on
content['genres'] = [' '.join(map(str, l)) for l in content['genres']]

In [95]:
# Sample genres to track cleaning process
content['genres'][0]

'action adventure crime drama suspense'

Cleaning tags

In [96]:
# Removing the spacing between words 
# so that the vectorizer does not count the Brenda in "Brenda Wairimu" and "Brenda Shiru" as the same
content['tags'] = content['tags'].apply(lambda x : str.lower(x.replace(" ", "").replace(",", " ")))

In [97]:
content['tags'][0]

'josephwairimu olwenyamaina nancywanjikukaranja mugambinthiga toshgitonga'

In [98]:
# Joining all the columns together by a space so that we can use all the columns in making recommendations
# Below is a function that we do this
def create_soup(x):
    return ''.join(x['synopsis']) + ' ' + ''.join(x['genres']) + ' ' + ''.join(x['tags'])
# New column with the soup
content['soup'] = content.apply(create_soup, axis=1)

In [99]:
content['soup'][0]

'awardwinning story young aspiring actor upcountry kenya dreams becoming success big city pursuit chagrin brother parents makes way nairobi city opportunity action adventure crime drama suspense josephwairimu olwenyamaina nancywanjikukaranja mugambinthiga toshgitonga'

Building a recommender based on synopsis only

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
#Replace NaN with an empty string
content['synopsis'] = content['synopsis'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(content['synopsis'])

tfidf_matrix.shape

(101, 1887)

In [101]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
synopsis_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

In [102]:
# Converting it to a dataframe with the movie ids as the indices and columns
synopsis_similarity = pd.DataFrame(synopsis_similarity, columns=content['id'].values, index=content['id'].values)

In [139]:
synopsis_similarity

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,99,100,101,102,103,104,105,106,107,108
1,1.000000,0.000000,0.008599,0.019240,0.058873,0.059856,0.010513,0.029560,0.009745,0.065129,...,0.032757,0.024163,0.012853,0.012532,0.056005,0.029064,0.049161,0.015818,0.009830,0.012388
2,0.000000,1.000000,0.000000,0.024457,0.034286,0.023484,0.000000,0.024765,0.000000,0.008724,...,0.016578,0.000000,0.016661,0.000000,0.043163,0.000000,0.000000,0.000000,0.085778,0.015746
3,0.008599,0.000000,1.000000,0.023152,0.029668,0.041518,0.034606,0.000000,0.005591,0.033381,...,0.015073,0.000000,0.037792,0.008206,0.077684,0.008337,0.009892,0.028886,0.030605,0.007107
4,0.019240,0.024457,0.023152,1.000000,0.044683,0.040194,0.000000,0.012775,0.000000,0.018909,...,0.010233,0.000000,0.000000,0.008044,0.031629,0.018655,0.000000,0.020306,0.048939,0.018227
5,0.058873,0.034286,0.029668,0.044683,1.000000,0.043433,0.005754,0.016177,0.075271,0.041501,...,0.037514,0.021552,0.016444,0.006859,0.036246,0.015906,0.000000,0.008657,0.063192,0.014550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,0.029064,0.000000,0.008337,0.018655,0.015906,0.048355,0.010194,0.009649,0.009448,0.006655,...,0.014299,0.000000,0.000000,0.047126,0.049157,1.000000,0.025681,0.015337,0.040096,0.012011
105,0.049161,0.000000,0.009892,0.000000,0.000000,0.031762,0.000000,0.000000,0.040728,0.014873,...,0.048715,0.000000,0.051392,0.007208,0.013206,0.025681,1.000000,0.000000,0.016388,0.026844
106,0.015818,0.000000,0.028886,0.020306,0.008657,0.008520,0.040579,0.010503,0.000000,0.018771,...,0.012142,0.000000,0.034977,0.006613,0.031399,0.015337,0.000000,1.000000,0.026884,0.000000
107,0.009830,0.085778,0.030605,0.048939,0.063192,0.020535,0.088286,0.006527,0.000000,0.056781,...,0.054048,0.000000,0.062907,0.040190,0.019512,0.040096,0.016388,0.026884,1.000000,0.000000


In [140]:
# Pickling the similarity dataframe for use in deployment.
synopsis_similarity.to_pickle("./synopsis_similarity.pkl")

In [141]:
# Putting it all in a function
def synopsis_recommender(title):
    title_id = content[content['title'] == title]['id'].values
    similar_id = synopsis_similarity[title_id].nlargest(6, title_id).index[1:]
    similar_title = content[content['id'].isin(similar_id)]['title'].values.tolist()
    return similar_title

In [142]:
synopsis_recommender('DISCONNECT')

['KISSING SHADOWS',
 'Living in Bondage: Breaking Free',
 'BLURRED',
 'LOVE, ZAWADI x BLURRED',
 'Christmas Spirit']

Building a recommender based on genre only

In [106]:
from sklearn.feature_extraction.text import CountVectorizer

genre_count = CountVectorizer()
genre_matrix = genre_count.fit_transform(content['genres'])

In [107]:
# Compute the cosine similarity matrix based on the genre_matrix
from sklearn.metrics.pairwise import cosine_similarity

genre_similarity = pd.DataFrame(cosine_similarity(genre_matrix, genre_matrix), index=content['id'].values, columns=content['id'].values)

In [108]:
genre_similarity.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,99,100,101,102,103,104,105,106,107,108
1,1.0,0.894427,0.547723,0.447214,0.447214,0.547723,0.730297,0.0,0.2,0.223607,...,0.0,0.0,0.258199,0.0,0.258199,0.258199,0.258199,0.223607,0.258199,0.316228
2,0.894427,1.0,0.408248,0.5,0.5,0.612372,0.612372,0.0,0.223607,0.25,...,0.0,0.0,0.288675,0.0,0.288675,0.288675,0.288675,0.25,0.288675,0.353553
3,0.547723,0.408248,1.0,0.816497,0.408248,0.333333,0.666667,0.0,0.365148,0.612372,...,0.0,0.0,0.235702,0.0,0.235702,0.235702,0.235702,0.408248,0.235702,0.288675
4,0.447214,0.5,0.816497,1.0,0.25,0.408248,0.408248,0.0,0.223607,0.5,...,0.0,0.0,0.288675,0.0,0.288675,0.288675,0.288675,0.25,0.288675,0.353553
5,0.447214,0.5,0.408248,0.25,1.0,0.612372,0.612372,0.288675,0.67082,0.5,...,0.0,0.0,0.288675,0.0,0.288675,0.288675,0.288675,0.5,0.288675,0.353553


In [143]:
# Pickling the similarity dataframe for use in deployment.
genre_similarity.to_pickle("./genre_similarity.pkl")

In [144]:
def genre_reccomender(title):
    title_id = content[content['title'] == title]['id'].values
    similar_id = genre_similarity[title_id].nlargest(6, title_id).index[1:]
    similar_title = content[content['id'].isin(similar_id)]['title'].values.tolist()
    return similar_title

In [145]:
genre_reccomender('DISCONNECT')

['SIPHO DLAMINI',
 'KISSING SHADOWS',
 'YOU AGAIN',
 'QUEEN OF THE SUN',
 'THE STRAIGHT PATH']

Building a recommender based on tags i.e diretors and cast

In [111]:
# 
tag_count = CountVectorizer()
tag_matrix = tag_count.fit_transform(content['tags'])

In [112]:
tag_similarity = pd.DataFrame(cosine_similarity(tag_matrix, tag_matrix), index=content['id'].values, columns=content['id'].values)
tag_similarity.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,99,100,101,102,103,104,105,106,107,108
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
# Pickling the similarity dataframe for use in deployment.
tag_similarity.to_pickle("./tag_similarity.pkl")

In [147]:
def tag_reccomender(title):
    title_id = content[content['title'] == title]['id'].values
    similar_id = tag_similarity[title_id].nlargest(6, title_id).index[1:]
    similar_title = content[content['id'].isin(similar_id)]['title'].values.tolist()
    return similar_title

In [148]:
tag_reccomender('BLURRED')

['NAIROBI HALF LIFE',
 'KATUTURA',
 'DISCONNECT',
 'LOVE, ZAWADI',
 'LOVE, ZAWADI x BLURRED']

Using synopsis, genres and tags to make recommendations

In [115]:
soup_count = CountVectorizer()
soup_matrix = soup_count.fit_transform(content['soup'])

In [116]:
soup_similarity = pd.DataFrame(cosine_similarity(soup_matrix, soup_matrix), index=content['id'].values, columns=content['id'].values)
soup_similarity.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,99,100,101,102,103,104,105,106,107,108
1,1.0,0.142134,0.08444,0.113961,0.124838,0.119808,0.120701,0.048751,0.046114,0.086607,...,0.049041,0.046524,0.026624,0.031521,0.082637,0.092319,0.057236,0.063564,0.043863,0.053722
2,0.142134,1.0,0.049507,0.133631,0.09759,0.093659,0.084921,0.028583,0.027037,0.040622,...,0.023002,0.027277,0.03122,0.0,0.072675,0.036084,0.033558,0.037268,0.102869,0.062994
3,0.08444,0.049507,1.0,0.132314,0.072471,0.069552,0.134535,0.0,0.048187,0.108599,...,0.047829,0.0,0.064915,0.021958,0.115135,0.042875,0.039873,0.088561,0.09167,0.037424
4,0.113961,0.133631,0.132314,1.0,0.104328,0.100125,0.060523,0.030557,0.028904,0.086854,...,0.02459,0.0,0.016688,0.019757,0.077693,0.077152,0.035875,0.079682,0.082479,0.067344
5,0.124838,0.09759,0.072471,0.104328,1.0,0.109682,0.082874,0.05021,0.158312,0.107037,...,0.087547,0.015972,0.036561,0.021642,0.070924,0.063387,0.019649,0.065465,0.120468,0.055328


In [138]:
# Pickling the similarity dataframe for use in deployment.
soup_similarity.to_pickle("./soup_similarity.pkl")

In [135]:
def general_reccomender(title):
    title_id = content[content['title'] == title]['id'].values
    similar_id = soup_similarity[title_id].nlargest(6, title_id).index[1:]
    similar_title = content[content['id'].isin(similar_id)]['title'].values.tolist()
    return similar_title

In [136]:
general_reccomender('DISCONNECT')

['JONAROBI',
 'Living in Bondage: Breaking Free',
 'YOU AGAIN',
 'BLURRED',
 'Christmas Spirit']

We can make recommendation based on other content features like director, production company. My Movies Africa don't have this data or in the case of directors, not in a suitable form. Therefore we will use dummmy data to build a model based on these features so that it can be used once the data is available.


Building a recommender based on similar directors

In [119]:
# Creating dummy director names
from faker import Faker
fake = Faker()
fake_directors = []
for director in range(30):
    fake_directors.append(fake.name())
print(fake_directors)

['Sheila Boyd', 'Jasmine Richards', 'Darrell Jones', 'Mary Hamilton', 'Pamela Barnes', 'Abigail Cox', 'Mathew Phillips', 'Benjamin Dominguez', 'Tiffany Cooper', 'Johnny Hawkins', 'Jeremiah Kirk', 'Robin Henderson', 'Eric Schwartz', 'Courtney Sharp', 'Austin Patrick', 'Eric Norman', 'Michael Brewer', 'Jessica Cline', 'Sarah Patton', 'Heather Mcdowell', 'John Leonard', 'Katrina Herrera', 'Mary Kemp', 'Joseph Ruiz', 'Greg Andrade', 'Kelly Wiley', 'Robert Johnson', 'Lisa Graham', 'Rachel King', 'Kaitlyn Simpson']


In [120]:
content['director'] = ''

In [121]:
# Creating a column in content with the directors name
import random
for i in range(len(content) + 1):
    content['director'][i] = random.choice(fake_directors)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [122]:
content.head()

Unnamed: 0,id,title,synopsis,genres,tags,soup,director
0,1,NAIROBI HALF LIFE,awardwinning story young aspiring actor upcoun...,action adventure crime drama suspense,josephwairimu olwenyamaina nancywanjikukaranja...,awardwinning story young aspiring actor upcoun...,Courtney Sharp
1,2,KATUTURA,township windhoek exconvict dangi deal living ...,action crime drama suspense,chopstshoopara obedemvula giftuzera odillemull...,township windhoek exconvict dangi deal living ...,Jasmine Richards
2,3,SOUL BOY,story 14 yearold abila one morning teenager di...,adventure drama family supernatural suspense t...,samsonodhiambo leiladayanopou krysteensavane f...,story 14 yearold abila one morning teenager di...,Sarah Patton
3,4,KATI KATI,young woman kaleche memory life death helped a...,drama supernatural suspense thriller,nyokabigethaiga elsaphannjora paulogola peterk...,young woman kaleche memory life death helped a...,Mary Kemp
4,5,SOMETHING NECESSARY,awardwinning director judy kibinge tells story...,crime drama family political,hildajepkoech kipngenokiruiduncan carolynecheb...,awardwinning director judy kibinge tells story...,Eric Schwartz


In [149]:
def director_recommender(title):
    director = content[content['title'] == title]['director'].values.tolist()
    director_movies = content[content['director'].isin(director)][['title']]
    return director_movies[director_movies['title'] != title]['title'].values.tolist()

In [150]:
director_recommender('DISCONNECT')

['TWENDE BERLIN',
 'THE XYZ SHOW presents THE BEST OF ARUTODE, VOLUME 1',
 'THE XYZ SHOW presents THE BEST OF TINGA, VOLUME 1',
 '6:59',
 'The Cinderella  Effect: The Angela Ellington Story']

Building a recommender based on similar production companies

In [125]:
# Creating fake production companies
fake_companies = ['Tribe Studios', 'Arrowhead', 'Allies Brothers', 'Capital Two', 'Pulse Comics', 'XYZ Nation', 'Fluent',
'Boiiii', 'Jingle Bells', 'Emotional Damage!', 'Funny Entertainment', 'Okay Pictures', 'Good Guys', 'The Truth']

content['production_company'] = ''

for i in range(len(content) + 1):
    content['production_company'][i] = random.choice(fake_companies)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [126]:
content.head()

Unnamed: 0,id,title,synopsis,genres,tags,soup,director,production_company
0,1,NAIROBI HALF LIFE,awardwinning story young aspiring actor upcoun...,action adventure crime drama suspense,josephwairimu olwenyamaina nancywanjikukaranja...,awardwinning story young aspiring actor upcoun...,Courtney Sharp,The Truth
1,2,KATUTURA,township windhoek exconvict dangi deal living ...,action crime drama suspense,chopstshoopara obedemvula giftuzera odillemull...,township windhoek exconvict dangi deal living ...,Jasmine Richards,Okay Pictures
2,3,SOUL BOY,story 14 yearold abila one morning teenager di...,adventure drama family supernatural suspense t...,samsonodhiambo leiladayanopou krysteensavane f...,story 14 yearold abila one morning teenager di...,Sarah Patton,Pulse Comics
3,4,KATI KATI,young woman kaleche memory life death helped a...,drama supernatural suspense thriller,nyokabigethaiga elsaphannjora paulogola peterk...,young woman kaleche memory life death helped a...,Mary Kemp,Tribe Studios
4,5,SOMETHING NECESSARY,awardwinning director judy kibinge tells story...,crime drama family political,hildajepkoech kipngenokiruiduncan carolynecheb...,awardwinning director judy kibinge tells story...,Eric Schwartz,XYZ Nation


In [155]:
def prod_company_recommender(title):
    director = content[content['title'] == title]['production_company'].values.tolist()
    director_movies = content[content['production_company'].isin(director)][['title']]
    return director_movies[director_movies['title'] != title]['title'].values.tolist()

In [157]:
prod_company_recommender('SOUL BOY')

['KIZINGO', 'LONDON FEVER', 'FROM HERE TO TIMBUKTU', 'WHY U HATE']