In [93]:
# Importing the nesessary libraries
import pandas as pd
import sqlalchemy 
# Constructing an engine to work with in python as i query the database
engine=sqlalchemy.create_engine("mysql+pymysql://root:password@localhost:3306/movies") 

In [94]:
# Reading the sql table containing content
df2=pd.read_sql_table("content",engine)
df2.head(3)

Unnamed: 0,id,ref,service,type,title,synopsis,duration,endsat,year,trailer_url,...,pvod_price,pvodusd,premiereDate,rentalKesDiscount,rentalUsdDiscount,estKesDiscount,estUsdDiscount,isDiscountActive,pvodKesDiscount,pvodUsdDiscount
0,1,01af1718424ae08b,mctv,Feature Film,NAIROBI HALF LIFE,"The Award-winning Story of a young, aspiring A...",96,,2012,_Z_51N3PIB4,...,0.0,0.0,,0.0,0.0,67.0,0.0,0,0.0,0.0
1,2,174e049ecf3957a9,mctv,Feature Film,KATUTURA,"In this Township of Windhoek, ex-convict Dangi...",101,,2016,My3tUYzIj2Q,...,,,,0.0,0.0,0.0,0.0,1,0.0,0.0
2,3,ba662ae6e76c60c9,mctv,Feature Film,SOUL BOY,This is the story of 14 year-old Abila (Samson...,61,,2010,2GT4Rgq-cOY,...,0.0,0.0,,0.0,0.0,67.0,0.0,0,0.0,0.0


In [95]:
# Assessing the columns we can use to obtain a similarity matrix
df2.columns

Index(['id', 'ref', 'service', 'type', 'title', 'synopsis', 'duration',
       'endsat', 'year', 'trailer_url', 'rating', 'genres', 'countries',
       'language', 'tags', 'est', 'rental', 'est_price', 'rental_price',
       'estusd', 'rentalusd', 'downloadable', 'dubbing', 'classification',
       'published', 'keyref', 'createdby', 'createdon', 'lasteditedby',
       'lasteditedon', 'pvod', 'pvod_price', 'pvodusd', 'premiereDate',
       'rentalKesDiscount', 'rentalUsdDiscount', 'estKesDiscount',
       'estUsdDiscount', 'isDiscountActive', 'pvodKesDiscount',
       'pvodUsdDiscount'],
      dtype='object')

In [96]:
# Columns illustrating the content in the movies
df2[['title','synopsis','genres','tags','language','countries','type']].head() 

Unnamed: 0,title,synopsis,genres,tags,language,countries,type
0,NAIROBI HALF LIFE,"The Award-winning Story of a young, aspiring A...","[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Joseph Wairimu, Olwenya Maina, Nancy Wanjiku K...","""Swahili,English""","""KE""",Feature Film
1,KATUTURA,"In this Township of Windhoek, ex-convict Dangi...","[""Action"",""Crime"",""Drama"",""Suspense""]","Chops Tshoopara, Obed Emvula, Gift Uzera, Odil...","""English""","""NA""",Feature Film
2,SOUL BOY,This is the story of 14 year-old Abila (Samson...,"[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Samson Odhiambo, Leila Dayan Opou, Krysteen Sa...","""Swahili,English""","""KE""",Feature Film
3,KATI KATI,"A young woman - Kaleche (Nyokabi Gethaiga), wi...","[""Drama"",""Madaraka Day Weekend Movie Marathon ...","Nyokabi Gethaiga, Elsaphan Njora, Paul Ogola, ...","""English,Swahili""","""KE""",Feature Film
4,SOMETHING NECESSARY,"Award-winning Director Judy Kibinge, tells the...","[""Crime"",""Drama"",""Family"",""Madaraka Day Weeken...","Hilda Jepkoech, Kipngeno Kirui Duncan, Carolyn...","""Swahili,English""","""KE""",Feature Film


In [97]:
# Reading the sql table containing content
df3=pd.read_sql_table("contenttypes",engine)
df3.head(3)

Unnamed: 0,id,name,createdon,createdby
0,1,Short Film,2018-02-24 15:20:46,1
1,2,Feature Film,2018-02-24 15:20:46,1
2,3,Series,2018-02-24 15:20:46,1


#### Using Feature Extraction to constuct item vectors from movie content as features 

In [98]:
# Extracting features from the synopsis so that we can compute similarity or disimilarity between them
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel,sigmoid_kernel
# We want to create vectors from the content in the the synopsis
Tfid=TfidfVectorizer(
    strip_accents="unicode",
    lowercase=True,
    stop_words="english",
    min_df=2,
    ngram_range=(1,3))
# We will remove all the empty spaces and fill it with an empty string to avoid error
df2['synopsis']=df2['synopsis'].fillna('')
# Creating a sparse matrix of term frequencies in the synopsis
tfid_mat=Tfid.fit_transform(df2['synopsis'])
tfid_mat
# Creating a similarity matrix using dot product to compute similarity between the vectors created
sim_mat=sigmoid_kernel(tfid_mat,tfid_mat) 

In [99]:
# Now that we have a similarity matrix . We form a pandas series with movies and indices 
movies=pd.Series(df2.index,index=df2['title']).drop_duplicates()
movies.head(10) 

title
NAIROBI HALF LIFE        0
KATUTURA                 1
SOUL BOY                 2
KATI KATI                3
SOMETHING NECESSARY      4
VEVE                     5
THE BODA BODA THIEVES    6
MUSIC IS OUR WEAPON      7
NI SISI                  8
WATATU                   9
dtype: int64

In [100]:
# Creating a function to obtain the 10 most similar movies according to the similarity matrix created
def get_similar_movies(title,matrix):
    index=movies[title]
    # Listing the movie index and the specific matrix matched
    movie_sim_score=list(enumerate(matrix[index]))
    # Sorting it in descending order
    sorted_score=sorted(movie_sim_score,key=lambda x:x[1],reverse=True)
    # Obtaining the top ten scores
    top_ten=sorted_score[1:11]
    # Movie indexes
    movie_indexes= [i[0] for i in top_ten]
    # Obtaining similar movies
    return df2["title"].iloc[movie_indexes]
    

In [101]:
# Calling the function
get_similar_movies('NAIROBI HALF LIFE',sim_mat) 

4       SOMETHING NECESSARY
58               WHY U HATE
5                      VEVE
29                 JONAROBI
31                 WAZI? FM
47    FROM HERE TO TIMBUKTU
33               DISCONNECT
35             LONDON FEVER
9                    WATATU
23                   LUSALA
Name: title, dtype: object

In [102]:
get_similar_movies('LONDON FEVER',sim_mat)

83                UNLOVE ME
79            THE SOUND MAN
0         NAIROBI HALF LIFE
2                  SOUL BOY
29                 JONAROBI
93                   PUAADA
19          CAHIER AFRICAIN
85              THE OUTCAST
47    FROM HERE TO TIMBUKTU
62                      KIU
Name: title, dtype: object

In [103]:
get_similar_movies('WHY U HATE',sim_mat)

29                  JONAROBI
0          NAIROBI HALF LIFE
94     Victims On Lake Volta
81                  REKINDLE
99                  Dog City
45                   NAPUNYI
95    LOVE, ZAWADI x BLURRED
84          QUEEN OF THE SUN
67                   BLURRED
37           TRUTH & TIDINGS
Name: title, dtype: object

### Building a recomender based on the genres and actors present

In [104]:
other_content=df2[['title','synopsis','genres','tags']]
other_content.head() 

Unnamed: 0,title,synopsis,genres,tags
0,NAIROBI HALF LIFE,"The Award-winning Story of a young, aspiring A...","[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Joseph Wairimu, Olwenya Maina, Nancy Wanjiku K..."
1,KATUTURA,"In this Township of Windhoek, ex-convict Dangi...","[""Action"",""Crime"",""Drama"",""Suspense""]","Chops Tshoopara, Obed Emvula, Gift Uzera, Odil..."
2,SOUL BOY,This is the story of 14 year-old Abila (Samson...,"[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Samson Odhiambo, Leila Dayan Opou, Krysteen Sa..."
3,KATI KATI,"A young woman - Kaleche (Nyokabi Gethaiga), wi...","[""Drama"",""Madaraka Day Weekend Movie Marathon ...","Nyokabi Gethaiga, Elsaphan Njora, Paul Ogola, ..."
4,SOMETHING NECESSARY,"Award-winning Director Judy Kibinge, tells the...","[""Crime"",""Drama"",""Family"",""Madaraka Day Weeken...","Hilda Jepkoech, Kipngeno Kirui Duncan, Carolyn..."


In [105]:
other_content=other_content.rename(columns={"title":"Title","synopsis":"Description","genres":"Genres","tags":"Cast"})
other_content.head()

Unnamed: 0,Title,Description,Genres,Cast
0,NAIROBI HALF LIFE,"The Award-winning Story of a young, aspiring A...","[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Joseph Wairimu, Olwenya Maina, Nancy Wanjiku K..."
1,KATUTURA,"In this Township of Windhoek, ex-convict Dangi...","[""Action"",""Crime"",""Drama"",""Suspense""]","Chops Tshoopara, Obed Emvula, Gift Uzera, Odil..."
2,SOUL BOY,This is the story of 14 year-old Abila (Samson...,"[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Samson Odhiambo, Leila Dayan Opou, Krysteen Sa..."
3,KATI KATI,"A young woman - Kaleche (Nyokabi Gethaiga), wi...","[""Drama"",""Madaraka Day Weekend Movie Marathon ...","Nyokabi Gethaiga, Elsaphan Njora, Paul Ogola, ..."
4,SOMETHING NECESSARY,"Award-winning Director Judy Kibinge, tells the...","[""Crime"",""Drama"",""Family"",""Madaraka Day Weeken...","Hilda Jepkoech, Kipngeno Kirui Duncan, Carolyn..."


In [106]:
cols=['Christmas Under Curfew','#OwnForLife Offer: KES199 or $1.99 (Diaspora)','26% Off in our Birthday Month','Madaraka Day Weekend Movie Marathon Offer',
'Throwback Thursday Offer','GET 20% OFF THIS EID-AL-ADHA','Father\u2019s Day Offer','JENGA JIRANI CHARITY FESTIVAL','30% OFF THIS WACKY WEEKEND!',
'30% Diwali Discounts','Family Screen Time','Huduma Day Offer!','Father\\u2019s Day Offer','Internationally Acclaimed','Kalasha Award Winners',
'New Releases']
for i in cols:
   other_content['Genres']=other_content['Genres'].apply(lambda x : x.replace(i,''))

In [107]:
# Replacing null string with a nan
import numpy as np
other_content['Genres']=other_content['Genres'].replace('null',np.NaN)
# Filling the null values in the genre category with uncategorized
other_content['Genres'].fillna("['Test']",axis=0,inplace=True)

In [108]:
# Stripping off string like elements in the  genres column
from ast import literal_eval
# The feature to be applied in
features = ['Genres']
for feature in features:
    other_content[feature] = other_content[feature].apply(literal_eval)

In [109]:
other_content['Genres'] 

0        [, , Action, Adventure, Crime, Drama, Suspense]
1                       [Action, Crime, Drama, Suspense]
2      [, Adventure, , Drama, Family, , , Supernatura...
3            [Drama, , Supernatural, Suspense, Thriller]
4                    [Crime, Drama, Family, , Political]
                             ...                        
96                              [Drama, , Short & Sweet]
97                              [Drama, , Short & Sweet]
98                      [Drama, Family, , Short & Sweet]
99                              [Drama, , Short & Sweet]
100               [Bollywood, Drama, Dramedy, , Romance]
Name: Genres, Length: 101, dtype: object

In [110]:
# Cleaning our genres column and removing the spaces between words
def clean_genres(x):
    if isinstance(x,list):
        return [str.lower(i.replace(' ',''))for i in x]

In [111]:
# The features to be applied in
features=['Genres']
for feature in features:
    other_content[feature] = other_content[feature].apply(clean_genres)

In [112]:
# Creating a function to form the soup.
def create_soup(x):
    return ' '.join(x['Genres'])

In [113]:
other_content['soup'] =other_content.apply(create_soup, axis=1)
# Getting the soup of genres
other_content['soup'].head()

0                action adventure crime drama suspense
1                          action crime drama suspense
2     adventure  drama family   supernatural suspen...
3                drama  supernatural suspense thriller
4                        crime drama family  political
Name: soup, dtype: object

In [114]:
# Using the Count Vectorizer to find out the count of certain genres per movie
from sklearn.feature_extraction.text import CountVectorizer
count=CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(other_content['soup'])
count_matrix.todense()
count_matrix.shape 

(101, 51)

In [115]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim2 = cosine_similarity(count_matrix, count_matrix) 

In [116]:
get_similar_movies('NAIROBI HALF LIFE',cosine_sim2)

1                  KATUTURA
26          COERCED REVENGE
6     THE BODA BODA THIEVES
31                 WAZI? FM
38                 CODE 254
52             TAIFA TUKUFU
2                  SOUL BOY
5                      VEVE
25                   SUBIRA
30                  KIZINGO
Name: title, dtype: object

In [117]:
get_similar_movies('WHY U HATE',cosine_sim2)

19        CAHIER AFRICAIN
26        COERCED REVENGE
1                KATUTURA
4     SOMETHING NECESSARY
10          Short & Sweet
68           LOVE, ZAWADI
69                  BENTA
0       NAIROBI HALF LIFE
31               WAZI? FM
64                   6:59
Name: title, dtype: object

In [118]:
quiver_df=pd.read_excel("Quiver_MyMovies.Africa_180920.xlsx",header=5)
quiver_df.head(2)

Unnamed: 0,Title,Minutes,Genre 1,Genre 2,Synopsis,Talent,Director,Produced,Origin,Language,Internet Download to Own,Transactional VOD,Subscription VOD,Free VOD
0,"305, The",84,Comedy Spoof,History,"Based on the online smash hit, ‘305' is a mock...","Cast:Tim Larson, Brandon Tyra, David Leo Schul...","Daniel Holechek, David M. Holechek",2008,United States of America,English,RESALE,RESALE,Available,Available
1,"4%: Film’s Gender Problem, The",4,Comedy Spoof,Arts,From Alex Gibney’s Jigsaw Productions and feat...,"Anjelica Huston, James Franco, Kristen Wiig, M...",Caroline Suh,2016,United States of America,English,Available,Available,Available,Available


In [119]:
quiver_df.shape

(313, 14)

In [120]:
kenyan_df=pd.read_csv("kenyan_movies.csv").drop(columns={"Unnamed: 0","Year of Release","Watch Time","Movie Rating","Metascore of movie","Votes"}).rename(columns={"Genre":"Genres"})
kenyan_df.head()

Unnamed: 0,Title,Genres,Director,Cast,Description
0,The Constant Gardener,"\nDrama, Mystery, Romance",Fernando Meirelles,"Ralph Fiennes,Rachel Weisz,Danny Huston,Hubert...",A widower is determined to get to the bottom o...
1,Endangered Species,"\nAction, Thriller",M.J. Bassett,"Rebecca Romijn,Philip Winchester,Isabel Basset...","Jack Halsey takes his wife, their adult kids, ..."
2,Samsara,"\nDocumentary, Music",Ron Fricke,"Balinese Tari Legong Dancers,Ni Made Megahadi ...",Filmed over nearly five years in twenty-five c...
3,Uradi,\nThriller,Kang'ethe Mungai,"Mwaura Bilal,Chris Kamau,Shix Kapyenga,Peter Kawa",A university students looks for a way to make ...
4,The Pirates of Somalia,"\nBiography, Drama",Bryan Buckley,"Al Pacino,Evan Peters,Melanie Griffith,Barkhad...","In 2008, rookie journalist Jay Bahadur forms a..."


In [121]:
kenyan_df.shape 

(700, 5)

In [122]:
dataframes=[other_content,kenyan_df]
final_df=pd.concat(dataframes,axis=0,join="outer",ignore_index=True)
final_df.head()

Unnamed: 0,Title,Description,Genres,Cast,soup,Director
0,NAIROBI HALF LIFE,"The Award-winning Story of a young, aspiring A...","[, , action, adventure, crime, drama, suspense]","Joseph Wairimu, Olwenya Maina, Nancy Wanjiku K...",action adventure crime drama suspense,
1,KATUTURA,"In this Township of Windhoek, ex-convict Dangi...","[action, crime, drama, suspense]","Chops Tshoopara, Obed Emvula, Gift Uzera, Odil...",action crime drama suspense,
2,SOUL BOY,This is the story of 14 year-old Abila (Samson...,"[, adventure, , drama, family, , , supernatura...","Samson Odhiambo, Leila Dayan Opou, Krysteen Sa...",adventure drama family supernatural suspen...,
3,KATI KATI,"A young woman - Kaleche (Nyokabi Gethaiga), wi...","[drama, , supernatural, suspense, thriller]","Nyokabi Gethaiga, Elsaphan Njora, Paul Ogola, ...",drama supernatural suspense thriller,
4,SOMETHING NECESSARY,"Award-winning Director Judy Kibinge, tells the...","[crime, drama, family, , political]","Hilda Jepkoech, Kipngeno Kirui Duncan, Carolyn...",crime drama family political,


In [123]:
#Checking for null values
final_df.isnull().any()

Title          False
Description    False
Genres          True
Cast            True
soup            True
Director        True
dtype: bool

In [124]:
final_df.fillna('',inplace=True)
final_df.head()

Unnamed: 0,Title,Description,Genres,Cast,soup,Director
0,NAIROBI HALF LIFE,"The Award-winning Story of a young, aspiring A...","[, , action, adventure, crime, drama, suspense]","Joseph Wairimu, Olwenya Maina, Nancy Wanjiku K...",action adventure crime drama suspense,
1,KATUTURA,"In this Township of Windhoek, ex-convict Dangi...","[action, crime, drama, suspense]","Chops Tshoopara, Obed Emvula, Gift Uzera, Odil...",action crime drama suspense,
2,SOUL BOY,This is the story of 14 year-old Abila (Samson...,"[, adventure, , drama, family, , , supernatura...","Samson Odhiambo, Leila Dayan Opou, Krysteen Sa...",adventure drama family supernatural suspen...,
3,KATI KATI,"A young woman - Kaleche (Nyokabi Gethaiga), wi...","[drama, , supernatural, suspense, thriller]","Nyokabi Gethaiga, Elsaphan Njora, Paul Ogola, ...",drama supernatural suspense thriller,
4,SOMETHING NECESSARY,"Award-winning Director Judy Kibinge, tells the...","[crime, drama, family, , political]","Hilda Jepkoech, Kipngeno Kirui Duncan, Carolyn...",crime drama family political,


In [125]:
final_df.isnull().any()

Title          False
Description    False
Genres         False
Cast           False
soup           False
Director       False
dtype: bool

In [126]:
final_df.shape

(801, 6)

In [127]:
# Creating a soup of the important features
def get_extra_soup(data):
    soup=[]
    for i in range(0,final_df.shape[0]):
        soup.append(data['Description'][i]+' '+data['soup'][i]+' '+data['Cast'][i]+' '+data['Director'][i])
    return soup

In [128]:
final_df['extra_soup']=get_extra_soup(final_df) 
final_df['extra_soup']

0      The Award-winning Story of a young, aspiring A...
1      In this Township of Windhoek, ex-convict Dangi...
2      This is the story of 14 year-old Abila (Samson...
3      A young woman - Kaleche (Nyokabi Gethaiga), wi...
4      Award-winning Director Judy Kibinge, tells the...
                             ...                        
796    Add a Plot  Patricia Kihoro,Willy Mwangi,Emma ...
797    An outcast settles in a village far away from ...
798    Rift Valley, Kenya. Three women from the same ...
799    A rejected film student, who knows only the mo...
800    'Love, Zawadi' highlights how the enforced loc...
Name: extra_soup, Length: 801, dtype: object

In [129]:
# Extracting features from the synopsis so that we can compute similarity or disimilarity between them
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel,sigmoid_kernel
# We want to create vectors from the content in the the synopsis
Tfid_final=TfidfVectorizer(
    strip_accents="unicode",
    lowercase=True,
    stop_words="english",
    min_df=2,
    ngram_range=(1,3))
# We will remove all the empty spaces and fill it with an empty string to avoid error
# df2['synopsis']=df2['synopsis'].fillna('')
# Creating a sparse matrix of term frequencies in the synopsis
tfid_mat_final=Tfid_final.fit_transform(final_df['extra_soup'])
tfid_mat_final
# Creating a similarity matrix using dot product to compute similarity between the vectors created
sim_mat_final=sigmoid_kernel(tfid_mat_final,tfid_mat_final) 