In [535]:
# Importing the nesessary libraries
import pandas as pd
import sqlalchemy 
# Constructing an engine to work with in python as i query the database
engine=sqlalchemy.create_engine("mysql+pymysql://root:password@localhost:3306/movies") 

In [536]:
# Reading the sql table containing content
df2=pd.read_sql_table("content",engine)
df2.head(3)

Unnamed: 0,id,ref,service,type,title,synopsis,duration,endsat,year,trailer_url,...,pvod_price,pvodusd,premiereDate,rentalKesDiscount,rentalUsdDiscount,estKesDiscount,estUsdDiscount,isDiscountActive,pvodKesDiscount,pvodUsdDiscount
0,1,01af1718424ae08b,mctv,Feature Film,NAIROBI HALF LIFE,"The Award-winning Story of a young, aspiring A...",96,,2012,_Z_51N3PIB4,...,0.0,0.0,,0.0,0.0,67.0,0.0,0,0.0,0.0
1,2,174e049ecf3957a9,mctv,Feature Film,KATUTURA,"In this Township of Windhoek, ex-convict Dangi...",101,,2016,My3tUYzIj2Q,...,,,,0.0,0.0,0.0,0.0,1,0.0,0.0
2,3,ba662ae6e76c60c9,mctv,Feature Film,SOUL BOY,This is the story of 14 year-old Abila (Samson...,61,,2010,2GT4Rgq-cOY,...,0.0,0.0,,0.0,0.0,67.0,0.0,0,0.0,0.0


In [537]:
# Assessing the columns we can use to obtain a similarity matrix
df2.columns

Index(['id', 'ref', 'service', 'type', 'title', 'synopsis', 'duration',
       'endsat', 'year', 'trailer_url', 'rating', 'genres', 'countries',
       'language', 'tags', 'est', 'rental', 'est_price', 'rental_price',
       'estusd', 'rentalusd', 'downloadable', 'dubbing', 'classification',
       'published', 'keyref', 'createdby', 'createdon', 'lasteditedby',
       'lasteditedon', 'pvod', 'pvod_price', 'pvodusd', 'premiereDate',
       'rentalKesDiscount', 'rentalUsdDiscount', 'estKesDiscount',
       'estUsdDiscount', 'isDiscountActive', 'pvodKesDiscount',
       'pvodUsdDiscount'],
      dtype='object')

In [538]:
# Columns illustrating the content in the movies
df2[['title','synopsis','genres','tags','language','countries','type']].head() 

Unnamed: 0,title,synopsis,genres,tags,language,countries,type
0,NAIROBI HALF LIFE,"The Award-winning Story of a young, aspiring A...","[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Joseph Wairimu, Olwenya Maina, Nancy Wanjiku K...","""Swahili,English""","""KE""",Feature Film
1,KATUTURA,"In this Township of Windhoek, ex-convict Dangi...","[""Action"",""Crime"",""Drama"",""Suspense""]","Chops Tshoopara, Obed Emvula, Gift Uzera, Odil...","""English""","""NA""",Feature Film
2,SOUL BOY,This is the story of 14 year-old Abila (Samson...,"[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Samson Odhiambo, Leila Dayan Opou, Krysteen Sa...","""Swahili,English""","""KE""",Feature Film
3,KATI KATI,"A young woman - Kaleche (Nyokabi Gethaiga), wi...","[""Drama"",""Madaraka Day Weekend Movie Marathon ...","Nyokabi Gethaiga, Elsaphan Njora, Paul Ogola, ...","""English,Swahili""","""KE""",Feature Film
4,SOMETHING NECESSARY,"Award-winning Director Judy Kibinge, tells the...","[""Crime"",""Drama"",""Family"",""Madaraka Day Weeken...","Hilda Jepkoech, Kipngeno Kirui Duncan, Carolyn...","""Swahili,English""","""KE""",Feature Film


In [539]:
# Reading the sql table containing content
df3=pd.read_sql_table("contenttypes",engine)
df3.head(3)

Unnamed: 0,id,name,createdon,createdby
0,1,Short Film,2018-02-24 15:20:46,1
1,2,Feature Film,2018-02-24 15:20:46,1
2,3,Series,2018-02-24 15:20:46,1


#### Using Feature Extraction to constuct item vectors from movie content as features 

In [540]:
# Extracting features from the synopsis so that we can compute similarity or disimilarity between them
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel,sigmoid_kernel
# We want to create vectors from the content in the the synopsis
Tfid=TfidfVectorizer(
    strip_accents="unicode",
    lowercase=True,
    stop_words="english",
    min_df=2,
    ngram_range=(1,3))
# We will remove all the empty spaces and fill it with an empty string to avoid error
df2['synopsis']=df2['synopsis'].fillna('')
# Creating a sparse matrix of term frequencies in the synopsis
tfid_mat=Tfid.fit_transform(df2['synopsis'])
tfid_mat
# Creating a similarity matrix using dot product to compute similarity between the vectors created
sim_mat=sigmoid_kernel(tfid_mat,tfid_mat) 

In [541]:
# Now that we have a similarity matrix . We form a pandas series with movies and indices 
movies=pd.Series(df2.index,index=df2['title']).drop_duplicates()
movies.head(10) 

title
NAIROBI HALF LIFE        0
KATUTURA                 1
SOUL BOY                 2
KATI KATI                3
SOMETHING NECESSARY      4
VEVE                     5
THE BODA BODA THIEVES    6
MUSIC IS OUR WEAPON      7
NI SISI                  8
WATATU                   9
dtype: int64

In [542]:
# Creating a function to obtain the 10 most similar movies according to the similarity matrix created
def get_similar_movies(title,matrix):
    index=movies[title]
    # Listing the movie index and the specific matrix matched
    movie_sim_score=list(enumerate(matrix[index]))
    # Sorting it in descending order
    sorted_score=sorted(movie_sim_score,key=lambda x:x[1],reverse=True)
    # Obtaining the top ten scores
    top_ten=sorted_score[1:11]
    # Movie indexes
    movie_indexes= [i[0] for i in top_ten]
    # Obtaining similar movies
    return df2["title"].iloc[movie_indexes]
    

In [543]:
# Calling the function
get_similar_movies('NAIROBI HALF LIFE',sim_mat)

4       SOMETHING NECESSARY
58               WHY U HATE
5                      VEVE
29                 JONAROBI
31                 WAZI? FM
47    FROM HERE TO TIMBUKTU
33               DISCONNECT
35             LONDON FEVER
9                    WATATU
23                   LUSALA
Name: title, dtype: object

In [544]:
get_similar_movies('LONDON FEVER',sim_mat)

83                UNLOVE ME
79            THE SOUND MAN
0         NAIROBI HALF LIFE
2                  SOUL BOY
29                 JONAROBI
93                   PUAADA
19          CAHIER AFRICAIN
85              THE OUTCAST
47    FROM HERE TO TIMBUKTU
62                      KIU
Name: title, dtype: object

In [545]:
get_similar_movies('WHY U HATE',sim_mat)

29                  JONAROBI
0          NAIROBI HALF LIFE
94     Victims On Lake Volta
81                  REKINDLE
99                  Dog City
45                   NAPUNYI
95    LOVE, ZAWADI x BLURRED
84          QUEEN OF THE SUN
67                   BLURRED
37           TRUTH & TIDINGS
Name: title, dtype: object

### Building a recomender based on the genres and actors present

In [546]:
other_content=df2[['title','synopsis','genres','tags','language','countries','type']]
other_content.head() 

Unnamed: 0,title,synopsis,genres,tags,language,countries,type
0,NAIROBI HALF LIFE,"The Award-winning Story of a young, aspiring A...","[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Joseph Wairimu, Olwenya Maina, Nancy Wanjiku K...","""Swahili,English""","""KE""",Feature Film
1,KATUTURA,"In this Township of Windhoek, ex-convict Dangi...","[""Action"",""Crime"",""Drama"",""Suspense""]","Chops Tshoopara, Obed Emvula, Gift Uzera, Odil...","""English""","""NA""",Feature Film
2,SOUL BOY,This is the story of 14 year-old Abila (Samson...,"[""#OwnForLife Offer: KES199 or $1.99 (Diaspora...","Samson Odhiambo, Leila Dayan Opou, Krysteen Sa...","""Swahili,English""","""KE""",Feature Film
3,KATI KATI,"A young woman - Kaleche (Nyokabi Gethaiga), wi...","[""Drama"",""Madaraka Day Weekend Movie Marathon ...","Nyokabi Gethaiga, Elsaphan Njora, Paul Ogola, ...","""English,Swahili""","""KE""",Feature Film
4,SOMETHING NECESSARY,"Award-winning Director Judy Kibinge, tells the...","[""Crime"",""Drama"",""Family"",""Madaraka Day Weeken...","Hilda Jepkoech, Kipngeno Kirui Duncan, Carolyn...","""Swahili,English""","""KE""",Feature Film


In [547]:
# Cleaning the datasets
def removing(df,col,text1,text2,text3,text4,text5,text6,text7,text8,text9):
    df[col]=df[col].apply(lambda x : x.replace(text1,''))
    df[col]=df[col].apply(lambda x : x.replace(text2,''))
    df[col]=df[col].apply(lambda x : x.replace(text3,''))
    df[col]=df[col].apply(lambda x : x.replace(text4,''))
    df[col]=df[col].apply(lambda x : x.replace(text5,''))
    df[col]=df[col].apply(lambda x : x.replace(text6,''))
    df[col]=df[col].apply(lambda x : x.replace(text7,''))
    df[col]=df[col].apply(lambda x : x.replace(text8,''))
    df[col]=df[col].apply(lambda x : x.replace(text9,''))
    return df[col]

In [548]:
# Cleaning of the data
removing(other_content,
'genres',
'#OwnForLife Offer: KES199 or $1.99 (Diaspora)',
'26% Off in our Birthday Month',
'Madaraka Day Weekend Movie Marathon Offer',
'Throwback Thursday Offer',
'GET 20% OFF THIS EID-AL-ADHA',
'Father\u2019s Day Offer',
'JENGA JIRANI CHARITY FESTIVAL',
'30% OFF THIS WACKY WEEKEND!',
'30% Diwali Discounts')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x : x.replace(text1,''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x : x.replace(text2,''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x : x.replace(text3,''))
A value is trying to be set on a copy of a 

0      ["","","Action","Adventure","Crime","Drama","S...
1                  ["Action","Crime","Drama","Suspense"]
2      ["","Adventure","Christmas Under Curfew","Dram...
3      ["Drama","","Supernatural","Suspense","Thriller"]
4              ["Crime","Drama","Family","","Political"]
                             ...                        
96              ["Drama","New Releases","Short & Sweet"]
97              ["Drama","New Releases","Short & Sweet"]
98     ["Drama","Family","New Releases","Short & Sweet"]
99              ["Drama","New Releases","Short & Sweet"]
100    ["Bollywood","Drama","Dramedy","New Releases",...
Name: genres, Length: 101, dtype: object

In [549]:
# Replacing null string with a nan
import numpy as np
other_content['genres']=other_content['genres'].replace('null',np.NaN)
# Filling the null values in the genre category with uncategorized
other_content['genres'].fillna("['Test']",axis=0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_content['genres']=other_content['genres'].replace('null',np.NaN)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [550]:
# Stripping off string like elements in the  genres column
from ast import literal_eval
# The feature to be applied in
features = ['genres']
for feature in features:
    other_content[feature] = other_content[feature].apply(literal_eval)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_content[feature] = other_content[feature].apply(literal_eval)


In [553]:
# Cleaning our genres column and removing the spaces between words
def clean_genres(x):
    if isinstance(x,list):
        return [str.lower(i.replace(' ',''))for i in x]

In [554]:
# The features to be applied in
features=['genres','tags']
for feature in features:
    other_content[feature] = other_content[feature].apply(clean_genres)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_content[feature] = other_content[feature].apply(clean_genres)


In [555]:
# Creating a function to form the soup.
def create_soup(x):
    return ' '.join(x['genres'])

In [556]:
other_content['soup'] =other_content.apply(create_soup, axis=1)
# Getting the soup of genres
other_content['soup'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_content['soup'] =other_content.apply(create_soup, axis=1)


In [558]:
# Using the Count Vectorizer to find out the count of certain genres per movie
from sklearn.feature_extraction.text import CountVectorizer
count=CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(other_content['soup'])
count_matrix.todense()

matrix([[0, 1, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [559]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [560]:
get_similar_movies('NAIROBI HALF LIFE',cosine_sim2)

1                  KATUTURA
6     THE BODA BODA THIEVES
26          COERCED REVENGE
38                 CODE 254
52             TAIFA TUKUFU
5                      VEVE
25                   SUBIRA
31                 WAZI? FM
19          CAHIER AFRICAIN
20               Joe Bullet
Name: title, dtype: object

In [561]:
get_similar_movies('WHY U HATE',cosine_sim2)

31               WAZI? FM
19        CAHIER AFRICAIN
1                KATUTURA
4     SOMETHING NECESSARY
10          Short & Sweet
30                KIZINGO
0       NAIROBI HALF LIFE
26        COERCED REVENGE
29               JONAROBI
68           LOVE, ZAWADI
Name: title, dtype: object