### Scrapped Set

In [8]:
# Importing the nesessary libraries
import pandas as pd
import numpy as np
import sqlalchemy
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import linear_kernel,sigmoid_kernel,cosine_similarity
from ast import literal_eval

In [9]:
# Loading the Kenyan Data Set
kenyan_df=pd.read_csv("kenyan_movies.csv").drop(columns={"Unnamed: 0","Year of Release","Watch Time","Movie Rating","Metascore of movie","Votes"})
kenyan_df.head()
# Loading the Ugandan Dataset
ugandan_df=pd.read_csv("ugandan_movies.csv").drop(columns={"Unnamed: 0","Year of Release","Watch Time","Movie Rating","Metascore of movie","Votes"})
ugandan_df.head()
# Tanzanian Dataset
tanzanian_df=pd.read_csv("tanzanian_movies.csv").drop(columns={"Unnamed: 0","Year of Release","Watch Time","Movie Rating","Metascore of movie","Votes"})
tanzanian_df.head()
# Concatenating the set on rows
dataframes=[kenyan_df,ugandan_df,tanzanian_df]
final_df=pd.concat(dataframes,axis=0,join="outer",ignore_index=True)
# Filling the null values with an empty string
final_df.fillna('',inplace=True)
final_df.isnull().any()
# Creating a soup of the important features
def get_extra_soup(data):
    soup=[]
    for i in range(0,final_df.shape[0]):
        soup.append(data['Description'][i]+' '+data['Genre'][i]+' '+data['Cast'][i]) 
    return soup
#Creating soup with description genre and cast members
final_df['extra_soup']=get_extra_soup(final_df) 
final_df['extra_soup']  
# Cleaning the extra soup
cols=['\n','(',')','-',"'",',','[','.','?']
for item in cols:
    final_df['extra_soup']=final_df['extra_soup'].apply(lambda x : x.replace(item,''))
    final_df['Genre']=final_df['Genre'].apply(lambda x : x.replace(item,''))

### Synopsis and Cased based recomender


In [10]:
def scrapped_set_recommender(title):
    Tfid=TfidfVectorizer(
    strip_accents="unicode",
    lowercase=True,
    stop_words="english",
    min_df=2,
    ngram_range=(1,3))
    # Creating a sparse matrix of term frequencies in the synopsis
    tfid_mat=Tfid.fit_transform(final_df['extra_soup'])
    tfid_mat
    # Creating a similarity matrix using dot product to compute similarity between the vectors created
    scrapped_matrix=sigmoid_kernel(tfid_mat,tfid_mat) 
    # Now that we have a similarity matrix . We form a pandas series with movies and indices 
    movies_final=pd.Series(final_df.index,index=final_df['Title']).drop_duplicates()
    movies_final
    # Mapping indexes to the movies
    index=movies_final[title]
    # Listing the movie index and the specific matrix matched
    movie_sim_score=list(enumerate(scrapped_matrix[index]))
    # Sorting it in descending order
    sorted_score=sorted(movie_sim_score,key=lambda x:x[1],reverse=True)
    # Obtaining the top ten scores
    top_ten=sorted_score[1:11]
    # Movie indexes
    movie_indexes= [i[0] for i in top_ten]
    similar_movies=final_df[["Title","Genre","Cast"]].iloc[movie_indexes]
    # Obtaining similar movies
    return similar_movies 

In [11]:
scrapped_set_recommender("Disconnect")

Unnamed: 0,Title,Genre,Cast
43,Plan B,Comedy Romance,"Sarah Hassan,Catherine Kamau,Daniel Etim Effio..."
155,The Captain of Nakara,Comedy Romance,"Bernard Safari,Shirleen Wangari,Charles Kiarie..."
44,Sincerely Daisy,Comedy Drama Romance,"Ellah Maina,Sam Psenjen,Mbeki Mwalimu,Brian Ab..."
572,Windswayer,Short Comedy,"Josette Adams,Pamela Barstow,Jason Eckert,Mish..."
458,Bufis,Comedy,Vincenzo Cavallo
543,The Little World of Father Baláa,Comedy,Tobias Schmutzler
282,A Guide to Dining Out in Nairobi,Short Comedy,"Yafesi Musoke,Felix Erasto,Martha Kago,John Ka..."
640,Face Off Stand Up Comedy,Short Comedy,
690,I just missed you,Short Comedy,
31,Khel,Comedy Drama Romance,"Aparajita,Prem Chopra,Madhuri Dixit,Vijayendra..."


In [12]:
def director_based_recomender(title):
    Tfid=TfidfVectorizer(
    strip_accents="unicode",
    lowercase=True,
    stop_words="english",
    min_df=2,
    ngram_range=(1,3))
    # Creating a sparse matrix of term frequencies in the synopsis
    tfid_mat=Tfid.fit_transform(final_df['Director'])
    tfid_mat
    # Creating a similarity matrix using dot product to compute similarity between the vectors created
    scrapped_matrix=sigmoid_kernel(tfid_mat,tfid_mat) 
    # Now that we have a similarity matrix . We form a pandas series with movies and indices 
    movies_final=pd.Series(final_df.index,index=final_df['Title']).drop_duplicates()
    movies_final
    # Mapping indexes to the movies
    index=movies_final[title]
    # Listing the movie index and the specific matrix matched
    movie_sim_score=list(enumerate(scrapped_matrix[index]))
    # Sorting it in descending order
    sorted_score=sorted(movie_sim_score,key=lambda x:x[1],reverse=True)
    # Obtaining the top ten scores
    top_ten=sorted_score[1:11]
    # Movie indexes
    movie_indexes= [i[0] for i in top_ten]
    similar_movies=final_df[["Title","Director"]].iloc[movie_indexes]
    # Obtaining similar movies
    return similar_movies 

In [13]:
director_based_recomender("Nairobi Half Life")

Unnamed: 0,Title,Director
20,Disconnect,David 'Tosh' Gitonga
78,Kifaru,David Hambridge
142,The Sea Turtles of Lamu,D. David Morin
281,Climate Exodus,David Baute
429,Hearing Colour,David Varga
438,Pirátské síte,David Calek
603,Slum Survivors,David Gough
132,My Africa,David Allen
665,The Dawn Will Break,David Alexander
166,The Wedding Camels,David MacDougall
