# Recommendation System

In [2]:
!pip install pandas-gbq



In [3]:
# Importing libraries

import pandas as pd
import numpy as np
from google.oauth2 import service_account

from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.metrics.pairwise import linear_kernel

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Reading dataset
# BigQuery table and project used by default for every query in this notebook
TABLE = 'jedha_recommendation.user_view_videos_enriched_v2'
PROJECT_ID = 'salto-datalab-pid2'


def query_db(query, table=TABLE, project_id=PROJECT_ID):
    """Run a standard-SQL BigQuery query and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text. Every ``{table}`` placeholder is substituted with ``table``
        through ``str.format`` before the query is sent.
    table : str, optional
        Fully qualified table name injected into the query (defaults to ``TABLE``).
    project_id : str, optional
        Google Cloud project passed to ``read_gbq`` (defaults to ``PROJECT_ID``).

    Returns
    -------
    pandas.DataFrame
        The rows returned by BigQuery.
    """
    # `str.format` is used for the substitution, so any literal brace in the
    # SQL text would have to be doubled ({{ }}).
    sql = query.format(table=table)
    return pd.read_gbq(sql, dialect="standard", project_id=project_id)

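The assumptions used to determine whether a program is liked by a profile:
* The program was watched for over an hour (this criterion is currently commented out in the query below),
* More than 75% of the program's total duration was watched,
* More than the equivalent of 4 average-length episodes of the program was watched.

If at least one of the active criteria is met, the program is considered to be liked.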

Here are the details of the important fields:
* count_profile_like: number of profiles who liked the program,
* count_profile_not_like: number of profiles who did not like the program (the profile started to watch the program, but did not fulfill any of the criteria),
* count_profile_view: number of profiles having watched the program (excluding profiles whose total viewing time on the program is 120 seconds or less),
* ratio_liked: percentage of viewing profiles who liked the program (count_profile_like / count_profile_view × 100)

In [5]:
# SQL template producing one row of aggregated viewing statistics per program.
# `{table}` is a str.format placeholder that query_db() fills in with the table name.
#
# Structure of the query (innermost to outermost):
#   * per-video subquery: MAX(video_duration) and AVG(age) for each (program_id, video_id),
#     restricted to the formats 'preview', 'svod', 'freevod' and 'replay';
#   * `program` subquery: episode count, mean age and summed duration per program;
#   * `stats` subquery: one row per (profile, program) with the total seconds viewed and
#     the `count_viewed` flag, which is 1 when the profile watched more than 75% of the
#     program's total duration OR more than 4 average-length episodes' worth of it,
#     else 0. Rows with 120 seconds or less of total viewing are discarded;
#   * outer query: per-program counts by profile type / gender / liked flag and sums of
#     viewing time; only programs with at least 50 viewing profiles are kept, ordered
#     by the number of profiles that liked them.
#
# NOTE(review): the in-query comment mentions a "more than 1 hour" rule, but that WHEN
# clause is commented out, so it does not contribute to `count_viewed`.
DATASET = """
SELECT 
    program.program_id,
    program.title,
    tag_genre, 
    tag_sous_genre,
    ROUND(mean_age, 0) as mean_age,
    count_episodes,
    ROUND(MAX(program_total_duration) / count_episodes, 2) as mean_episode_duration,
    
    SUM(CASE WHEN profile_type = 'DEFAULT_ADULT' OR profile_type = 'STANDARD_ADULT' THEN 1 ELSE 0 END) as count_profile_adult,
    SUM(CASE WHEN profile_type = 'DEFAULT_KID' OR profile_type = 'STANDARD_KID' THEN 1 ELSE 0 END) as count_profile_kid,
    SUM(CASE WHEN profile_type = 'DEFAULT_HOME' THEN 1 ELSE 0 END) as count_profile_home,
    
    SUM(CASE WHEN gender = 'f' THEN 1 ELSE 0 END) as count_profile_f,
    SUM(CASE WHEN gender = 'm' THEN 1 ELSE 0 END) as count_profile_m,
    SUM(CASE WHEN count_viewed = 1 THEN 1 ELSE 0 END) as count_profile_like,
    SUM(CASE WHEN count_viewed = 0 THEN 1 ELSE 0 END) as count_profile_not_like,
    COUNT(count_viewed) as count_profile_view,
    ROUND(MAX(program_total_duration), 2) as program_seconds_duration, 
    ROUND(SUM(total_seconds_viewed), 2) as total_seconds_viewed, 
    ROUND(SUM(CASE WHEN count_viewed = 1 THEN total_seconds_viewed ELSE 0 END), 2) as seconds_viewed_liked,
    ROUND(SUM(CASE WHEN count_viewed = 0 THEN total_seconds_viewed ELSE 0 END), 2) as seconds_viewed_not_like,
    program_description

FROM (
    # Generating 'count_viewed' (= 1 if viewed for more than 1 hour or 3/4 of program duration)
    # Seconds view of each profile on a program (keeping only seconds viewed > 120s.)
    SELECT 
        pid,
        gender,
        profile_type,
        uv.program_id,
        AVG(count_episodes) as count_episodes,
        mean_age,
        SUM(seconds_viewed) as total_seconds_viewed, 
        MIN(program.program_total_duration) as program_total_duration,
        CASE 
            # IF viewed more than 1 hour
            # WHEN SUM(seconds_viewed) > 3600
            # IF viewed 75% of program length
            WHEN SUM(seconds_viewed) / MIN(program.program_total_duration) > 0.75
            # IF viewed more than 4 episodes
            OR SUM(seconds_viewed) > 4 * (MIN(program.program_total_duration) / AVG(count_episodes))
            THEN 1 
            ELSE 0 
        END AS count_viewed
    FROM {table} uv
    LEFT JOIN (
        # Duration of a program
        SELECT 
            program_id,
            COUNT(video_id) as count_episodes,
            AVG(mean_age) as mean_age,
            SUM(duration) as program_total_duration
        FROM (
            # Duration of each video
            SELECT 
                program_id,
                video_id,
                AVG(age) as mean_age,
                MAX(video_duration) as duration
            FROM {table}
            WHERE video_format IN ('preview', 'svod', 'freevod', 'replay')
            GROUP BY program_id, video_id
        )
        GROUP BY program_id
        ORDER BY program_total_duration desc
    ) program ON program.program_id  = uv.program_id
    WHERE video_format IN ('preview', 'svod', 'freevod', 'replay')
    GROUP BY pid, program_id, mean_age, gender, profile_type
    HAVING total_seconds_viewed > 120
) stats
LEFT JOIN (
    SELECT 
        program_id, 
        MIN(program_title) as title,
        tag_genre, 
        tag_sous_genre, 
        program_description
    FROM {table}
    GROUP BY program_id, tag_genre, tag_sous_genre, program_description
) program ON program.program_id = stats.program_id
GROUP BY program.program_id, program.title, tag_genre, tag_sous_genre, mean_age, count_episodes, program_description
HAVING count_profile_view >= 50
ORDER BY count_profile_like desc
"""
# Execute the aggregation query against BigQuery
dataset = query_db(DATASET)

# Share of viewing profiles that liked the program, as a percentage with 2 decimals
liked_share = dataset['count_profile_like'] / dataset['count_profile_view']
dataset['ratio_liked'] = (liked_share * 100).round(2)

# Preview the two programs with the highest like ratio
dataset.sort_values(by='ratio_liked', ascending=False).head(2)

Unnamed: 0,program_id,title,tag_genre,tag_sous_genre,mean_age,count_episodes,mean_episode_duration,count_profile_adult,count_profile_kid,count_profile_home,...,count_profile_m,count_profile_like,count_profile_not_like,count_profile_view,program_seconds_duration,total_seconds_viewed,seconds_viewed_liked,seconds_viewed_not_like,program_description,ratio_liked
563,48070,Monsieur Bout-de-Bois,jeunesse,3-5-ans,34.0,1.0,1640.0,182,15,61,...,42,227,33,260,1640.0,561547.0,541920.0,19627.0,Monsieur Bout-de-Bois vit heureux avec sa femm...,87.31
347,47490,Meurtres au pays des Maures,telefilms,"drame,policier",49.0,1.0,5428.0,535,0,7,...,157,471,72,543,5428.0,2983475.0,2852360.0,131115.0,"Au coeur du massif de La Sainte Victoire, théâ...",86.74


In [6]:
# Persisting the aggregated per-program dataset to disk, without the row index
dataset.to_csv(path_or_buf='dataset.csv', index=False)

The NLP model will be based on the 3 following features: 
* program_id 
* title 
* program_description

In [7]:
# Restricting the frame to the columns used by the NLP model
selected_columns = ['program_id', 'title', 'program_description']
dataset = dataset[selected_columns]
dataset.head()

Unnamed: 0,program_id,title,program_description
0,50753,Les Marseillais,"Pour cette nouvelle saison, la grande famille ..."
1,50636,La Villa des Cœurs Brisés,"Depuis 5 saisons déjà, des cœurs brisés s’envo..."
2,47981,Ici tout commence,L’institut Auguste Armand est l’une des meille...
3,46079,Clem,Clem se réveille après avoir passé 6 ans dans ...
4,49623,Demain nous appartient,"La série ""Demain nous appartient"" se déroule à..."


In [8]:
# Removing programs that share a title (the first occurrence is kept), then
# renumbering the rows 0..n-1 so that index labels equal row positions.
# The calls are chained instead of using `inplace=True`, so `dataset` is rebound
# to a fresh frame rather than mutated after the fact.
dataset = dataset.drop_duplicates(subset=['title']).reset_index(drop=True)
dataset.head()

Unnamed: 0,program_id,title,program_description
0,50753,Les Marseillais,"Pour cette nouvelle saison, la grande famille ..."
1,50636,La Villa des Cœurs Brisés,"Depuis 5 saisons déjà, des cœurs brisés s’envo..."
2,47981,Ici tout commence,L’institut Auguste Armand est l’une des meille...
3,46079,Clem,Clem se réveille après avoir passé 6 ans dans ...
4,49623,Demain nous appartient,"La série ""Demain nous appartient"" se déroule à..."


In [9]:
# French stop words provided by NLTK
StopWords = stopwords.words('french')

# Lower-cased descriptions; missing descriptions become empty strings
dataset['cleaned_description'] = dataset['program_description'].str.lower().fillna('')

# TF-IDF vectorizer that ignores the French stop words
tfidf = TfidfVectorizer(stop_words=StopWords)

# Learning the vocabulary and building the TF-IDF matrix (one row per program)
tfidf_matrix = tfidf.fit_transform(dataset['cleaned_description'])

# (number of programs, vocabulary size)
tfidf_matrix.shape

(1500, 12895)

From the above output, we observe that the 1500 programs in our dataset are described by a vocabulary of 12895 distinct words.

In [10]:
# Array mapping from feature integer indices to feature name.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; fall back to the old name on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# A small slice of the vocabulary, shown as a plain list
list(feature_names[3000:3010])

['couple',
 'couples',
 'coups',
 'coupé',
 'coupée',
 'coupées',
 'coupés',
 'cour',
 'courage',
 'courageuse']



With this matrix in hand, we can now compute a similarity score.

We will be using the cosine similarity to calculate a numeric quantity that denotes the similarity between two movies. 

Since the TF-IDF vectorizer L2-normalizes each document vector by default (`norm='l2'`), every vector has unit length, so the dot product between two vectors directly gives their cosine similarity score.

In [11]:
# Pairwise dot products between every pair of program TF-IDF vectors
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

We will define a function that takes in a movie title as an input and outputs a list of the 10 most similar movies. 

Firstly, for this, we need a reverse mapping of movie titles and DataFrame indices.

In [12]:
# Lookup table: program title -> row index in `dataset`
indices = pd.Series(data=dataset.index, index=dataset['title'])

Now we will define our recommendation function. These are the following steps we'll follow:

* Getting the index of the program given its title.

* Getting the list of cosine similarity scores for that particular program with all programs. Converting it into a list of tuples where the first element is its position, and the second is the similarity score.

* Sorting this list of tuples based on the similarity scores; that is, the second element.

* Getting the top 10 elements of this list. We will ignore the first element as it refers to self (the program most similar to a particular movie is the program itself).

* Returning the titles corresponding to the indices of the top elements.

In [13]:
# Function that takes in a program title as input and outputs most similar programs

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the programs most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the reference program; it is looked up in ``indices``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column positions follow the rows
        of ``dataset``. The default is bound when this function is defined, so
        pass the matrix explicitly (or re-run this cell) after recomputing it.
    top_n : int, optional
        Number of recommendations to return (10 by default).

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar programs, most similar first,
        indexed by their row position in ``dataset``.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.

    Side effects
    ------------
    Prints the selected ``(index, score)`` pairs and writes the recommended
    titles to ``reccomendations.csv`` (existing file name kept unchanged).
    """
    # Getting the index of the program that matches the title
    idx = indices[title]

    # Pairing every other program's index with its similarity to the requested
    # program. The program itself is excluded by index: relying on it being
    # ranked first breaks when another program ties with it (e.g. identical
    # description) or when its own description is empty (self-similarity 0).
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sorting by similarity score in descending order and keeping the best top_n
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[:top_n]

    # Getting the program indices
    program_indices = [position for position, _ in sim_scores]

    # Returning the most similar programs
    print(sim_scores)
    Recommendations = dataset['title'].iloc[program_indices]
    Recommendations.to_csv('reccomendations.csv', index = False)
    return Recommendations

## Getting Recommendations

In [14]:
# Titles available as input to get_recommendations
dataset.loc[:, 'title']

0                                         Les Marseillais
1                               La Villa des Cœurs Brisés
2                                       Ici tout commence
3                                                    Clem
4                                  Demain nous appartient
                              ...                        
1495    Conférence de presse du premier ministre Jean ...
1496                             Des racines et des ailes
1497                                  13h15, le samedi...
1498                                          C politique
1499                            Décollage pour l'Amérique
Name: title, Length: 1500, dtype: object

In [15]:
# Programs most similar to "Les Marseillais"
get_recommendations(title='Les Marseillais')

[(30, 0.9999999999999999), (311, 0.19646775947582687), (259, 0.15313054411560545), (1081, 0.14494680052576261), (745, 0.1438644225146043), (822, 0.10404071136426045), (310, 0.08474851810811188), (408, 0.08266542785895836), (50, 0.08126033643617471), (51, 0.07854813078189367)]


30                   Les Marseillais à Dubaï
311                           Manon + Julien
259     Les Marseillais vs le Reste du monde
1081                   Le zapping des fratés
745                       Tattoo Confessions
822                               En Famille
310           JLC Family : Un nouveau départ
408                  La Bataille des Couples
50                       Tropiques criminels
51                             Pékin Express
Name: title, dtype: object

In each printed tuple, the first number is the row index of the recommended program and the second is its similarity score with the requested program.

In [16]:
# Programs most similar to "Décollage pour l'Amérique"
get_recommendations(title='Décollage pour l\'Amérique')

[(1413, 0.1641237699639023), (1116, 0.11037424141107538), (370, 0.10952301010267688), (1428, 0.10034272450817727), (1264, 0.100251409789067), (1355, 0.09811406746046435), (107, 0.09639761142285809), (799, 0.08521583408838361), (1424, 0.07928081740195136), (1073, 0.07476300956260717)]


1413    1941-1945 : au cœur de la guerre du pacifique
1116                            Le salaire de la peur
370                                          Kon-Tiki
1428        L'histoire du film d'horreur par Eli Roth
1264            Frank Sinatra : la voix de l'Amérique
1355           Dans le sillage des grands navigateurs
107                                             Fargo
799       Chirurgie à tout prix : l'espoir à domicile
1424                                       The movies
1073                                  Rosemary's baby
Name: title, dtype: object

In [17]:
# Programs most similar to "Ici tout commence"
get_recommendations(title='Ici tout commence')

[(1195, 0.11572165153547943), (1010, 0.0965119006267714), (1308, 0.09276980258598755), (601, 0.08729368796594686), (148, 0.07931105381584251), (614, 0.07654265573851121), (978, 0.07638671525671498), (1290, 0.07638671525671498), (1131, 0.0720303113577377), (4, 0.07125426713641853)]


1195                                     Doctor Who
1010                       La sirène du Mississippi
1308                                Cuisine ouverte
601                               La grande lessive
148                                           Chefs
614                                         WAGS LA
978                 ASKIP, le collège se la raconte
1290              ASKIP, le collège se la raconte !
1131    Cuisine libanaise : chiche ou pois chiche ?
4                            Demain nous appartient
Name: title, dtype: object