# Flow - Basiline models
Models for [Datathon2021 - Recomendador](https://github.com/Datathon2021/Recomendador)

In [3]:
#### import json

import pickle
import datetime as dt
from itertools import chain
from collections import Counter

import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

np.random.seed(16)
np.set_printoptions(precision=2)
pd.set_option('display.max_columns', None)

pd.set_option('display.precision', 2)

In [4]:
%run Functions.ipynb

In [5]:
# Varibles
is_test = False
clean = False
sample_data = None # 100_000

if is_test:
    filter_date = dt.datetime(2021, 3, 10, 0, 0, 0).date()
else:
    filter_date = dt.datetime(2021, 4, 1, 0, 0, 0).date()

categories_list = ['accion', 'animacion', 'animales', 'aventura', 'belico', 'biografia', 'ciencia',
                   'ciencia ficcion', 'cocina', 'comedia', 'competencia', 'crimen', 'cultura', 'deporte',
                   'dibujos animados', 'documental', 'drama', 'entretenimiento', 'entrevistas', 'espectaculo',
                   'familia', 'fantasia', 'historia', 'humor', 'infantil', 'interes general', 'investigacion',
                   'magazine', 'moda', 'musica', 'naturaleza', 'periodistico', 'policial', 'politico', 'reality',
                   'religion', 'restauracion', 'romance', 'suspenso', 'teatro', 'terror', 'viajes', 'western'
                  ]

In [6]:
# if is_test:
#     df, df_test = create_dfs(sample_data=sample_data, clean=clean)  # to test
# else:
#     df, _ = create_dfs(ret_test=False, clean=clean)  # to submit
# df.tail(3)

df, _ = create_dfs(ret_test=False, clean=clean)  # to submit
df.tail(3)

Unnamed: 0,customer_id,account_id,device_type,asset_id,tunein,tuneout,resume,content_id,released_year,description,cast_first_name,credits_first_name,audience,made_for_tv,pay_per_view,pack_premium_1,pack_premium_2,create_date,modify_date,start_vod_date,end_vod_date,run_time_min,show_type,country_of_origin,accion,animacion,animales,aventura,belico,biografia,ciencia,ciencia ficcion,cocina,comedia,competencia,crimen,cultura,deporte,dibujos animados,documental,drama,entretenimiento,entrevistas,espectaculo,familia,fantasia,historia,humor,infantil,interes general,investigacion,magazine,moda,musica,naturaleza,periodistico,policial,politico,reality,religion,restauracion,romance,suspenso,teatro,terror,viajes,western,title,keywords
3657798,112339,3386,STB,29929.0,2021-03-31 22:34:00,2021-03-31 22:46:00,0,8.0,2017.0,"Mickey, Minnie, Donald, Daisy, Goofy y Pluto r...",unknow-actors,unknow-director,Preescolar,0,0,0,0,2021-03-28 20:53:39+00:00,2021-03-28 21:06:28+00:00,2021-03-25 00:00:00+00:00,2021-04-24 23:59:00+00:00,24.0,TV,US,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,mickey: aventuras sobre ruedas,"aventuras,autos,deportistas"
3657799,112339,3386,STB,29929.0,2021-03-31 23:09:00,2021-03-31 23:13:00,1,8.0,2017.0,"Mickey, Minnie, Donald, Daisy, Goofy y Pluto r...",unknow-actors,unknow-director,Preescolar,0,0,0,0,2021-03-28 20:53:39+00:00,2021-03-28 21:06:28+00:00,2021-03-25 00:00:00+00:00,2021-04-24 23:59:00+00:00,24.0,TV,US,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,mickey: aventuras sobre ruedas,"aventuras,autos,deportistas"
3657800,112339,3386,STB,29929.0,2021-03-31 23:13:00,2021-03-31 23:24:00,0,8.0,2017.0,"Mickey, Minnie, Donald, Daisy, Goofy y Pluto r...",unknow-actors,unknow-director,Preescolar,0,0,0,0,2021-03-28 20:53:39+00:00,2021-03-28 21:06:28+00:00,2021-03-25 00:00:00+00:00,2021-04-24 23:59:00+00:00,24.0,TV,US,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,mickey: aventuras sobre ruedas,"aventuras,autos,deportistas"


In [7]:
df.shape

(3657801, 69)

In [8]:
df.account_id.nunique()

113881

### 1) Baseline Datathon model -
- Model proposed in the competition.  \
Steps:
    - Find most popular content (in terms of the number of different profiles who viewed them at least once) and return it for each profile.

In [20]:
# Create list with most viewed content orderes in descending order

most_popular = df.groupby(['content_id'])['account_id'].agg(lambda x: x.nunique())
most_popular = most_popular.sort_values(ascending=False).index.values

most_popular = pd.Series(most_popular, name='content_id')

most_popular.iloc[:5]  # top 5 content (five most seen)

0    2040.0
1    3806.0
2    3900.0
3    2942.0
4    3598.0
Name: content_id, dtype: float64

In [21]:
# Select in order, top 20 assets filtering the assest that the user has already seen

submit = df.groupby(['account_id']
                   )['content_id'].agg(lambda xs: most_popular[~most_popular.isin(xs)].values.tolist()[:20])

submit.iloc[:5]  # top 5 user with recomendations

account_id
0    [2040.0, 3806.0, 3900.0, 2942.0, 3598.0, 3381....
1    [2040.0, 3806.0, 3900.0, 2942.0, 3598.0, 3381....
2    [2040.0, 3806.0, 3900.0, 2942.0, 3598.0, 3381....
3    [2040.0, 3806.0, 2942.0, 3598.0, 3381.0, 2160....
4    [2040.0, 3806.0, 3900.0, 2942.0, 3598.0, 3381....
Name: content_id, dtype: object

In [22]:
submit.shape

(103622,)

In [23]:
if is_test:
    print('map', mean_avg_precision(df_test, submit))
else:
    write_submit(submit)

map 0.028149991700341413


In [28]:
# with open('models_results/Naive_model_Datathon.pk', 'wb') as fp:
#     pickle.dump(submit, fp)

### 2) Most popular content by category.
- Most popular content in category that the user viewed the most\
Steps:
    - Find the first ten categories that each user most saw
    - Find content most viewed in each category
    - Return top 20 content for 1st, 2nd, 3rd, ... category, if do not reach 20, complete with random content

In [9]:
# Count the content seen in each category by user

df_account_seen = df.drop_duplicates(subset=['asset_id', 'account_id'])\
                    .groupby(['account_id'])[categories_list].sum()

df_account_seen.tail(3)

Unnamed: 0_level_0,accion,animacion,animales,aventura,belico,biografia,ciencia,ciencia ficcion,cocina,comedia,competencia,crimen,cultura,deporte,dibujos animados,documental,drama,entretenimiento,entrevistas,espectaculo,familia,fantasia,historia,humor,infantil,interes general,investigacion,magazine,moda,musica,naturaleza,periodistico,policial,politico,reality,religion,restauracion,romance,suspenso,teatro,terror,viajes,western
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
113878,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
113879,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
113880,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
df_account_seen.shape

(113881, 43)

In [11]:
# Find "n_categories" categories most viewed by user

n_categories = 4
n_cat_most_seen_by_user = df_account_seen.apply(lambda s: s[s>0].nlargest(n_categories).index.tolist(), axis=1)

n_cat_most_seen_by_user

account_id
0                [comedia, accion, animacion, drama]
1                   [accion, drama, cocina, reality]
2               [drama, romance, aventura, fantasia]
3          [accion, drama, comedia, ciencia ficcion]
4               [comedia, infantil, drama, suspenso]
                             ...                    
113876                                       [drama]
113877                                       [drama]
113878    [biografia, documental, drama, naturaleza]
113879                            [comedia, familia]
113880                    [comedia, drama, fantasia]
Length: 113881, dtype: object

In [12]:
# Find content most viewed by category by differents profiles and will keep available after "filter_date"

top_content_by_cat_diff_user = {}

# We create a dictionary with list of top 10 content by category
for category in categories_list:

    df_content = df[(df[category] != 0) & (df['end_vod_date'].dt.date > filter_date)]
    contents = df_content.drop_duplicates(subset=['asset_id', 'account_id'])["content_id"].value_counts()

    # Select content ordered by "most seen by differents profiles"
    top_content_by_cat_diff_user[category] = contents.index.astype(int).values[:4]


# Create pandas series with all content
all_content = df[(df['end_vod_date'].dt.date > filter_date)].content_id
all_content = np.array([x[0] for x in Counter(df.content_id.values).most_common()])

In [14]:
# top_content_by_cat_diff_user

In [15]:
# Create recomendartion list by each user base in their categories and top content by category

recomendations = n_cat_most_seen_by_user.map(lambda xs: 
    np.fromiter(chain(*[ top_content_by_cat_diff_user[x] for x in xs ]), dtype=int)
)

recomendations.name = 'recomendations'
recomendations.iloc[:5]

account_id
0    [2040, 1800, 774, 2299, 2012, 2942, 1462, 1573...
1    [2012, 2942, 1462, 1573, 2040, 4133, 2012, 390...
2    [2040, 4133, 2012, 3900, 4133, 3353, 774, 2299...
3    [2012, 2942, 1462, 1573, 2040, 4133, 2012, 390...
4    [2040, 1800, 774, 2299, 2160, 1139, 2178, 20, ...
Name: recomendations, dtype: object

In [16]:
# Find the content already seen by each profile

content_seen = df.groupby(['account_id'])['content_id'].agg(lambda x: list(set(x)))
content_seen = content_seen.map(np.array)

content_seen.name = "content_seen"

content_seen.tail(3)

account_id
113878            [2892.0, 4343.0]
113879                    [1800.0]
113880    [2183.0, 3810.0, 3663.0]
Name: content_seen, dtype: object

In [17]:
# Create a df to filter content_seen from recomendations

df_content_recommended = pd.merge(recomendations, content_seen, how="left", left_index=True, right_index=True)
df_content_recommended.tail(4)

Unnamed: 0_level_0,recomendations,content_seen
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1
113877,"[2040, 4133, 2012, 3900]",[2091.0]
113878,"[3382, 3433, 3863, 3057, 185, 3681, 1979, 184,...","[2892.0, 4343.0]"
113879,"[2040, 1800, 774, 2299, 1800, 3847, 2017, 3722]",[1800.0]
113880,"[2040, 1800, 774, 2299, 2040, 4133, 2012, 3900...","[2183.0, 3810.0, 3663.0]"


In [33]:
# Create submit removing content saw in each user and select top 20
submit = df_content_recommended.apply(lambda row: row['recomendations'][~np.in1d(row['recomendations'],
                                                                                 row['content_seen'])][:20],
                            axis=1)

In [34]:

# Complete with random or top content if recomendations are less than 20
submit = submit.map(lambda xs:
           xs if len(xs) == 20 
           else np.concatenate([xs, all_content])[:20]
)

submit.iloc[:5]

account_id
0    [2040.0, 1800.0, 774.0, 2299.0, 2012.0, 2942.0...
1    [2012.0, 2942.0, 1462.0, 1573.0, 2040.0, 4133....
2    [2040.0, 4133.0, 2012.0, 3900.0, 4133.0, 3353....
3    [2012.0, 2942.0, 1462.0, 1573.0, 2040.0, 4133....
4    [2040.0, 1800.0, 774.0, 2299.0, 2160.0, 20.0, ...
dtype: object

In [32]:
if is_test:
    print('map', mean_avg_precision(df_test, submit, check_diff=False))
else:
    write_submit(submit, check_diff=False)

map 0.03530930631893934


### 3) Most popular content from closer users.
- Most popular content viewed by 10 closer user\
Steps:
  - Find most popular content per user
  - Find the n most viewed categories by each user
  - Find closer users with KNN
  - Select content
    - 50% from non-seen categories
    - 50% from their principal categories

In [20]:
df.isnull().sum().any()

False

In [21]:
n_closer_users = 10

In [22]:
# Find most viewed content per user that will keep available after "filter_date"

top_content_by_user = df.groupby('account_id')['content_id'].agg(lambda xs: [x[0] for x in Counter(xs).most_common()])


top_content_by_user_keep = df[(df['end_vod_date'].dt.date > filter_date)
                             ].groupby('account_id')['content_id'].agg(lambda xs: [x[0] for x in Counter(xs).most_common()])

# Complete user with "[]"
top_content_by_user_keep = top_content_by_user_keep.append(
    top_content_by_user[~top_content_by_user.index.isin(top_content_by_user_keep.index)].map(lambda xs: [])
)


top_content_by_user.iloc[:5]

account_id
0             [3438.0, 2866.0, 3498.0, 1503.0, 3845.0]
1                             [1020.0, 1220.0, 1761.0]
2    [183.0, 6.0, 1099.0, 557.0, 1582.0, 1443.0, 43...
3    [3790.0, 3769.0, 3206.0, 2344.0, 3900.0, 1463....
4    [2178.0, 3910.0, 1008.0, 2231.0, 4338.0, 3037....
Name: content_id, dtype: object

In [23]:
# Find the "n_categories" most viewed categories by each user

n_categories = 10
n_cat_most_seen_by_user = df.groupby('account_id')[categories_list
                                                  ].sum().apply(lambda s: s[s>0].nlargest(n_categories).index.tolist(), axis=1)

n_cat_most_seen_by_user.iloc[:5]

account_id
0    [comedia, accion, animacion, drama, infantil, ...
1                     [accion, drama, cocina, reality]
2    [drama, romance, aventura, fantasia, accion, c...
3    [accion, comedia, drama, ciencia ficcion, aven...
4    [comedia, infantil, drama, suspenso, accion, r...
dtype: object

#### Find closer users with KNN


In [24]:
df.columns

Index(['customer_id', 'account_id', 'device_type', 'asset_id', 'tunein',
       'tuneout', 'resume', 'content_id', 'released_year', 'description',
       'cast_first_name', 'credits_first_name', 'audience', 'made_for_tv',
       'pay_per_view', 'pack_premium_1', 'pack_premium_2', 'create_date',
       'modify_date', 'start_vod_date', 'end_vod_date', 'run_time_min',
       'show_type', 'country_of_origin', 'accion', 'animacion', 'animales',
       'aventura', 'belico', 'biografia', 'ciencia', 'ciencia ficcion',
       'cocina', 'comedia', 'competencia', 'crimen', 'cultura', 'deporte',
       'dibujos animados', 'documental', 'drama', 'entretenimiento',
       'entrevistas', 'espectaculo', 'familia', 'fantasia', 'historia',
       'humor', 'infantil', 'interes general', 'investigacion', 'magazine',
       'moda', 'musica', 'naturaleza', 'periodistico', 'policial', 'politico',
       'reality', 'religion', 'restauracion', 'romance', 'suspenso', 'teatro',
       'terror', 'viajes', 'wester

In [25]:
df.drop(columns=['title', 'keywords', 'description', 'cast_first_name', 'credits_first_name', 'resume',
                               'end_vod_date', 'tunein', 'asset_id', 'customer_id', 'content_id']+['tuneout','create_date','modify_date', 'start_vod_date']).head(2)

Unnamed: 0,account_id,device_type,released_year,audience,made_for_tv,pay_per_view,pack_premium_1,pack_premium_2,run_time_min,show_type,country_of_origin,accion,animacion,animales,aventura,belico,biografia,ciencia,ciencia ficcion,cocina,comedia,competencia,crimen,cultura,deporte,dibujos animados,documental,drama,entretenimiento,entrevistas,espectaculo,familia,fantasia,historia,humor,infantil,interes general,investigacion,magazine,moda,musica,naturaleza,periodistico,policial,politico,reality,religion,restauracion,romance,suspenso,teatro,terror,viajes,western
0,90627,STATIONARY,2020.0,Mujeres,0,0,1,0,53.0,Serie,US,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,90627,STATIONARY,2020.0,Mujeres,0,0,1,0,53.0,Serie,US,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
df.loc[: , 'min_watching'] = (df.tuneout - df.tunein).dt.seconds / 60

In [27]:
# Creamos un perfil de usuarios

df_user_profile = pd.get_dummies(df, columns=['device_type', 'audience', 'show_type', 'country_of_origin'])

df_user_profile.tunein = df_user_profile.tunein.dt.hour

df_user_profile.drop(columns=['title', 'keywords', 'description', 'cast_first_name', 'credits_first_name', 'resume',
                           'end_vod_date', 'tunein', 'asset_id', 'customer_id', 'content_id'] + ['tuneout','create_date','modify_date', 'start_vod_date'],
                     inplace=True)

# creamos un array con las funciones de agregacion para cada columna

agg_functs = {col: 'sum' for col in df_user_profile.drop(columns='account_id').columns}
agg_functs.update({
        'min_watching':  ['mean', lambda xs: xs.std() if len(xs) > 2 else 0],
        'released_year': ['mean',  lambda xs: xs.std() if len(xs) > 2 else 0],
        'run_time_min': ['mean',  lambda xs: xs.std() if len(xs) > 2 else 0],
#     'tunein_hour': ['mean', lambda xs: xs.std() if len(xs) > 2 else 0],
#         'ranking': ['mean',  lambda xs: xs.std() if len(xs) > 2 else 0]
})

# Creamos un dataframe de perfil de usuario
df_user_profile = df_user_profile.groupby('account_id').agg(agg_functs)

# Join multi level columns
df_user_profile.columns = df_user_profile.columns.map('{0[0]}_{0[1]}'.format)

df_user_profile.head()

Unnamed: 0_level_0,released_year_mean,released_year_<lambda_0>,made_for_tv_sum,pay_per_view_sum,pack_premium_1_sum,pack_premium_2_sum,run_time_min_mean,run_time_min_<lambda_0>,accion_sum,animacion_sum,animales_sum,aventura_sum,belico_sum,biografia_sum,ciencia_sum,ciencia ficcion_sum,cocina_sum,comedia_sum,competencia_sum,crimen_sum,cultura_sum,deporte_sum,dibujos animados_sum,documental_sum,drama_sum,entretenimiento_sum,entrevistas_sum,espectaculo_sum,familia_sum,fantasia_sum,historia_sum,humor_sum,infantil_sum,interes general_sum,investigacion_sum,magazine_sum,moda_sum,musica_sum,naturaleza_sum,periodistico_sum,policial_sum,politico_sum,reality_sum,religion_sum,restauracion_sum,romance_sum,suspenso_sum,teatro_sum,terror_sum,viajes_sum,western_sum,min_watching_mean,min_watching_<lambda_0>,device_type_CLOUD_CLIENT_sum,device_type_PHONE_sum,device_type_STATIONARY_sum,device_type_STB_sum,device_type_TABLET_sum,audience_Familiar_sum,audience_Gaming_sum,audience_General_sum,audience_Hombres_sum,audience_Juvenil_sum,audience_Mujeres_sum,audience_NIños_sum,audience_Niños_sum,audience_Preescolar_sum,audience_Teens_sum,show_type_Gaming_sum,show_type_Película_sum,show_type_Rolling_sum,show_type_Serie_sum,show_type_TV_sum,show_type_Web_sum,country_of_origin_AR_sum,country_of_origin_AT_sum,country_of_origin_AU_sum,country_of_origin_BE_sum,country_of_origin_BG_sum,country_of_origin_BR_sum,country_of_origin_CA_sum,country_of_origin_CF_sum,country_of_origin_CH_sum,country_of_origin_CL_sum,country_of_origin_CN_sum,country_of_origin_CO_sum,country_of_origin_CZ_sum,country_of_origin_DE_sum,country_of_origin_DK_sum,country_of_origin_DO_sum,country_of_origin_EE_sum,country_of_origin_ES_sum,country_of_origin_FI_sum,country_of_origin_FM_sum,country_of_origin_FR_sum,country_of_origin_Francia_sum,country_of_origin_GB_sum,country_of_origin_HK_sum,country_of_origin_HU_sum,country_of_origin_IE_sum,country_of_origin_IL_sum,country_of_origin_IN_sum,country_of_origin_IR_sum,country_of_origin_IS_sum,country_of_origin_IT_sum,country_of_origin_JP_sum,country_of_origin_KR_sum,country_of_origin_MN_sum,country_of_origin_MU_sum,country_of_origin_MX_sum,country_of_origin_MY_sum,country_of_origin_NL_sum,country_of_origin_NO_sum,country_of_origin_NR_sum,country_of_origin_NZ_sum,country_of_origin_PE_sum,country_of_origin_PH_sum,country_of_origin_PL_sum,country_of_origin_PY_sum,country_of_origin_RS_sum,country_of_origin_RU_sum,country_of_origin_SE_sum,country_of_origin_SY_sum,country_of_origin_TR_sum,country_of_origin_UK_sum,country_of_origin_US_sum,country_of_origin_USA_sum,country_of_origin_UY_sum,country_of_origin_VE_sum,country_of_origin_ZA_sum
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1
0,2012.0,6.96,0,1,1,0,86.4,30.66,1,1,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,67.2,41.32,1.0,0.0,0.0,4.0,0.0,1.0,0,4.0,0,0,0.0,0,0.0,0.0,0.0,0,4.0,0,0.0,1.0,0,0.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,5.0,0,0,0,0
1,2007.67,20.5,0,0,1,1,97.67,42.44,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,38.33,41.63,2.0,1.0,0.0,0.0,0.0,1.0,0,2.0,0,0,0.0,0,0.0,0.0,0.0,0,1.0,1,0.0,1.0,0,1.0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0.0,0,0,1,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,1.0,0,0,0,0
2,2018.52,3.69,0,0,1,4,60.96,18.42,2,1,0,6,0,1,0,0,0,0,0,2,0,0,0,0,15,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,14,0,0,0,0,0,52.0,45.37,0.0,0.0,7.0,1.0,15.0,0.0,0,18.0,0,4,0.0,0,0.0,0.0,1.0,0,3.0,0,18.0,2.0,0,1.0,0,0,0,0,14.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,4.0,0,0,0,0,0,0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,3.0,0,0,0,0
3,2012.84,7.39,0,0,29,4,101.74,27.44,28,0,0,10,0,0,0,15,0,21,0,8,0,0,0,1,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,0,5,5,0,1,0,0,82.21,45.25,70.0,0.0,0.0,0.0,0.0,0.0,0,50.0,9,8,3.0,0,0.0,0.0,0.0,0,60.0,0,9.0,1.0,0,11.0,0,1,1,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0.0,0,0,3.0,0,6.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0,46.0,0,0,0,0
4,2009.53,9.28,23,0,13,6,72.0,39.62,9,4,0,2,0,0,0,2,0,32,0,2,0,0,3,1,24,0,0,0,0,0,0,0,29,0,0,0,0,1,0,0,0,1,2,0,0,5,13,0,3,0,0,19.56,19.95,0.0,3.0,6.0,66.0,0.0,4.0,0,36.0,9,0,0.0,0,2.0,0.0,24.0,0,44.0,0,5.0,26.0,0,10.0,0,0,0,0,0.0,0,0,0,0,5,0,0,1,0,0,0,3.0,0,0,0.0,0,3.0,2,0,0,0,0,0,0,0.0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,49.0,0,0,0,0


In [28]:
# Scale data and train KNN

scaler = StandardScaler()
X = scaler.fit_transform(df_user_profile.values)

pca = PCA(n_components=23)
X = pca.fit_transform(X)

# p=1: "manhattan_distance", p=1: "euclidean_distance"
knn = NearestNeighbors(n_neighbors=n_closer_users, p=2, n_jobs=-1)

knn.fit(X)

NearestNeighbors(n_jobs=-1, n_neighbors=10)

#### Select content functions

In [29]:
def get_neighbors(model, account_id, X_users, k=None):
    """ Return closer users index from model"""
    neighbors = knn.kneighbors([X_users.loc[account_id].values], n_neighbors=k,
                               return_distance=False)
    closer_users = X_users.iloc[neighbors[0], :].index
    return closer_users


def make_recomendation(model, user, X_users, user_content_keep, user_content, user_cat, n=20, k=None):
    """
    For a given user it returns a content recommendation that the closest users saw and he did not
    
    :param X_users: matrix with user profiles vectors
    :param users_content: content most seen by user
    :param user_cat: categories most seen by user
    :param n: amount recomendartions gets
    :param k: amount neighbors user use
    """
    # Get closer users
    closer_users = get_neighbors(model, user, X_users, k=k)

    # Get content seen by closer users
    recomendation = list(chain(*user_content_keep[closer_users].values))

    # remove content that user already seen and select top 20
    recomendation = np.array(recomendation)
    recomendation = recomendation[~np.in1d(recomendation, user_content[user])][:n]
    return  recomendation


def create_submit(model, X_users, user_content_keep, user_content, user_cat):
    """
    For a given user it returns a content recommendation that the closest users saw and he did not
    
    :param X_users: matrix with user profiles vectors
    :param user_content_keep: content most seen by user that will keep available
    :param users_content: content most seen by user
    :param user_cat: categories most seen by user
    """
    
    global is_test

    # Get closer users
#     print('Getting neighbors...')
    closer_users = model.kneighbors(X_users, return_distance=False)
    closer_users = list(map(lambda xs: X_users.iloc[xs].index.values, closer_users))
    
    # Get content seen by closer users
#     print('Mapping neighbors to content...')
    recomendation = list(map(lambda xs: list(chain(*user_content_keep[xs].values)),
                             closer_users))

    # Delete repeating content 
#     print('Deleting repeating content...')
    recomendation = list(map(lambda xs: [x[0] for x in Counter(xs).most_common()], recomendation))

    # remove content that user already seen and select top 20
    recomendation = list(map(np.array, recomendation))
    
#     print('Clean content already seen...')
    submit = list(map(lambda xs, user: xs[~np.in1d(xs, user_content[user])][:20],
                      recomendation, X_users.index))
    
#     print('Creating submit...')
    submit = pd.Series(dict(zip(X_users.index.values, submit)))
    
    if not is_test:
        # complete with random content if no reach 20 items
#         print('Completing with random content...')
        all_content = np.array(list(set(chain(*user_content_keep.values))))
    else:
        # complete with top content if no reach 20 items
#         print('Completing with top content...')
        all_content = np.array([x[0] for x in Counter(chain(*user_content_keep.values)).most_common()])

    submit = submit.map(lambda xs:
               xs if len(xs) == 20 
               else np.concatenate([xs, all_content[~np.in1d(all_content, xs)]])[:20]
    )
    
#     print('Done!\n\n')
    return submit

---

#### Tests functions

In [133]:
account_id = 109680

In [139]:
print('categories:\n', n_cat_most_seen_by_user[account_id])
print('content:\n', top_content_by_user[account_id])
print('Content most viewed:\n', df[df.content_id.isin(top_content_by_user[account_id])].title.unique())

categories:
 ['drama', 'romance', 'biografia', 'comedia', 'dibujos animados', 'documental', 'infantil', 'reality']
content:
 [718.0, 464.0, 3321.0, 171.0, 876.0, 580.0]
Content most viewed:
 ['gravity falls: un verano de misterios' 'diario de una pasion'
 'yo antes de ti' 'diana: detras de la pantalla' 'diario de un seductor'
 'dance moms']


In [130]:
closer_users = get_neighbors(knn, account_id, df_user_profile, 10)
n_cat_most_seen_by_user[closer_users]

account_id
92022                        [biografia, documental, drama]
59666                 [infantil, comedia, dibujos animados]
26196           [crimen, drama, dibujos animados, infantil]
109680    [drama, romance, biografia, comedia, dibujos a...
24233                             [comedia, drama, romance]
107470                            [drama, romance, comedia]
70661                 [romance, comedia, drama, documental]
21558                                       [crimen, drama]
107822                            [comedia, drama, romance]
65408                 [infantil, comedia, dibujos animados]
dtype: object

In [131]:
user_recomendations = make_recomendation(knn, account_id, df_user_profile, 
                                         top_content_by_user_keep, top_content_by_user, n_cat_most_seen_by_user)
user_recomendations

array([3650., 3122., 1139., 3739., 3091.,  268.,  100.,  269., 2782.,
       2972.,  176., 3806.,  533., 3176., 2323., 2234., 3955., 2356.,
        268., 1847.])

In [132]:
df[df.content_id.isin(user_recomendations)]['title'].unique()

array(['igualita a mi', 'bluey', 'moises', 'a roma con amor',
       'pongamos que hablo de', 'cosa de minas', 'el lobo de wall street',
       'viudas', 'post mortem', 'eso que tu me das: pau dones',
       'me case con un boludo', 'las aventuras de kid danger',
       'philadelphia', 'el padrino', 'el padrino 2', 'gone', 'pecezuelos',
       'tenias que ser tu', 'memorias de una geisha'], dtype=object)

---

#### submit

In [32]:
%%time

X = pd.DataFrame(X, index=df_user_profile.index.values)

submit_2 = create_submit(knn, X,
                       top_content_by_user_keep, top_content_by_user, n_cat_most_seen_by_user)

submit_2.iloc[:5]

CPU times: user 10min 55s, sys: 2.34 s, total: 10min 57s
Wall time: 2min 23s


0    [3035.0, 2104.0, 2810.0, 4233.0, 4223.0, 2848....
1    [4249.0, 2383.0, 546.0, 4362.0, 2405.0, 116.0,...
2    [4133.0, 729.0, 1539.0, 2020.0, 3048.0, 1462.0...
3    [2953.0, 3210.0, 2442.0, 2222.0, 1958.0, 1167....
4    [3710.0, 3707.0, 3709.0, 518.0, 3744.0, 36.0, ...
dtype: object

In [60]:
submit.name = "by_category"
submit_2.name = "by_users"
recomendations = pd.merge(submit, submit_2, left_index=True, right_index=True).apply(lambda row: np.array(list(row['by_category'][:10]) + list(row['by_users'][:10])), axis=1)
recomendations

account_id
0         [2040.0, 1800.0, 774.0, 2299.0, 2012.0, 2942.0...
1         [2012.0, 2942.0, 1462.0, 1573.0, 2040.0, 4133....
2         [2040.0, 4133.0, 2012.0, 3900.0, 4133.0, 3353....
3         [2012.0, 2942.0, 1462.0, 1573.0, 2040.0, 4133....
4         [2040.0, 1800.0, 774.0, 2299.0, 2160.0, 20.0, ...
                                ...                        
113876    [2040.0, 4133.0, 2012.0, 3900.0, 2040.0, 2160....
113877    [2040.0, 4133.0, 2012.0, 3900.0, 2040.0, 2160....
113878    [3382.0, 3433.0, 3863.0, 3057.0, 185.0, 3681.0...
113879    [2040.0, 774.0, 2299.0, 3847.0, 2017.0, 3722.0...
113880    [2040.0, 1800.0, 774.0, 2299.0, 2040.0, 4133.0...
Length: 113881, dtype: object

In [61]:
recomendations.map(len)

account_id
0         20
1         20
2         20
3         20
4         20
          ..
113876    20
113877    20
113878    20
113879    20
113880    20
Length: 113881, dtype: int64

In [62]:
recomendations = recomendations.map(lambda xs: [x[0] for x in Counter(xs).most_common()][:20])

In [67]:
n_add = 20 - recomendations.map(len).min()

In [68]:
recomendations = recomendations.map(lambda xs: 
                                    xs if len(xs) == 20 
                                    else np.concatenate([xs, all_content[:n_add]])[:20]
    )

In [69]:
recomendations[recomendations.map(len) != 20]

Series([], dtype: object)

In [71]:
if is_test:
    print('map', mean_avg_precision(df_test, submit))
else:
    write_submit(recomendations, file='recomendations.csv', check_diff=False)

#### Optimization models