In [9]:
import pandas as pd
import numpy as np

In [10]:
user_data = pd.read_csv('user_data.csv')
ads_data = pd.read_csv('ads_data.csv')
user_messages = pd.read_csv('user_messages.csv')

In [11]:
# convert to datetime fields to standard datetime format
import datetime as dt
user_data['event_time'] = pd.to_datetime(user_data['event_time']).map(dt.datetime.toordinal)
ads_data['creation_time'] = pd.to_datetime(ads_data['creation_time']).map(dt.datetime.toordinal)


In [12]:
# label encoding of categorical fields
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(user_data['event'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
#print(le_name_mapping) 
user_data['event'] = le.fit(user_data['event']).transform(user_data['event'])

In [13]:
le = preprocessing.LabelEncoder()
le.fit(user_data['channel'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
#print(le_name_mapping)
user_data['channel'] = le.fit(user_data['channel']).transform(user_data['channel'])

In [14]:
combined = pd.merge(user_data, ads_data, on='ad_id')

In [15]:
# generating output label
user_ad_dict = {}
for index, row in user_messages.iterrows():
    ads = row['ads'].replace('[','').replace(']','').split(',') 
    for ad in ads:
        user_ad_dict[str(row['user_id']) + '|' + str(ad)] = 1
        
def isRecommended(row):
    return user_ad_dict.get(str(row['user_id']) + '|' + str(row['ad_id']),0)

combined['isRecommended'] = combined.apply(isRecommended, axis = 1)

In [16]:
#  sort user data 
combined.sort_values('event_time',inplace=True)

# Recommending the most popular

In [17]:
df = user_data[['ad_id','ad_messages']].groupby('ad_id', as_index=False).agg('count')
most_popular_ads = df.nlargest(10,['ad_messages'])['ad_id'].tolist()

In [18]:
sub1 = user_messages 
sub1['recommend'] = str(most_popular_ads)
sub1.head()

Unnamed: 0,user_id,category_id,ads,recommend
0,1,859,[1806476],"[2804693, 2794985, 2809690, 2814436, 26400, 28..."
1,3,800,"[2131700, 2734107, 2877209, 2877209]","[2804693, 2794985, 2809690, 2814436, 26400, 28..."
2,4,815,[2883211],"[2804693, 2794985, 2809690, 2814436, 26400, 28..."
3,7,815,"[2429412, 2886810, 2886804]","[2804693, 2794985, 2809690, 2814436, 26400, 28..."
4,7,362,[2909301],"[2804693, 2794985, 2809690, 2814436, 26400, 28..."


# Model Evaluation

In [19]:
def evaluate_accuracy(df):
    
    count = 0
    for index, row in df.iterrows():
        ads = row['ads'].replace('[','').replace(']','').split(',')
        recommended_ads = row['recommend'].replace('[','').replace(']','').split(',')
        
        for ad in ads:
            if (ad in recommended_ads):
                count = count + 1
                
    return count
                

In [20]:
def evaluate_precision(df):
    
    score = 0
    for index, row in df.iterrows():
        count = 0
        ads = row['ads'].replace('[','').replace(']','').split(',')
        recommended_ads = row['recommend'].replace('[','').replace(']','').split(',')
        
        for i in range(0, len(recommended_ads)):
            if (recommended_ads[i] in ads):
                count = count + 1
                score = score + count
        
        #print(ads, ":", recommended_ads,":" ,score)
        score = 1.0*score/len(ads)
        
                
    return score
                

In [21]:
print('accuracy of model 1 : ',evaluate_accuracy(sub1))
print('precision of model 1 : ',evaluate_precision(sub1))

accuracy of model 1 :  4
precision of model 1 :  0.0


# Recommending category-wise most popular elements

In [22]:
df = user_data[['ad_id','ad_views']].groupby('ad_id', as_index=False).agg('sum')
most_popular_ads = df.nlargest(10,['ad_views'])['ad_id'].tolist()

In [23]:
df1 = user_data[['ad_id','ad_views']]
df2 = ads_data[['ad_id','category_id']]
merged_data = pd.merge(df1, df2, on='ad_id')

In [24]:
most_popular_ads_catwise = merged_data.groupby(['category_id','ad_id'], as_index=False).agg('count')

In [25]:
df = most_popular_ads_catwise
cats = df.category_id.unique()
cats

array([362, 800, 806, 811, 815, 853, 859, 881, 887, 888], dtype=int64)

In [26]:
cat_dict = {}
for cat in cats:
    df1 = df[df['category_id']==cat]
    cat_dict[cat] = df1.nlargest(10,['ad_views'])['ad_id'].tolist()
    
print(cat_dict)

{362: [2757104, 2836886, 2853312, 2560201, 2449079, 2806018, 2598159, 1704736, 2661821, 333573], 800: [2814436, 2806603, 2816630, 2816336, 2802819, 2031170, 2739908, 2649805, 2587910, 2737601], 806: [2804693, 2809690, 26400, 2825119, 2788868, 2764905, 2785533, 2516918, 2822691, 2336966], 811: [2626729, 2764838, 1928465, 2753896, 2597250, 2727592, 2839093, 2670023, 2746309, 2667189], 815: [2234932, 2434247, 2796911, 2812937, 2812849, 2719785, 2342155, 2816482, 2748424, 2434257], 853: [2853382, 2839832, 2082120, 2564683, 1190194, 2613818, 2632665, 2774269, 2832788, 2813016], 859: [2449447, 2567368, 2426549, 2274484, 1998464, 2740230, 2067241, 2778369, 1065275, 2842601], 881: [2794985, 2816864, 2817712, 2840833, 2727695, 2519508, 2662331, 2767340, 2846976, 2688815], 887: [2852082, 2677927, 2802433, 2634993, 2696625, 2714941, 2592955, 2810869, 1699375, 2031454], 888: [2254453, 2827386, 2740665, 2578937, 2826518, 2808111, 2826051, 2808047, 2808064, 2827584]}


In [27]:
def catWisePopular(row):
    return str(cat_dict[row['category_id']])

sub2 = user_messages 
sub2['recommend'] = sub2.apply(catWisePopular, axis = 1)
sub2.head()

Unnamed: 0,user_id,category_id,ads,recommend
0,1,859,[1806476],"[2449447, 2567368, 2426549, 2274484, 1998464, ..."
1,3,800,"[2131700, 2734107, 2877209, 2877209]","[2814436, 2806603, 2816630, 2816336, 2802819, ..."
2,4,815,[2883211],"[2234932, 2434247, 2796911, 2812937, 2812849, ..."
3,7,815,"[2429412, 2886810, 2886804]","[2234932, 2434247, 2796911, 2812937, 2812849, ..."
4,7,362,[2909301],"[2757104, 2836886, 2853312, 2560201, 2449079, ..."


In [28]:
print('accuracy of model 2 : ',evaluate_accuracy(sub2))
print('precision of model 2 : ',evaluate_precision(sub2))

accuracy of model 2 :  38
precision of model 2 :  8.356321959541373e-22


# Classifaction based model

In [29]:
#columns = [col for col in combined.columns if col not in ['user_lat', 
#'user_long', 'lat', 'long', 'title', 'description', 'origin', 'source']]

#combined = combined[columns]

In [30]:
import numpy as np
combined = combined.replace([np.inf, -np.inf], np.nan)
combined = combined.fillna(0)

In [31]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(combined, test_size = 0.2, random_state = 0)

In [32]:
features = ['lat','user_lat']
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

clf = DecisionTreeClassifier(random_state=0, max_depth=1)
clf.fit(train[features], train['isRecommended'])




Traning Accuracy :0.9993417335670985
Validation Accuracy :0.9994309409531272


In [37]:
train

Unnamed: 0,event_time,user_id,event,channel,user_lat,user_long,origin,ad_id,images_count,ad_impressions,...,seller_id,creation_time,title,description,price,lat,long,source,enabled,isRecommended
1353502,736467,10152,1,2,-34.596501,-58.431631,home,2630878,1.0,0.0,...,616357,736466,MUEBLE VANITORY COMPLETO OFERT,"Vanitory completo con bacha de vidrio, cajon y...",1997.0,0.000000,0.000000,android,0,0
36272,736480,4094,1,0,-34.646748,-58.504932,home,2725792,3.0,0.0,...,132465,736477,Camperon abrigado,"Camperon blanco super abrigado, de tela imperm...",650.0,-34.645530,-58.501153,android,1,0
2106238,736490,13809,1,0,-34.341278,-58.794456,notification_center,2826034,1.0,0.0,...,118755,736490,Ropa para perros,Chalecos para perro. Precio talle 5.. Otros ta...,190.0,0.000000,0.000000,android,0,0
1452962,736480,1629,1,0,-34.450321,-58.599049,home,2564088,3.0,0.0,...,273646,736459,REMATO samsung trend,es liberado y anda perfecto. astillado en una ...,1600.0,0.000000,0.000000,android,0,0
1206329,736492,4320,1,0,-34.593472,-58.402893,search,2817427,2.0,1647.0,...,146983,736489,Montgomery para Hombre,Marca: LNG Clothing (USA) - Color: Gris - Tall...,1000.0,-34.589336,-58.396202,apple,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045705,736473,13026,1,2,-34.468696,-58.502882,home,2679124,2.0,0.0,...,458809,736472,Reloj Nike malla metalica,0,550.0,0.000000,0.000000,apple,1,0
1271037,736491,5699,1,0,-34.629871,-58.470013,search,1970867,1.0,10.0,...,526234,736385,camara digital 12 mega pixels,"Camara digital 12megapixels, memoria 2gb inclu...",300.0,0.000000,0.000000,apple,0,0
1720748,736482,3579,1,0,-34.565849,-58.469189,push,2284901,4.0,0.0,...,221310,736425,Borcegos Prune,grises talle 36-37- muy buen estado,300.0,0.000000,0.000000,apple,0,0
1021560,736489,11086,1,0,-34.603733,-58.383766,home,1401514,2.0,0.0,...,159238,736308,Zapatillas nike nair talle 39,Estan en buen estados nuebas me quedaron chicas,700.0,0.000000,0.000000,android,1,0


In [None]:
combined['pred'] = clf.predict(combined[features])
print("Traning Accuracy :" + str(accuracy_score(train['isRecommended'], clf.predict(train[features]))))
print("Validation Accuracy :" + str(accuracy_score(val['isRecommended'], clf.predict(val[features]))))

In [33]:
personalized_dict = {}
counter = 0
for index, row in combined[combined['pred']==1].iterrows():
    key = str(row['user_id']) + "|" + str(row['category_id'])
    personalized_dict[key] = personalized_dict.get(key,[])
    personalized_dict[key].add(row['ad_id'])

# Merged Approach

In [34]:
user_dict_view = {}
user_dict_fmsg = {}
counter = 0
for index, row in user_data.iterrows():
    counter = counter + 1
    if (counter%100000==0):
        print(counter)
    if (row['event']=='first_message'):
        user_dict_fmsg[row['user_id']] = user_dict_fmsg.get(row['user_id'],[])
        user_dict_fmsg[row['user_id']].add('ad_id')
    if (row['event']=='view'):
        user_dict_view[row['user_id']] = user_dict_view.get(row['user_id'],[])
        user_dict_view[row['user_id']].add('ad_id')
    

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000


In [35]:
def getRecommendations(row):
    user = str(row['user_id'])
    cat = str(row['category_id'])
    
    personalized_ads = personalized_dict.get(user+ '|'+cat,[])
    user_most_viewed = user_dict_view.get(int(user),[])
    user_messaged = user_dict_fmsg.get(int(user),[])
    
    most_popular = cat_dict[row['category_id']]
    
    allAds = user_most_viewed + personalized_ads + user_messaged + most_popular
    # here we can write some scoring logic for choosing the one to recommend among these
    return str(allAds[0:10])
    

In [36]:
sub = user_messages 
sub['recommend'] = sub.apply(catWisePopular, axis = 1)

print('accuracy of model : ',evaluate_accuracy(sub))
print('precision of model : ',evaluate_precision(sub))

accuracy of model :  38
precision of model :  8.356321959541373e-22
