In [55]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [56]:
# Take from https://stackoverflow.com/questions/37292872/how-can-i-one-hot-encode-in-python
# for one hot encoding 
# It does one_hot encoding and remove the original column for calculating similarity
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

In [57]:
# User Data Generation

# Simulating a pandas dataframe for user/personalities with attributes like age and product interest etc.

def simulate_user_data(min_age, max_age, dict_items_brands, n = 100):
    # Unique user_id 
    list_id = [i for i in range(1, n + 1)]
    # Physical attributes
    list_age = np.random.randint(min_age, max_age, n)
    list_gender = np.random.choice(["M", "F"], n, p = [0.4, 0.6])
    # Interest attributes (last purchase = days)
    list_last_purchase = np.random.randint(1, 7, n)
    list_last_purchase_item = np.random.choice(list(dict_items_brands), n)
    # Brand preferences (If any!)
    list_brand_preference = [np.random.choice(dict_items_brands[key]) for key in list_last_purchase_item]
    # Browsing behaviour (time spent = mins)
    list_time_spent_on_browsing = np.random.randint(3, 180, n)
    
    # now using the list's, create a pandas df for the users
    user_df = pd.DataFrame(list(zip(list_id, 
                                   list_age,
                                   list_gender,
                                   list_last_purchase, 
                                   list_last_purchase_item, 
                                   list_brand_preference,
                                   list_time_spent_on_browsing,
                                   )), 
                          columns = ["user_id", 
                                     "age", 
                                     "gender", 
                                     "last_purchase_day",
                                     "last_purchase_item",
                                     "brand_preference",
                                     "average_time_spent_on_browsing"])
    return user_df
    

dict_items_brands = {"clothing" : ["nike", "addidas", "gucci", "levis", "calvin_klein"],
                     "shoes" : ["corcs", "addidas", "nike", "under_armour", "kenneth_cole"], 
                     "watches" : ["rolex", "omega", "titan", "seiko", "fossil"],
                     "accessories" : ["atheleisure", "addidas"],
                     "mobile_phones" : ["apple", "samsung", "motorola", "xiaomi"],
                     "tablets" : ["apple", "samsung", "motorola", "xiaomi", "fire"],
                     "car" : ["ford", "mercedes", "toyota"],
                     "sports" : ["nike", "addidas", "puma"]
                    }
user_df = simulate_user_data(15, 50, dict_items_brands, n = 200)

# To encode the categorical variables
features_to_encode = ['gender', 'last_purchase_item']
user_df_one_hot = user_df
for feature in features_to_encode:
    user_df_one_hot = encode_and_bind(user_df_one_hot, feature)


In [58]:
user_df_one_hot

Unnamed: 0,user_id,age,last_purchase_day,brand_preference,average_time_spent_on_browsing,gender_F,gender_M,last_purchase_item_accessories,last_purchase_item_car,last_purchase_item_clothing,last_purchase_item_mobile_phones,last_purchase_item_shoes,last_purchase_item_sports,last_purchase_item_tablets,last_purchase_item_watches
0,1,26,2,xiaomi,170,0,1,0,0,0,1,0,0,0,0
1,2,30,4,addidas,29,1,0,0,0,0,0,0,1,0,0
2,3,38,5,ford,156,1,0,0,1,0,0,0,0,0,0
3,4,40,6,toyota,85,1,0,0,1,0,0,0,0,0,0
4,5,41,5,nike,109,1,0,0,0,0,0,0,1,0,0
5,6,45,2,mercedes,69,1,0,0,1,0,0,0,0,0,0
6,7,32,3,atheleisure,159,0,1,1,0,0,0,0,0,0,0
7,8,43,1,puma,8,1,0,0,0,0,0,0,1,0,0
8,9,46,2,puma,169,0,1,0,0,0,0,0,1,0,0
9,10,20,2,addidas,39,0,1,0,0,0,0,0,1,0,0


In [207]:
# Ad Data Generation

# Simulating a pandas dataframe for the ads and the charateristics of the ad

def simulate_ad_data(min_age, max_age, dict_items_brands, n = 20):
    # Unique ad_id 
    list_id = [i for i in range(1, n + 1)]
    
    list_creativity_score = np.random.randint(1, 5, n)
    list_reservation_price = np.random.randint(0.5, 10, n)
    list_ad_type = np.random.choice(["T", "A", "V"], n, p = [0.7, 0.2, 0.1])
    list_have_link_embedded = np.random.choice(["Y", "N"], n, p = [0.7, 0.3])
    
    # Target user attributes
    list_last_purchase_item = np.random.choice(list(dict_items_brands), n)
    list_age = np.random.randint(min_age, max_age, n)
    list_gender = np.random.choice(["M", "F"], n, p = [0.4, 0.6])
    list_last_purchase = np.random.randint(1, 7, n)
    list_time_spent_on_browsing = np.random.randint(3, 180, n)
    
    # Create the dataframe
    ad_df = pd.DataFrame(list(zip(list_id, 
                                   list_creativity_score,
                                   list_reservation_price,
                                   list_ad_type, 
                                   list_have_link_embedded, 
                                  list_last_purchase_item, 
                                  list_age,
                                  list_gender,
                                  list_last_purchase,
                                  list_time_spent_on_browsing
                                   )), 
                          columns = ["ad_id", 
                                     "creativity_score", 
                                     "reservation_prc", 
                                     "ad_type",
                                     "link_bool", 
                                    "last_purchase_item", 
                                    "age", 
                                    "gender",
                                    "last_purchase_day",
                                    "average_time_spent_on_browsing"
                                    ])
    return ad_df
    
ad_df = simulate_ad_data(15, 50, dict_items_brands, n = 100)

# To encode the categorical variables
features_to_encode = ['gender', 'last_purchase_item']
ad_df_one_hot = ad_df
for feature in features_to_encode:
    ad_df_one_hot = encode_and_bind(ad_df_one_hot, feature)
ad_df


Unnamed: 0,ad_id,creativity_score,reservation_prc,ad_type,link_bool,last_purchase_item,age,gender,last_purchase_day,average_time_spent_on_browsing
0,1,4,4,T,N,tablets,15,M,4,131
1,2,4,8,T,Y,car,24,F,6,158
2,3,2,8,T,N,clothing,36,M,4,101
3,4,1,5,T,Y,car,30,F,5,69
4,5,3,3,A,Y,mobile_phones,40,F,6,33
5,6,2,3,T,N,accessories,20,F,5,102
6,7,4,2,T,Y,car,20,F,1,146
7,8,2,1,T,Y,clothing,18,M,4,75
8,9,1,5,T,Y,car,49,F,2,14
9,10,4,0,T,Y,watches,25,F,4,39


In [176]:
user_data_for_similarity = pd.concat([user_df_one_hot[["age", 
                                                       "last_purchase_day",
                                                       "average_time_spent_on_browsing"]], 
                                      user_df_one_hot.iloc[:, 5: user_df_one_hot.shape[1]]
                                     ], axis = 1)
                                      

#print(user_data_for_similarity)

#print(ad_data_for_similarity)
offering_price = np.asscalar(np.random.randint(0.5, 10, 1))
eligible_ads = ad_df_one_hot[ad_df_one_hot["reservation_prc"] <= offering_price]
print(eligible_ads.shape)
# empty_list 
ad_user_pair = pd.DataFrame()
# Iterate over all the ads and see which user is the best user for that add
for index, row in eligible_ads.iterrows():
    #row = pd.DataFrame(row)
    #print(type(row))
    ad_data_for_similarity = row.iloc[5:18]
    #print(ad_data_for_similarity)
    #print(ad_data_for_similarity.shape)
    counter = 0

    #print(ad_data_for_similarity)
    for index2, row2 in user_data_for_similarity.iterrows():
        #print(np.array(row2))
        #print(np.array(ad_data_for_similarity))
        a = np.vstack((row2, ad_data_for_similarity))
        #print(a.shape)
        cosine_similarity_value = cosine_similarity(a)
        #print(np.stack(np.array(ad_data_for_similarity), np.array(row2)))
        cosine_similarity_value = cosine_similarity_value[0, 1]
        #print((cosine_similarity_value))
        if (cosine_similarity_value > counter):
            counter = cosine_similarity_value
            matched_user_index = index2
        else:
            pass
    merged_df = pd.DataFrame(pd.concat((ad_df.loc[index], user_df.loc[matched_user_index]))).T
    print((merged_df.shape))
    
    #print(ad_user_pair.shape)
    ad_user_pair = pd.concat((ad_user_pair, merged_df))

    #print(type(ad_data_for_similarity))

  # This is added back by InteractiveShellApp.init_path()


(50, 18)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)
(1, 17)


In [177]:
ad_user_pair

Unnamed: 0,ad_id,creativity_score,reservation_prc,ad_type,link_bool,last_purchase_item,age,gender,last_purchase_day,average_time_spent_on_browsing,user_id,age.1,gender.1,last_purchase_day.1,last_purchase_item.1,brand_preference,average_time_spent_on_browsing.1
0,1,4,3,T,N,clothing,41,F,1,66,96,49,M,1,accessories,atheleisure,77
0,2,1,0,T,Y,mobile_phones,28,F,5,53,50,43,F,5,watches,omega,78
0,7,4,0,A,N,tablets,15,F,3,65,84,21,M,5,tablets,motorola,91
0,8,4,3,T,N,tablets,48,M,6,176,83,27,M,4,tablets,samsung,98
0,11,4,2,A,Y,watches,35,F,1,60,44,34,F,2,tablets,fire,60
0,12,3,3,A,Y,clothing,15,M,1,142,122,17,M,2,mobile_phones,samsung,166
0,15,4,3,T,Y,accessories,47,F,5,52,81,43,F,4,shoes,corcs,45
0,16,1,0,T,Y,tablets,16,F,4,165,159,17,F,4,shoes,under_armour,174
0,17,3,2,A,N,shoes,18,M,6,23,165,35,F,6,tablets,apple,43
0,20,4,3,T,Y,clothing,35,M,2,95,38,45,M,2,tablets,fire,125


In [203]:
a = ad_user_pair["last_purchase_day"]
b = (a.iloc[:, 0] == a.iloc[:, 1])

In [204]:
b.value_counts().loc[True]/len(b)


0.4

In [200]:
len(b)

50

In [None]:
last_purchase_day

## Steps in the Alogrithm 

1. Pick an offering price.
2. Based on the offering price, choose all the ads whose reservation price is less than the offering price
3. Go through each one of the ads and find the best/optimal user. 
4. Find the best user using a cosine similarity between the target audience of the ad and the user data who has landed up on a webpage.
5. Now, we have the final dataframe that gives me the best target user for each ad. 
6. Repeat steps from 1 - 5 for different prices to mimic the behaviour of the adExchange.


## To Do List
1. UML Class Diagram
2. Implement the same code in the OOP concepts
3. Upload it on the github and add Prof. Charles

In [None]:
# Object Oriented 
# The code below is being taken from 
# https://stackoverflow.com/questions/46466588/generating-random-number-for-class-attribute
# Class creation
# User Class - It defines a user attributes like age, interest, browsing patter, etc. to mimic the 
# data that we get from cookies
from numpy import random

class Internet_User():

    # An Internet User attributes
    def __init__(self, name, age=None, gender):
        if age is None:
            age = random.randint(15,50)
        self.name = name
        self.age = age
        self.gender = gender

In [205]:
# Ad Supplier Class
hash("abc")

-5128333308206136575

In [206]:
hash("abc")

-5128333308206136575

## Ideas
1. How do we bucket time. How does real advertising works? 
Ans - Include this by sampling the universe 
2. How to inlcude the budget of the ad offering.
3. Data structures and Algorithms - Help needed!! 
