In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
from torch.nn.utils import spectral_norm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ads = pd.read_csv('train/train_data_ads.csv')
# feeds = pd.read_csv('train/train_data_feeds.csv')

## Merga Publisher Data and Advertiser Data
* First, we grouped by the publisher data by user_id after choosing the suitable group function for each column.
* Then we inner joined publisher data and advertiser data on user_id.

In [13]:
def most_common(x):
    return x.value_counts().idxmax()

columns_to_mode = feeds.columns.drop(['u_userId','i_docId', 'i_s_sourceId','i_entities','i_dislikeTimes', 
                                     'i_upTimes', 'e_po', 'e_rn', 'e_et', 'label', 'cillabel', 'pro'])
columns_to_mean = ['i_dislikeTimes', 'i_upTimes', 'e_po', 'e_rn']
columns_to_min = [ 'label', 'cillabel', 'pro']

agg_dictionary = {
    **{column: most_common for column in columns_to_mode},
    **{column: 'mean' for column in columns_to_mean},
    **{column: 'min' for column in columns_to_min}
}

feeds_group_by = feeds.groupby('u_userId').agg(agg_dictionary).reset_index()


In [4]:
# feeds_group_by.to_csv('feeds_group_by.csv')
feeds_group_by = pd.read_csv('feeds_group_by.csv')
feeds_group_by =  feeds_group_by.iloc[:, 1:]
feeds_group_by

Unnamed: 0,u_userId,u_phonePrice,u_browserLifeCycle,u_browserMode,u_feedLifeCycle,u_refreshTimes,u_newsCatInterests,u_newsCatDislike,u_newsCatInterestsST,u_click_ca2_news,...,e_m,e_pl,e_section,i_dislikeTimes,i_upTimes,e_po,e_rn,label,cillabel,pro
0,100001,16,15,10,15,0,112,0,109,0,...,369,214,0,5.333333,6.000000,4.833333,3.000000,-1,-1,0
1,100002,11,17,13,15,0,117^78^112^98^157,0,117^78,117^78^112^98,...,1403,656,1,3.666667,9.000000,7.000000,1.000000,-1,-1,0
2,100003,16,17,13,17,2,152^123^220^30^16,0,112,112^65^123^10^16,...,1087,2780,1,3.500000,8.000000,6.500000,1.000000,-1,-1,0
3,100005,16,15,11,15,0,220^16^39^142^211,0,39^220^16,39^16^220^142^211,...,1123,2836,0,5.400000,8.800000,7.600000,1.400000,-1,-1,0
4,100006,16,17,14,17,3,16^152^98^220^39,0,220^98^106^16^195,220^106^98^16^195,...,998,835,1,1.384615,6.730769,7.000000,1.000000,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180118,287180,14,17,14,17,7,65^171^213^65^168,0,199^171^147^207^218,0^218^65^67^112,...,25,348,1,2.730769,5.307692,6.346154,1.000000,-1,-1,0
180119,287181,14,17,14,15,0,168^78,0,0,168^78^0,...,934,879,1,0.666667,2.666667,6.666667,1.000000,-1,-1,0
180120,287182,14,17,14,15,0,177^98^65^112^156,0,177^27,177^27^98^65^168,...,508,251,0,3.769231,5.961538,6.807692,1.923077,-1,-1,0
180121,287183,14,17,14,15,0,65^179^205,0,0,179^205^151,...,591,2450,1,1.000000,9.000000,7.000000,1.000000,-1,-1,0


In [5]:
data_merged = ads.merge(feeds_group_by, left_on='user_id', right_on='u_userId', how='inner')
data_merged = data_merged.drop(['u_userId','u_feedLifeCycle_y', 'u_refreshTimes_y','u_newsCatInterestsST_y'], axis = 1)
data_merged 

Unnamed: 0,log_id,label_x,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,e_m,e_pl,e_section,i_dislikeTimes,i_upTimes,e_po,e_rn,label_y,cillabel,pro
0,373250,0,100005,3,2,16,147,2,32,6,...,1123,2836,0,5.4,8.8,7.6,1.4,-1,-1,0
1,373253,1,100005,3,2,16,147,2,32,6,...,1123,2836,0,5.4,8.8,7.6,1.4,-1,-1,0
2,373252,1,100005,3,2,16,147,2,32,6,...,1123,2836,0,5.4,8.8,7.6,1.4,-1,-1,0
3,373251,0,100005,3,2,16,147,2,32,6,...,1123,2836,0,5.4,8.8,7.6,1.4,-1,-1,0
4,373255,0,100005,3,2,16,147,2,32,6,...,1123,2836,0,5.4,8.8,7.6,1.4,-1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7675512,309815,1,286999,5,2,16,426,2,34,7,...,943,2658,1,5.0,9.0,6.0,1.0,-1,-1,0
7675513,309813,0,286999,5,2,16,426,2,34,7,...,943,2658,1,5.0,9.0,6.0,1.0,-1,-1,0
7675514,309810,0,286999,5,2,16,426,2,34,7,...,943,2658,1,5.0,9.0,6.0,1.0,-1,-1,0
7675515,309812,0,286999,5,2,16,426,2,34,7,...,943,2658,1,5.0,9.0,6.0,1.0,-1,-1,0


## Process Merged Data
* split list data
* for timestamp data, we transformed it into int type which represents how mant minutes it is from the earliest time.

In [6]:
object_columns = data_merged.select_dtypes(include='object').columns
object_columns

# delete the '^' 
for col in object_columns:
    data_merged[col] = data_merged[col].apply(lambda x: x.split('^'))

# fixed length
fixed_length = 5

# pad or truncate 
def pad_or_truncate(lst, length):
    if len(lst) > length:
        return lst[:length]
    else:
        return lst + ['0'] * (length - len(lst))

for col in object_columns:
    data_merged[col] = data_merged[col].apply(lambda x: pad_or_truncate(x, fixed_length))

In [7]:
data_merged[object_columns].head()

Unnamed: 0,ad_click_list_v001,ad_click_list_v002,ad_click_list_v003,ad_close_list_v001,ad_close_list_v002,ad_close_list_v003,u_newsCatInterestsST_x,u_newsCatInterests,u_newsCatDislike,u_click_ca2_news
0,"[30157, 30648, 14278, 31706, 0]","[2066, 1776, 1036, 0, 0]","[114, 219, 312, 0, 0]","[24107, 0, 0, 0, 0]","[1218, 0, 0, 0, 0]","[173, 0, 0, 0, 0]","[39, 220, 16, 0, 0]","[220, 16, 39, 142, 211]","[0, 0, 0, 0, 0]","[39, 16, 220, 142, 211]"
1,"[30157, 30648, 14278, 31706, 0]","[2066, 1776, 1036, 0, 0]","[114, 219, 312, 0, 0]","[24107, 0, 0, 0, 0]","[1218, 0, 0, 0, 0]","[173, 0, 0, 0, 0]","[39, 220, 16, 0, 0]","[220, 16, 39, 142, 211]","[0, 0, 0, 0, 0]","[39, 16, 220, 142, 211]"
2,"[30157, 30648, 14278, 31706, 0]","[2066, 1776, 1036, 0, 0]","[114, 219, 312, 0, 0]","[24107, 0, 0, 0, 0]","[1218, 0, 0, 0, 0]","[173, 0, 0, 0, 0]","[39, 220, 16, 0, 0]","[220, 16, 39, 142, 211]","[0, 0, 0, 0, 0]","[39, 16, 220, 142, 211]"
3,"[30157, 30648, 14278, 31706, 0]","[2066, 1776, 1036, 0, 0]","[114, 219, 312, 0, 0]","[24107, 0, 0, 0, 0]","[1218, 0, 0, 0, 0]","[173, 0, 0, 0, 0]","[39, 220, 16, 0, 0]","[220, 16, 39, 142, 211]","[0, 0, 0, 0, 0]","[39, 16, 220, 142, 211]"
4,"[30157, 30648, 14278, 31706, 0]","[2066, 1776, 1036, 0, 0]","[114, 219, 312, 0, 0]","[24107, 0, 0, 0, 0]","[1218, 0, 0, 0, 0]","[173, 0, 0, 0, 0]","[39, 220, 16, 0, 0]","[220, 16, 39, 142, 211]","[0, 0, 0, 0, 0]","[39, 16, 220, 142, 211]"


In [9]:
# split into multiple columns
def split_list_column(df, column_name, new_column_prefix):
    max_length = df[column_name].apply(len).max()
    split_columns = pd.DataFrame(df[column_name].tolist(),
                                 columns=[f'{new_column_prefix}_{i+1}' for i in range(max_length)])
    df = df.drop(columns=[column_name]).join(split_columns)
    return df

for column in object_columns:
    data_merged = split_list_column(data_merged, column, column)

### Merged Data Generating

* We used a CGAN(Conditional Generative Adversial Network) model to generate synthesized data
* We used label as the condition in the model, for users with different labels are supposed to have different characteristics.
* We also used spectral normalization the process the Discriminator to prevent it to be too strong for the Generator.

In [11]:
import torch
import torch.nn as nn

# Conditional GAN model architecture
class Generator(nn.Module): 
    def __init__(self, input_size, condition_size, output_size):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size + condition_size, 128),
            nn.ReLU(True),
            nn.Linear(128, 256),
            nn.ReLU(True),
            nn.Linear(256, 512),
            nn.ReLU(True),
            nn.Linear(512, output_size)
        )
    
    def forward(self, x, c):
        # Concatenate noise and condition
        x = torch.cat((x, c), 1)
        return self.model(x)


class Discriminator(nn.Module):
    def __init__(self, input_size, condition_size):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            spectral_norm(nn.Linear(input_size + condition_size, 512)),  
            nn.LeakyReLU(0.2, inplace=True),
            spectral_norm(nn.Linear(512, 256)), 
            nn.LeakyReLU(0.2, inplace=True),
            spectral_norm(nn.Linear(256, 128)),  
            nn.LeakyReLU(0.2, inplace=True),
            spectral_norm(nn.Linear(128, 1)), 
            nn.Sigmoid()
        )
    
    def forward(self, x, c):
        # Concatenate input and condition
        x = torch.cat((x, c), 1)
        return self.model(x)


In [13]:
# hyperparameters
input_size = 100  
output_size = len(data_merged.drop('label_x', axis=1).columns)  
condition_size = 1

num_epochs = 10000
batch_size = 128

learning_rate = 0.0001

# initialization
generator = Generator(input_size, condition_size, output_size)
discriminator = Discriminator(output_size, condition_size)

# loss function & optimizer
criterion = nn.BCELoss()
generator_optimizer = optim.Adam(generator.parameters(), lr=learning_rate)
discriminator_optimizer = optim.Adam(discriminator.parameters(), lr=learning_rate)

# standardize
scaler = MinMaxScaler()
# StandardScaler()
ads_scaled = scaler.fit_transform(data_merged)

# convert to tensor
ads_data = torch.tensor(ads_scaled, dtype=torch.float32)

In [15]:
G_losses = []
D_losses = []
G_Accuracys = []
D_Accuracys = []

# Training
for epoch in range(num_epochs):
    
    discriminator_optimizer.zero_grad() 
    
    real_data = ads_data[torch.randint(0, len(ads_data), (batch_size,))]
    real_condition = real_data[:, 1].unsqueeze(1)
    real_samples = torch.cat((real_data[:, :1], real_data[:, 2:]), dim=1)
    real_labels = torch.ones(batch_size, 1)
    
    # generate fake data
    noise = torch.randn(batch_size, input_size)
    fake_condition = torch.randint(0, 2, (batch_size,1))
    fake_samples = generator(noise, fake_condition)
    fake_labels = torch.zeros(batch_size, 1)
    
    # loss for discriminator 
    real_output = discriminator(real_samples, real_condition)
    fake_output = discriminator(fake_samples.detach(), fake_condition)
    real_loss = criterion(real_output, real_labels)
    fake_loss = criterion(fake_output, fake_labels)
    discriminator_loss = real_loss + fake_loss
    
    discriminator_loss.backward()
    discriminator_optimizer.step()

    # clear gradients
    generator_optimizer.zero_grad() 
    
    # generate fake data
    noise = torch.randn(batch_size, input_size)
    fake_condition = torch.randint(0, 2, (batch_size,1))
    fake_samples = generator(noise, fake_condition)
    fake_output = discriminator(fake_samples, fake_condition)
    
    # loss for generator
    generator_loss = criterion(fake_output, real_labels)
    
    generator_loss.backward()
    generator_optimizer.step()
    
    # Print loss and other information every 100 epochs
    if (epoch + 1) % 100 == 0:
        # Calculate discriminator accuracy
        real_accuracy = (real_output > 0.5).float().mean().item()
        fake_accuracy = (fake_output < 0.5).float().mean().item()
        d_accuracy = 0.5 * (real_accuracy + fake_accuracy)

        # Calculate generator accuracy
        g_accuracy = (fake_output > 0.5).float().mean().item()
        
        G_losses.append(generator_loss.item())
        D_losses.append(discriminator_loss.item())
        G_Accuracys.append(d_accuracy)
        D_Accuracys.append(g_accuracy)
        
        # Print losses and accuracies
        tqdm.write(f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Generator Loss: {generator_loss.item():.4f}, "
                f"Discriminator Loss: {discriminator_loss.item():.4f}, "
                f"Discriminator Accuracy: {d_accuracy:.4f}, "
                f"Generator Accuracy: {g_accuracy:.4f}")

Epoch [100/10000], Generator Loss: 0.9920, Discriminator Loss: 1.4944, Discriminator Accuracy: 0.5000, Generator Accuracy: 0.0000
Epoch [200/10000], Generator Loss: 0.6694, Discriminator Loss: 1.1800, Discriminator Accuracy: 0.6602, Generator Accuracy: 0.6797
Epoch [300/10000], Generator Loss: 0.9393, Discriminator Loss: 1.2485, Discriminator Accuracy: 0.5273, Generator Accuracy: 0.0000
Epoch [400/10000], Generator Loss: 0.8840, Discriminator Loss: 1.2290, Discriminator Accuracy: 0.9023, Generator Accuracy: 0.0000
Epoch [500/10000], Generator Loss: 0.8920, Discriminator Loss: 1.0114, Discriminator Accuracy: 1.0000, Generator Accuracy: 0.0000
Epoch [600/10000], Generator Loss: 1.2077, Discriminator Loss: 1.5501, Discriminator Accuracy: 0.5000, Generator Accuracy: 0.0000
Epoch [700/10000], Generator Loss: 0.8378, Discriminator Loss: 1.2857, Discriminator Accuracy: 0.7305, Generator Accuracy: 0.0000
Epoch [800/10000], Generator Loss: 0.8280, Discriminator Loss: 1.2164, Discriminator Accur

Epoch [6400/10000], Generator Loss: 0.9125, Discriminator Loss: 1.1246, Discriminator Accuracy: 0.7695, Generator Accuracy: 0.4531
Epoch [6500/10000], Generator Loss: 0.7629, Discriminator Loss: 1.3624, Discriminator Accuracy: 0.7227, Generator Accuracy: 0.5000
Epoch [6600/10000], Generator Loss: 0.8430, Discriminator Loss: 1.1123, Discriminator Accuracy: 0.7344, Generator Accuracy: 0.5156
Epoch [6700/10000], Generator Loss: 0.9856, Discriminator Loss: 1.2313, Discriminator Accuracy: 0.6055, Generator Accuracy: 0.0000
Epoch [6800/10000], Generator Loss: 0.7697, Discriminator Loss: 1.2195, Discriminator Accuracy: 0.7148, Generator Accuracy: 0.5234
Epoch [6900/10000], Generator Loss: 0.8407, Discriminator Loss: 1.1404, Discriminator Accuracy: 0.7461, Generator Accuracy: 0.4844
Epoch [7000/10000], Generator Loss: 1.0143, Discriminator Loss: 1.2250, Discriminator Accuracy: 0.6562, Generator Accuracy: 0.0000
Epoch [7100/10000], Generator Loss: 0.7791, Discriminator Loss: 1.3027, Discriminat

In [17]:
# generate new data point
noise = torch.randn(100, input_size)
condition = torch.randint(0, 2, (100,1))
generated_data = generator(noise, condition).detach().numpy()
# generated_data.shape
generated_data = np.concatenate((generated_data[:, :1], condition.float().numpy(), generated_data[:, 1:]), axis=1)
generated_data = scaler.inverse_transform(generated_data)
generated_data[generated_data < 0] = 0 # set negative values to 0
generated_data = np.round(generated_data).astype(int)  # round to nearest integer
generated_df = pd.DataFrame(generated_data, columns = data_merged.columns) # convert to dataframe
generated_df.head()

Unnamed: 0,log_id,label_x,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,u_newsCatDislike_1,u_newsCatDislike_2,u_newsCatDislike_3,u_newsCatDislike_4,u_newsCatDislike_5,u_click_ca2_news_1,u_click_ca2_news_2,u_click_ca2_news_3,u_click_ca2_news_4,u_click_ca2_news_5
0,146630,1,158795,3,2,20,192,4,24,5,...,33,0,43,0,0,138,87,117,23,86
1,176902,0,162059,3,2,22,198,4,22,5,...,25,0,38,0,0,124,82,110,26,80
2,153154,0,159486,3,2,22,192,4,23,5,...,31,0,42,0,0,132,86,111,22,82
3,201391,0,164411,3,2,22,197,4,21,5,...,20,0,32,0,0,115,81,98,31,76
4,202024,1,165928,3,2,22,202,4,22,5,...,21,0,36,0,0,124,84,105,32,82


In [12]:
generated_df['i_entities'] = generated_df[['i_entities_'+str(i+1) for i in range(5)]].astype(str).agg('^'.join, axis=1)
generated_df['u_newsCatInterests'] = generated_df[['u_newsCatInterests_'+str(i+1) for i in range(5)]].astype(str).agg('^'.join, axis=1)
generated_df['u_newsCatDislike'] = generated_df[['u_newsCatDislike_'+str(i+1) for i in range(5)]].astype(str).agg('^'.join, axis=1)
generated_df['u_newsCatInterestsST'] = generated_df[['u_newsCatInterestsST_'+str(i+1) for i in range(5)]].astype(str).agg('^'.join, axis=1)
generated_df['u_click_ca2_news'] = generated_df[['u_click_ca2_news_'+str(i+1) for i in range(5)]].astype(str).agg('^'.join, axis=1)

Unnamed: 0,log_id,label,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,ad_close_list_v003_1,ad_close_list_v003_2,ad_close_list_v003_3,ad_close_list_v003_4,ad_close_list_v003_5,u_newsCatInterestsST_1,u_newsCatInterestsST_2,u_newsCatInterestsST_3,u_newsCatInterestsST_4,u_newsCatInterestsST_5
100000,558337,0,115633,8,2,16,185,2,30,3,...,173,0,0,0,0,216,171,65,86,65
100001,558339,0,115633,8,2,16,185,2,30,3,...,173,0,0,0,0,216,171,65,86,65
100002,558341,0,115633,8,2,16,185,2,30,3,...,173,0,0,0,0,216,171,65,86,65
100003,558343,0,115633,8,2,16,185,2,30,3,...,173,0,0,0,0,216,171,65,86,65
100004,426771,0,115634,8,2,13,103,2,34,7,...,173,0,0,0,0,108,112,62,114,168


In [19]:
columns_to_process = ['ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003', 'ad_close_list_v001', 
                     'ad_close_list_v002', 'ad_close_list_v003', 'u_newsCatInterestsST_x', 'u_newsCatInterests',
                    'u_newsCatDislike', 'u_click_ca2_news']

for column in columns_to_process:
    generated_df[column] = generated_df[[column+'_'+str(i+1) for i in range(5)]].astype(str).agg('^'.join, axis=1)
    for item in [column+'_'+str(i+1) for i in range(5)]:
        generated_df.drop(item, axis = 1)

In [20]:
generated_df

Unnamed: 0,log_id,label_x,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,ad_click_list_v001,ad_click_list_v002,ad_click_list_v003,ad_close_list_v001,ad_close_list_v002,ad_close_list_v003,u_newsCatInterestsST_x,u_newsCatInterests,u_newsCatDislike,u_click_ca2_news
0,146630,1,158795,3,2,20,192,4,24,5,...,24684^17162^2672^7943^2857,1770^436^418^92^31,233^47^0^0^0,24016^9912^0^0^9109,1294^53^150^529^0,159^0^0^0^3,94^89^89^0^121,123^79^118^126^51,33^0^43^0^0,138^87^117^23^86
1,176902,0,162059,3,2,22,198,4,22,5,...,23282^18147^4803^9452^3592,1661^464^520^245^162,226^50^10^0^0,21938^8868^0^0^6523,1260^37^187^426^0,161^0^0^0^3,83^86^88^11^110,110^80^106^115^50,25^0^38^0^0,124^82^110^26^80
2,153154,0,159486,3,2,22,192,4,23,5,...,24155^17591^3532^8920^2933,1732^429^491^150^91,225^39^0^0^0,23071^9449^0^0^7604,1278^47^175^482^0,161^0^0^0^2,86^88^92^0^119,116^81^114^123^51,31^0^42^0^0,132^86^111^22^82
3,201391,0,164411,3,2,22,197,4,21,5,...,22684^18116^7606^10946^5238,1572^530^605^400^329,220^57^45^6^23,20921^7302^0^0^4307,1243^0^169^340^0,163^0^0^0^0,75^87^87^26^101,102^79^98^109^54,20^0^32^0^0,115^81^98^31^76
4,202024,1,165928,3,2,22,202,4,22,5,...,23167^18158^6787^10589^4937,1641^527^565^343^251,226^55^27^0^11,21923^8319^0^0^5790,1267^19^173^384^0,164^0^0^0^0,83^92^91^20^107,110^81^105^114^52,21^0^36^0^0,124^84^105^32^82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,182470,0,162792,3,2,22,196,4,23,5,...,23381^17721^5405^9574^4299,1674^479^531^255^181,227^54^15^0^1,22386^8651^0^0^6529,1266^29^155^432^0,161^0^0^0^1,85^90^88^13^110,113^77^109^118^52,23^0^37^0^0,129^85^109^27^80
96,185761,0,161787,3,2,21,195,4,22,5,...,23068^17219^5422^9497^3985,1647^480^512^267^185,224^54^13^0^3,21879^8151^0^0^6364,1267^21^170^408^0,161^0^0^0^0,82^86^85^14^106,109^77^103^113^50,21^0^35^0^0,122^82^104^27^80
97,180091,0,161840,3,2,21,196,4,22,5,...,23101^17493^5866^9822^4508,1644^480^547^303^217,222^52^24^0^9,21760^8417^0^0^5769,1259^7^168^398^0,159^0^0^0^0,82^89^88^16^107,110^77^105^116^52,22^0^35^0^0,121^82^100^27^79
98,163260,1,158795,3,2,21,193,4,23,5,...,23860^16959^4202^9049^3164,1709^448^487^189^97,225^45^0^0^0,22910^8877^0^0^7540,1277^32^159^457^0,160^0^0^0^2,87^86^89^5^113,117^78^111^119^51,28^0^39^0^0,126^85^108^24^82


Now we have finished the generation of merged data.