# Data Preprocessing


## MovieLens

In [None]:
# Colab setup: install/upgrade the third-party packages imported by the next cells.
# NOTE(review): no versions are pinned, so a fresh run may pull releases that differ
# from the ones this notebook was written against — consider pinning.
!pip install -U deepctr-torch
!pip install --upgrade tensorflow

In [None]:
# Print the installed tensorflow package metadata so the run records which version is in use.
!pip show tensorflow

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import torch
import tensorflow
import keras
import pickle

from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras_preprocessing.sequence import pad_sequences

from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM, DCN, xDeepFM
from deepctr_torch.models import WDL
from deepctr_torch.models.basemodel import BaseModel


import random
import os
from tqdm import tqdm
import io
import gzip
# Mount Google Drive inside the Colab VM so the dataset files can be read from it.
drive.mount('/content/drive/') 
# Folder holding the ml-1m .dat files; later cells concatenate file names onto this string,
# so the trailing slash is required.
# NOTE(review): the name `dir` shadows Python's builtin dir() for the rest of the notebook;
# it is left unchanged because the loading cell references it.
dir = ('/content/drive/My Drive/Github/Colab Notebooks/UCL/grad_research/cross-domain_rs/ml-1m/')

Mounted at /content/drive/


In [None]:
def _read_dat(file_name, column_names, **extra):
    """Load one headerless, '::'-separated .dat file from `dir` into a DataFrame.

    The python parser engine is requested explicitly because the separator is
    longer than one character. Any extra keyword arguments are forwarded to
    ``pd.read_table`` unchanged.
    """
    return pd.read_table(dir + file_name, sep='::', header=None,
                         names=column_names, engine='python', **extra)


# Column layout of each file.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']    # users.dat
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']       # ratings.dat
mnames = ['movie_id', 'title', 'genres']                      # movies.dat

# User information
users = _read_dat('users.dat', unames)
# Rating information
ratings = _read_dat('ratings.dat', rnames)
# Movie information (this file is decoded as ISO-8859-1 instead of the default encoding)
movies = _read_dat('movies.dat', mnames, encoding='ISO-8859-1')

In [None]:
# Join ratings with the user and movie attributes; with no explicit key, each merge
# joins on the column names the two frames share.
data = ratings.merge(users).merge(movies)

# Binary click-through style target: 1 when the rating is above 3, otherwise 0.
data['CT'] = (data['rating'] > 3).astype('int64')

# Row counts per target class. The result is not rendered, because a cell only
# displays its last expression.
data.groupby('CT').count()

data

In [None]:
def data_process(data_df, dense_features,sparse_features):
  """Fill missing values and label-encode the sparse columns of ``data_df``.

  Note that ``data_df`` is modified in place: the filled dense columns and the
  encoded sparse columns overwrite the originals.

  Args:
    data_df: DataFrame containing every column named in the two feature lists.
    dense_features: list of continuous column names; missing values become 0.0.
    sparse_features: list of categorical column names; missing values become
      the category "-1" and every column is replaced by integer codes from a
      fresh ``LabelEncoder``.

  Returns:
    A DataFrame holding only ``dense_features + sparse_features`` (in that order).
  """
  # Replace continuous NA data with 0.0
  data_df[dense_features] = data_df[dense_features].fillna(0.0)
  for feat in sparse_features:
    column = data_df[feat]
    if column.isna().any():
      # Replace discrete NA data with "-1". Filling a numeric column with a string
      # leaves a mixed str/number column, which LabelEncoder refuses to sort, so the
      # filled column is cast to str. Columns without NA are left untouched and keep
      # the encoding they had before.
      column = column.fillna("-1").astype(str)
    data_df[feat] = LabelEncoder().fit_transform(column)
  return data_df[dense_features+sparse_features]
def split(x):
    """Encode a '|'-separated string as a list of integer ids.

    This is step 1 of preparing a sequence input feature: producing the encoded
    sequences that are later padded (step 2 is describing the feature with a
    VarLenSparseFeat config).

    Ids come from the module-level ``key2index`` dict, which must exist when the
    function is called. A key seen for the first time is registered with the id
    ``len(key2index) + 1``, so ids start at 1: the value 0 is reserved as the
    padding value of sequence inputs and never encodes a valid key.
    """
    return [key2index.setdefault(key, len(key2index) + 1)
            for key in x.split('|')]

In [None]:
# Role of each column in the model input.
columns = data.columns.values          # all column names of the merged frame
target = ['CT']                        # binary label column
dense_features = ["timestamp"]         # continuous inputs (min-max scaled below)
sparse_features = [                    # categorical inputs (label-encoded below)
    "movie_id",
    "user_id",
    "age",
    "occupation",
    "zip",
]

In [None]:
# 1. Integer-encode every sparse feature, then rescale the dense features into [0, 1].
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# Preprocess the sequence feature: encode each '|'-separated genres string as a list
# of ids (split() fills key2index as a side effect), then pad to a common length.
key2index = {}
genres_list = [split(genres) for genres in data['genres'].values]
genres_length = np.array([len(ids) for ids in genres_list])
max_len = genres_length.max()  # longest genre sequence (6 according to the original note)
# Notice: padding='post' appends the padding value after the real ids.
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

In [None]:
 # 2.count #unique features for each sparse field and generate feature config for sequence feature

# Fixed-length inputs: one 4-dim embedding per sparse field (vocabulary size = number
# of distinct encoded values in that column) plus one scalar slot per dense field.
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
                          for feat in sparse_features] + [DenseFeat(feat, 1, )
                                                          for feat in dense_features]

# Variable-length input: the padded genre ids, mean-pooled into one embedding.
# Notice: value 0 is the padding value of the sequence input, hence the "+ 1" on the
# vocabulary size (valid ids start at 1).
varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean')]

# The linear part and the deep part of the models use the same feature set.
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# genres_list is an (n_rows, max_len) matrix. pandas rejects assigning a 2-D array with
# several columns to a single column, so store one padded id array per row instead.
# The column is therefore object-typed: code that needs the matrix form should index
# genres_list directly or np.stack() this column.
data['genres'] = list(genres_list)

## Single Domain Experiment

In [None]:
# Partition the rows by the rater's gender (presumably the two "domains" of the
# experiments below), discarding columns that are not model inputs.
# Note the asymmetry: `female` keeps its 'gender' column while `male` drops it.
female = data.loc[data['gender'] == 'F'].drop(columns=['rating', 'title'])
male = data.loc[data['gender'] == 'M'].drop(columns=['gender', 'rating', 'title'])

In [None]:
# 3.generate input data for model

# Random 80/20 split of the female partition only.
train, test = train_test_split(female, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# The variable-length 'genres' feature must reach the model as an (n_rows, max_len)
# integer matrix, which a single DataFrame column cannot provide. Take the rows
# straight from the padded genres_list matrix instead. This relies on `data` having
# the default 0..n-1 index (as produced by the merge), so index labels are row positions.
train_model_input['genres'] = genres_list[train.index.to_numpy()]
test_model_input['genres'] = genres_list[test.index.to_numpy()]

## Mixed Domain

In [None]:
# Random 60/40 split of each partition; train on both partitions, test on female rows only.
ftr, fte = train_test_split(female, test_size=0.4)
mtr, mte = train_test_split(male, test_size=0.4)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
# (original index labels are kept, columns are aligned by name).
train = pd.concat([ftr, mtr])
test = fte
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# The variable-length 'genres' feature must reach the model as an (n_rows, max_len)
# integer matrix, which a single DataFrame column cannot provide. Take the rows
# straight from the padded genres_list matrix instead. This relies on `data` having
# the default 0..n-1 index (as produced by the merge), so index labels are row positions.
train_model_input['genres'] = genres_list[train.index.to_numpy()]
test_model_input['genres'] = genres_list[test.index.to_numpy()]

# Models

## FM

In [None]:
# Train on the first CUDA device when one is available, otherwise on the CPU.
# The fallback must be 'cpu': 'gpu' is not a device type torch accepts, so the model
# constructor would fail on a machine without CUDA.
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

# NOTE(review): `FM` is neither imported nor defined in the cells visible above
# (only DeepFM, DCN, xDeepFM and WDL are imported) — confirm where it comes from,
# otherwise this line raises NameError on a fresh kernel.
model = FM(linear_feature_columns, dnn_feature_columns,
                device=device)

model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )
# 30 epochs, mini-batches of 64, with a 10% validation split taken from the training data.
fm_history = model.fit(train_model_input,train[target].values,batch_size=64,epochs=30,verbose=1,validation_split=0.1)

# Evaluate on the held-out rows (second argument of predict is the batch size).
pred_ans = model.predict(test_model_input, 64)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

## DeepFM

In [None]:
# Epoch budget and hyper-parameter values.
# NOTE(review): the training cells visible in this notebook pass their own literal
# batch size and epoch count and never read these two names — confirm they are still needed.
epochs = 20

search_space = dict(
    lr=1e-3,
    batch_size=256,
    l2_reg_embedding=5e-6,
    l2_reg_linear=5e-6,
    l2_reg_dnn=5e-6,
    dnn_hidden_units=256,
    dnn_dropout=0.8,
)

In [None]:
# Train on the first CUDA device when one is available, otherwise on the CPU.
# The fallback must be 'cpu': 'gpu' is not a device type torch accepts, so the model
# constructor would fail on a machine without CUDA.
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

# DeepFM for binary classification over the shared linear/deep feature columns.
model = DeepFM(linear_feature_columns, dnn_feature_columns,
                task='binary',device=device)

model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )
# 30 epochs, mini-batches of 64, with a 10% validation split taken from the training data.
deepfm_history = model.fit(train_model_input,train[target].values,batch_size=64,epochs=30,verbose=2,validation_split=0.1)

# Evaluate on the held-out rows (second argument of predict is the batch size).
pred_ans = model.predict(test_model_input, 64)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

## DCN

In [None]:
 # 4.Define Model,train,predict and evaluate

# Pick the training device: the first CUDA GPU when one is usable, otherwise the CPU.
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'
else:
    device = 'cpu'

# DCN for binary classification over the shared linear/deep feature columns.
model = DCN(linear_feature_columns, dnn_feature_columns, task='binary', device=device)

model.compile(
    "adagrad",
    "binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)
# 30 epochs, mini-batches of 64, with a 10% validation split taken from the training data.
dcn_history = model.fit(
    train_model_input,
    train[target].values,
    batch_size=64,
    epochs=30,
    verbose=2,
    validation_split=0.1,
)

# Evaluate on the held-out rows (second argument of predict is the batch size).
pred_ans = model.predict(test_model_input, 64)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

## xDeepFM

In [None]:

# Train on the first CUDA device when one is available, otherwise on the CPU.
# The fallback must be 'cpu': 'gpu' is not a device type torch accepts, so the model
# constructor would fail on a machine without CUDA.
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

# xDeepFM for binary classification over the shared linear/deep feature columns.
xdeepfm_model = xDeepFM(linear_feature_columns, dnn_feature_columns,
                task='binary',
                device=device)

xdeepfm_model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )
# 30 epochs, mini-batches of 64, with a 10% validation split taken from the training data.
xdeepfm_history = xdeepfm_model.fit(train_model_input,train[target].values,batch_size=64,epochs=30 ,verbose=2,validation_split=0.1)

# Evaluate on the held-out rows (second argument of predict is the batch size).
pred_ans = xdeepfm_model.predict(test_model_input, 64)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))