In [5]:
def gen_profile_feas(data, k, profile_path="../data/raw/data_set_phase1/profiles.csv"):
    """
    Add decomposed PID-specific features to data.

    The profile table is read from `profile_path`, all of its columns except
    'pid' are reduced to `k` components with a truncated SVD, and the
    resulting components are merged onto `data` by 'pid'.

    Args:
        - data (pandas)      : DF we want to add the k-dimensional main
                               components of the decomposed PID-specific
                               features to.
                               Needs a column named "pid" [with the Personal IDs]
        - k (integer)        : Amount of dimensions we decompose the profile
                               features to. Must be at least 1 and smaller
                               than the number of non-'pid' profile columns.
        - profile_path (str) : CSV file holding one 'pid' column plus the
                               PID-specific feature columns.

    Res:
        - data (pandas) : new DF with 'k' extra columns:
                           "svd_fea_0", ..., "svd_fea_<k-1>"
                           holding the values of the k main components.
                           The merge is an inner join, so rows of `data`
                           whose 'pid' does not occur in the profile table
                           (including missing pids) are dropped.

    Raises:
        ValueError : if `data` has no 'pid' column, or if `k` is smaller
                     than 1 or not smaller than the number of profile
                     feature columns.
    """
    from sklearn.decomposition import TruncatedSVD

    # Input check for data:
    if "pid" not in data.columns:
        raise ValueError("Data doesn't have a 'pid' column")

    # read in the profile data
    profile_data = pd.read_csv(profile_path)

    # subset all PID specific features:
    x = profile_data.drop(['pid'], axis=1).values

    # check whether k is meaningful    [smaller than #feature cols in profiles]
    n_profile_cols = x.shape[1]
    if k < 1:
        raise ValueError("k needs to be at least 1")
    if k >= n_profile_cols:
        # One formatted string: passing several args to ValueError would
        # render the message as a tuple.
        raise ValueError("k needs to be smaller than {}".format(n_profile_cols))

    # linear dimensionality reduction by means of truncated
    # singular value decomposition (SVD); fixed seed keeps runs reproducible
    svd = TruncatedSVD(n_components=k, n_iter=20, random_state=2019)
    svd_x = svd.fit_transform(x)

    # Save the decomposed PID features in a DF, one column per component
    svd_feas = pd.DataFrame(
        svd_x,
        columns=['svd_fea_{}'.format(i) for i in range(k)],
    )

    # add pid to svd_feas, so we can merge it to the original DF!
    svd_feas['pid'] = profile_data['pid'].values

    # merge returns a new frame; the caller's `data` is left untouched
    return data.merge(svd_feas, on='pid', how='inner')

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the processed training frame.
# NOTE(review): unpickling can execute arbitrary code - only load pickles
# produced by this project's own pipeline.
df_train = pd.read_pickle('../data/processed/df_train.pickle')

In [6]:
# Add 20 SVD components of the profile features to the training rows
# (merged on 'pid').
df = gen_profile_feas(df_train, 20)

In [7]:
# Preview the first rows of the merged training frame.
df.head()

Unnamed: 0,sid,click_time,click_mode,distance_plan,eta,price,transport_mode,plan_time,pid,req_time,...,svd_fea_10,svd_fea_11,svd_fea_12,svd_fea_13,svd_fea_14,svd_fea_15,svd_fea_16,svd_fea_17,svd_fea_18,svd_fea_19
0,2848914,2018-11-17 18:42:17,1,53156,6456,700.0,1,2018-11-17 12:56:15,101804.0,2018-11-17 12:56:15,...,-0.087286,-0.860515,0.570766,-0.304735,-0.091801,0.279651,-0.175971,0.929617,0.007845,-0.475758
1,2848914,2018-11-17 18:42:17,1,48112,3535,700.0,3,2018-11-17 12:56:15,101804.0,2018-11-17 12:56:15,...,-0.087286,-0.860515,0.570766,-0.304735,-0.091801,0.279651,-0.175971,0.929617,0.007845,-0.475758
2,2848914,2018-11-17 18:42:17,1,48112,3655,16500.0,4,2018-11-17 12:56:15,101804.0,2018-11-17 12:56:15,...,-0.087286,-0.860515,0.570766,-0.304735,-0.091801,0.279651,-0.175971,0.929617,0.007845,-0.475758
3,2848914,2018-11-17 18:42:17,1,51641,8871,1200.0,1,2018-11-17 12:56:15,101804.0,2018-11-17 12:56:15,...,-0.087286,-0.860515,0.570766,-0.304735,-0.091801,0.279651,-0.175971,0.929617,0.007845,-0.475758
4,2848913,2018-11-17 22:25:29,2,10634,2355,2000.0,10,2018-11-17 22:25:16,101804.0,2018-11-17 22:25:16,...,-0.087286,-0.860515,0.570766,-0.304735,-0.091801,0.279651,-0.175971,0.929617,0.007845,-0.475758


In [9]:
def create_svm_file(df, features_X, path):
    """
    Save the dataframe as a libsvm (svmlight) file with query ids.

    Rows are ordered by 'sid' so that all rows sharing a 'sid' are contiguous
    in the written file, and 'sid' is stored as the query id of each row.

    Args:
        - df (pandas)       : DF with a 'sid' column and every column named
                              in `features_X`. If it also has 'click_mode',
                              'transport_mode' is needed to build the target.
        - features_X (list) : column names written as features
        - path (str)        : destination of the libsvm file

    Res:
        - X (pandas) : the feature columns, ordered by 'sid'
        - y (pandas) : the target, in the same row order:
                       1 where click_mode == transport_mode, else 0;
                       all 0 when `df` has no 'click_mode' column

    The caller's `df` is not modified.
    """
    from sklearn.datasets import dump_svmlight_file

    # sort_values without inplace returns a new frame, so the DataFrame
    # handed in by the caller keeps its original row order.
    ordered = df.sort_values("sid")

    # Create ranking target
    if 'click_mode' in ordered.columns:
        # vectorized comparison; also well defined for an empty frame
        target = (ordered["click_mode"] == ordered["transport_mode"]).astype(int)
    else:
        target = 0
    ordered = ordered.assign(target=target)

    X = ordered[features_X]
    y = ordered["target"]
    query_id = ordered["sid"]

    # Dump SVM file (zero_based=False: feature indices in the file start at 1)
    dump_svmlight_file(X=X, y=y, f=path, query_id=query_id, zero_based=False)
    return X, y

In [10]:
# Inspect the column names of the merged training frame.
df.columns

Index(['sid', 'click_time', 'click_mode', 'distance_plan', 'eta', 'price',
       'transport_mode', 'plan_time', 'pid', 'req_time', 'o_long', 'o_lat',
       'd_long', 'd_lat', 'distance_query', 'svd_fea_0', 'svd_fea_1',
       'svd_fea_2', 'svd_fea_3', 'svd_fea_4', 'svd_fea_5', 'svd_fea_6',
       'svd_fea_7', 'svd_fea_8', 'svd_fea_9', 'svd_fea_10', 'svd_fea_11',
       'svd_fea_12', 'svd_fea_13', 'svd_fea_14', 'svd_fea_15', 'svd_fea_16',
       'svd_fea_17', 'svd_fea_18', 'svd_fea_19'],
      dtype='object')

In [11]:
# Plan/query level columns, followed by the 20 SVD profile components
# ('svd_fea_0' ... 'svd_fea_19').
features = [
    'transport_mode',
    'distance_plan',
    'eta',
    'price',
    'distance_query',
] + ['svd_fea_{}'.format(component) for component in range(20)]

In [16]:
# Write the training rows as a libsvm file; X holds the selected feature
# columns and y the 0/1 target.
X, y = create_svm_file(df, features, '../data/interim/train_pid.libsvm')

In [13]:
# Load the processed test frame.
# NOTE(review): unpickling can execute arbitrary code - only load pickles
# produced by this project's own pipeline.
df_test = pd.read_pickle('../data/processed/df_test.pickle')

In [14]:
# Add the 20 SVD profile components to the *test* rows. This must be fed
# df_test: passing df_train here would write training rows to the test file.
# NOTE(review): gen_profile_feas merges with an inner join, so test rows whose
# pid has no profile entry are dropped - confirm that is acceptable.
df_test_pid = gen_profile_feas(df_test, 20)

In [17]:
# Write df_test_pid as a libsvm file.
# NOTE(review): this rebinds X and y, replacing the values returned for the
# training file.
X, y = create_svm_file(df_test_pid, features, '../data/interim/test_pid.libsvm')