# Pre-Processing Real Data 

This is an example of how the real data was processed.

In [4]:
import pandas as pd

In [None]:
# Example placeholders: replace the paths with the real raw-data exports.
rawdata1 = pd.read_parquet('raw_data_file1.parquet')
# The second file gets its own name; the concatenation step below reads
# both rawdata1 and rawdata2.
rawdata2 = pd.read_parquet('raw_data_file2.parquet')

In [None]:
# Stack both raw frames, then apply the row filters one after another:
#   1. drop rows whose user is the literal string 'None' (treated as bots),
#   2. keep rows whose 'features' container holds more than one element,
#   3. keep only results ranked 25 or better.
# Each filter only sees the rows that survived the previous one.
df = (
    pd.concat([rawdata1, rawdata2])
    .loc[lambda frame: frame['user'] != 'None']
    .loc[lambda frame: frame['features'].apply(len) > 1]
    .loc[lambda frame: frame['rank'] <= 25]
)

In [None]:
# Per-row outputs, filled in row order so they line up with df afterwards.
ids = []
time = []
time_minute = []

# Build one combined-characteristics key and two truncated time keys per row.
# iterrows() yields (index label, row Series); the label is not needed.
for _, record in df.iterrows():
    # Join feature positions 1 through 8 with '-'; position 0 is not used.
    features = record['features']
    ids.append('-'.join(str(features[pos]) for pos in range(1, 9)))

    # Truncated time keys: "hour-day-month" and "minute-hour-day-month".
    stamp = record['time']
    hour_key = f"{stamp.hour}-{stamp.day}-{stamp.month}"
    time.append(hour_key)
    time_minute.append(f"{stamp.minute}-{hour_key}")

In [None]:
# Attach the per-row lists as new columns, in the same order as before.
for column, values in (
    ('embedding', ids),
    ('time_rounded', time),
    ('time_rounded_min', time_minute),
):
    df[column] = values

In [None]:
# Number sessions starting at 1: rows sharing the same user, query and
# timestamp get the same session id; sort=False keeps groups in the order
# they first appear.
# NOTE(review): this reads 'query_short', while the qd_id step further down
# reads 'query_shortened' — confirm which column name the data really has.
# NOTE(review): grouping uses the raw 'time' column, not the truncated
# 'time_rounded' keys — confirm that is intended.
session_keys = ['user', 'query_short', 'time']
df['session'] = df.groupby(by=session_keys, sort=False).ngroup() + 1

In [None]:
# Load the file with all applies/views per user.
# pandas has no generic `pd.read`; parquet files are read with read_parquet.
user_data = pd.read_parquet('user_data.parquet')

In [None]:
# Count 'view' actions per user, most active users first.
user_action_view = (
    user_data.loc[user_data["action_type"] == 'view', "user"]
    .value_counts()
    .sort_values(ascending=False)
)

# Summary statistics used to decide on a cut-off.
print(user_action_view.head())
print(user_action_view.describe())
print("MODE: ", user_action_view.mode())
print("MEDIAN: ", user_action_view.median())

cut_off_views = 99999999  # placeholder

# Users that viewed more than the cut-off number of job listings; saved for
# the bot-removal step. The comparison must use the name defined just above.
sus_users_many_views = user_action_view[user_action_view > cut_off_views]

In [None]:
# Count 'apply' actions per user, most active users first.
# (The filter must be 'apply' here; filtering on 'view' would just repeat
# the view counts.)
user_action_apply = (
    user_data.loc[user_data["action_type"] == 'apply', "user"]
    .value_counts()
    .sort_values(ascending=False)
)

# Summary statistics used to decide on a cut-off.
print(user_action_apply.head())
print(user_action_apply.describe())
print("MODE: ", user_action_apply.mode())
print("MEDIAN: ", user_action_apply.median())

cut_off_applies = 99999999  # placeholder

# Users that applied to more than the cut-off number of jobs; saved for the
# bot-removal step.
sus_users_many_applies = user_action_apply[user_action_apply > cut_off_applies]

In [None]:
# Merge both suspicious-user lists into one set, then drop every row that
# belongs to one of those users.
bots = set(sus_users_many_views.index.tolist()).union(
    sus_users_many_applies.index.tolist()
)
df = df.loc[~df['user'].isin(bots)]

In [None]:
# Recode apply/view to 1, anything else to 0.
# Done with pandas only: the boolean membership mask is cast to integers,
# so this step does not rely on numpy being imported.
df['clicks'] = df['action_type'].isin(["view", "apply"]).astype(int)

In [None]:
# Filter out sessions with no applies or views.
# Sum the 0/1 'clicks' indicator per session; 'action_type' holds labels,
# not counts, so it cannot be summed and compared against 1.
clicks_session = df.groupby('session')['clicks'].sum() < 1
# Session ids whose click total is zero.
no_clicks = clicks_session.index[clicks_session.values]
df = df[~df['session'].isin(no_clicks)]

In [None]:
# Recode so rank starts at 0.
# Not idempotent: re-running this cell shifts the ranks down again.
df['rank'] = df['rank'] - 1

# Assign query-document ids of the form "<query_shortened>_<embedding>",
# stored as a categorical column. The id is built from the two columns it
# actually reads, and the result is attached to df itself.
df = df.assign(
    qd_id=(df['query_shortened'] + "_" + df['embedding']).astype("category")
)

In [None]:
# Collapse repeats within the same hour: for each combination of hour-level
# time key, user, rank, query-document id and click flag, keep only the
# first row.
dedup_columns = ['time_rounded', 'user', 'rank', 'qd_id', 'clicks']
df = df.drop_duplicates(subset=dedup_columns)