In [1]:
import pandas as pd 
import numpy as np 
import json
import os
import warnings 
warnings.filterwarnings(action='ignore') 

1. Json to csv 
* ActivityField 로그 데이터 중 사용 데이터만 csv로 변환 후 저장

2. activity_merged.csv 생성
* 각 행동유형 별 전처리 
    * 라벨링 (오퍼링, 행동유형)
* 행동유형 
    * ActivityField 로그:'EmailClickThrough', 'EmailOpen', 'WebVisit','FormSubmit'
    * FormSubmit ->  counsel, inquiry, event, service_request, subscribe, survey, register
    * CDO_webinar -> attend

* 컬럼명 통일 후 activity_merged 생성, 저장 

3. EmailSent 생성 ( CNS가 고객에게 보낸 이메일 )
    * 미분류 오퍼링의 경우 손수 라벨링 해주기
    

In [2]:
# 1. ActivityField 로그 데이터 중 사용 데이터만 csv로 변환 후 저장

# 지정한 행동유형에 해당하는 json파일을 불러오고 모두 합친 데이터프레임을 반환. 
def merge_files(filename, path): 
    dir_list = os.listdir(path)
    temp_list = list(filter(lambda x: filename in x, dir_list))
    dataframe = [] 
    for i in range(0, len(temp_list)):
        with open(path + temp_list[i], 'r') as f:
            data = json.load(f)
        df = pd.DataFrame.from_dict(data['items'])
        dataframe.append(df)
    return pd.concat([dataframe[i] for i in range(0,len(temp_list))])

# 각 행동유형 별로 merge_files(filename, path) 이용해서 json파일을 불러오고, 모두 합친 후 
# csv 파일로 타입변환 한 후 저장 
def json_to_csv(activity_list, json_path, csv_path):
    for activity in activity_list:
        temp_df = merge_files(activity, json_path)
        filename = activity + '.csv'
        temp_df.to_csv(csv_path + filename, index = False)


In [3]:
# 2. activityfield_merged 생성: 각 행동유형 별로 라벨링 후 모든 행동유형 합치기

def email_open_click(csv_path):
    email_open = pd.read_csv(csv_path + 'EmailOpen.csv', low_memory = False, usecols =['ContactId','ActivityDate','ActivityType','AssetId','AssetName'])
    email_click = pd.read_csv(csv_path + 'EmailClickThrough.csv', low_memory = False, usecols =['ContactId','ActivityDate','ActivityType','AssetId','AssetName'])
    data = pd.concat([email_open, email_click], axis =0)
    data.AssetName = data.AssetName.apply(lambda x: str(x).replace(' ',''))
    email_category = pd.read_excel('./labelling_category/action_activity.xlsx',sheet_name='EmailSendOpenClickthrough Asset')
    email_category = email_category[['AssetName','cns_action (major)','Offering']].drop_duplicates(subset = ['AssetName','Offering'])
    email_category.AssetName = email_category.AssetName.apply(lambda x: str(x).replace(' ',''))
    email_merged = pd.merge(data, email_category, on = ['AssetName'], how = 'left').drop_duplicates()
    email_merged = email_merged[(email_merged.Offering != 'USELESS') & (email_merged.Offering != 'TEST')]
    email_merged.rename(columns={'ContactId':'ContactID'},inplace=True)
    return email_merged


In [10]:

def webvisit(csv_path): 
    webvisit = pd.read_csv(csv_path + 'WebVisit.csv', low_memory = False)
    cloud = "HRSaaS-NextHR|saas|singlex|AM|Cloud|CloudXper|ProOps|데이터센터|DataCenter|LaunchCenter|cloud|MSCloud|lgamday|launchcenter|amsurvey|lgamday|amevent"
    logi = 'logi|스마트물류|물류|SmartTransportation|smartlogistics|SmartLogistic'
    pub = 'public|Publicity'; blc = 'Blockchain|Monachain'
    smcity = 'smartcity'; infra = 'Infra|infra'
    aidata = 'DAP|챗봇|Chatbot|RPA|AI|AI튜터|마이데이터|aidata|AIDay|RobotService|aibigdata|dap|aiday2021'
    consult = 'entrue|Entrue|consulting|Consulting'; dcx = 'dcx|DCX';unknown = 'S007_Subscription'
    fin = 'Finance|finance'; secu = 'OTSecurity|Security|Summit|security|SecuXperfree'
    dxsol = "DevOn|MSA|Solution|solution|dxsolution|Tuna|TunA|SolutionDay|devonmsa|PerfecTwinSuite|devonncd|FreeTrial"
    general = 'TechDay|lgcns-techday2022|techday|techdaysurvey|Softwave|softwave|SoftWave'
    
    d = {'cloud': cloud, 'infra':infra,'logi': logi, 'pub': pub, 'blc':blc, 'smcity':smcity, 
        'aidata':aidata, 'consult':consult, 'dcx':dcx, 'fin': fin, 
        'secu': secu, 'dxsol':dxsol, 'general':general, 'unknown':unknown}

    webvisit['Offering'] = 0
    offerings = list(d.keys()); word_lists = list(d.values())

    for i in range(len(offerings)):
        offering = offerings[i]; word_list = word_lists[i]
        idx = list(webvisit.FirstPageViewUrl.loc[webvisit.FirstPageViewUrl.str.contains(word_list)].index)
        webvisit['Offering'][idx] = offering

    idx = webvisit[(webvisit.Offering == 0) | (webvisit.Offering == '0')].index
    webvisit.drop(idx)
    webvisit.rename(columns={'ContactId':'ContactID'}, inplace=True)
    cols = ['ContactID', 'ActivityDate', 'ActivityType', 'Offering']
    return webvisit[cols]


def formsubmit(csv_path):
    form = pd.read_csv(csv_path + 'FormSubmit.csv')
    form.AssetName = form.AssetName.apply(lambda x: x.replace(' ',''))
    form_category =  pd.read_excel('./labelling_category/action_activity.xlsx',sheet_name='Formsubmit AssetType 분류')
    form_merged = pd.merge(form[['ActivityDate', 'ContactId', 'AssetId', 'AssetName','CampaignId']],form_category, on=['AssetId','AssetName'],how='left').drop_duplicates()
    form_merged.rename(columns={'ContactId':'ContactID'},inplace=True)
    form_merged = form_merged[(form_merged.Offering != 'USELESS') & (form_merged.Offering != 'TEST')]
    return form_merged


def attend(cdo_path): # CDO_webinar 전처리
    webinar = pd.read_csv(cdo_path + 'CDO_webinar.csv')
    # 참석한 사람만 걸러내기 
    attendees = webinar[webinar['참석여부'] == 'Y'] 
    attendees = attendees.loc[:, ['ContactID','Date Modified','로그인시간', '웨비나명']].drop_duplicates()
    attendees.columns = ['ContactID','Date Modified', 'ActivityDate', 'AssetName']
    idx = attendees[attendees.ActivityDate.isna()].index 
    attendees['ActivityDate'][idx] = attendees['Date Modified'][idx]
    attendees.drop(['Date Modified'], axis=1,inplace=True)
    attendees['ActivityType'] = 'attend'

    # 오퍼링 열 추가
    attendees['Offering'] = 0
    attendees.loc[attendees['AssetName'].str.contains("AM|Cloud|CloudXper|ProOps|데이터센터|LaunchCenter"), 'Offering'] =  'cloud'
    attendees.loc[attendees['AssetName'].str.contains("DevOn|MSA|Solution"), 'Offering'] =  'dxsol'
    attendees.loc[attendees['AssetName'].str.contains("DCX"), 'Offering'] =  'dcx'
    attendees.loc[attendees['AssetName'].str.contains("DAP|챗봇|RPA|AI|AI튜터|마이데이터"), 'Offering'] =  'aidata'
    attendees.loc[attendees['AssetName'].str.contains("Finance"), 'Offering'] =  'fin'
    attendees.loc[attendees['AssetName'].str.contains("OTSecurity|Security|Summit"), 'Offering'] =  'secu'
    attendees.loc[attendees['AssetName'].str.contains("Entrue"), 'Offering'] =  'consult'
    attendees.loc[attendees['AssetName'].str.contains("TechDay"), 'Offering'] =  'general'
    
    return attendees[(attendees.ContactID.notna()) & (attendees.ContactID != '-')]

In [11]:
save_path = '../../data/processed1/'

def create_save_activityfield_merged(save_path, save_option='n'): # 모든 행동유형 합친 뒤 저장 
    csv_path = '../../data/raw_data/ActivityField_csv/'
    cdo_path = '../../data/raw_data/'
    
    email_merged = email_open_click(csv_path)
    web = webvisit(csv_path)
    form = formsubmit(csv_path)
    webinar = attend(cdo_path)
    
    activity_merged = pd.concat([email_merged, web, form, webinar])
    activity_merged.drop_duplicates(inplace = True)
    activity_merged.sort_values(by = ['ContactID', 'ActivityDate'],inplace=True)
    activity_merged.reset_index(drop=True, inplace=True)
    activity_merged = activity_merged[['ContactID', 'ActivityType', 'ActivityDate', 'AssetId', 'AssetName','cns_action (major)', 'Offering']]
    activity_merged = activity_merged[activity_merged.ContactID.notna()]
    activity_merged = activity_merged.replace({'WebVisit':'webvisit'})
    activity_merged = activity_merged[(activity_merged.ActivityType.notna()) &(activity_merged.ActivityType != 'USELESS')]
    
    if save_option == 'y':
        activity_merged.to_csv(save_path + 'activity_merged.csv',index=False)
        print('Merged Activity Logs are saved to path: ',save_path, '\nfilename: activity_merged.csv')
    return activity_merged


In [12]:
save_path = '../../data/processed1/'
def create_save_emailsent(csv_path, save_option='n'):
    send = pd.read_csv(csv_path + 'EmailSend.csv', usecols =['ContactId','ActivityDate','ActivityType','AssetId','AssetName'])
    send.dropna(subset = ['ActivityDate'],axis=0, inplace = True)

    email_category = pd.read_excel('./labelling_category/action_activity.xlsx',sheet_name='EmailSendOpenClickthrough Asset')
    email_category = email_category[['AssetName','cns_action (major)','Offering']].drop_duplicates(subset = ['AssetName','Offering'])
    send.AssetName = send.AssetName.apply(lambda x: str(x).replace(' ',''))
    email_category.AssetName = email_category.AssetName.apply(lambda x: str(x).replace(' ',''))
    merged = pd.merge(send, email_category, on = ['AssetName'], how = 'left').drop_duplicates()
    merged.reset_index(drop=True, inplace=True)
    

    merged_new = merged[(merged.Offering != 'USELESS') & (merged.Offering != 'TEST')]
    merged_new.rename(columns={'ContactId':'ContactID'},inplace=True)
    merged_new.sort_values(by = ['ContactID', 'ActivityDate'],inplace=True)
    merged_new.reset_index(drop = True, inplace=True)
    if save_option == 'y':
        merged_new.to_csv(save_path + 'EmailSent.csv',index=False)
        print('EmailSent file is now saved to path: ', save_path,'\nfilename: EmailSent.csv')
    return merged_new
    #display(merged_new)
    

In [13]:
csv_path = '../../data/raw_data/ActivityField_csv/'
json_path = '../../data/raw_data/ActivityField_json/'
save_path = '../../data/processed1/'

In [14]:
# json to csv 실행 코드 
activity_list = ['EmailClickThrough', 'EmailOpen', 'EmailSend','WebVisit','FormSubmit']
#json_to_csv(activity_list,json_path,  csv_path)

In [15]:
# 코드 실행: activity merged 데이터 만들고 지정 경로에 저장
activity = create_save_activityfield_merged(save_path, 'y')

Merged Activity Logs are saved to path:  ../../data/processed1/ 
filename: activity_merged.csv


In [16]:
# 코드 실행: EmailSent 데이터 만들고 지정 경로에 저장 
sent = create_save_emailsent(csv_path, 'y')

EmailSent file is now saved to path:  ../../data/processed1/ 
filename: EmailSent.csv


In [17]:
activity.shape

(739687, 7)

In [19]:
sent.shape

(903110, 7)