In [1]:
import json
import numpy as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import folium

from pandas import json_normalize

- channelGrouping : 유입 경로
- date : 방문 날짜
- device : 사용자 장치 정보
    - browser
    - browserVersion
    - browserSize
    - operatingSystem
    - operatingSystemVersion
    - isMobile : T/F
    - mobileDeviceBranding
    - mobileDeviceModel
    - mobileInputSelector
    - mobileDeviceInfo
    - mobileDeviceMarketingName
    - flashVersion
    - language
    - screenColors
    - screenResolution
    - deviceCategory
- fullVisitorId : 사용자 고유 식별자
- geoNetwork : 사용자 위치 정보
    - continent
    - subContinent
    - country
    - region
    - metro
    - city
    - cityId
    - networkDomain
    - latitude
    - longitude
    - networkLocation
- sessionId : fullVisitorId + visitId
- socialEngagementType : 참여 유형
- totals : 세션 단위 집계값
    - visits
    - hits
    - pageviews
    - bounces
    - newVisits
    - transactionRevenue
- trafficSource : 유입 트래픽 소스
    - campaign
    - source
    - medium
    - keyword
    - adwordsClickInfo
        - criteriaParameters
    - isTrueDirect
- visitId : 세션(방문) 식별자
- visitNumber : 해당 사용자의 세션 번호
- visitStartTime : 세션 시작 타임스탬프, visitId와 동일

In [2]:
# Columns of the raw CSV whose cells hold JSON objects to be flattened:
# 'device', 'geoNetwork', 'totals', 'trafficSource'.
json_columns = ['device', 'geoNetwork','totals', 'trafficSource']

def load_dataframe(csv_path, columns=None):
    """Read a CSV file and expand its JSON-encoded columns into flat columns.

    Each cell of a JSON column is parsed with ``json.loads`` while the file is
    read, then normalized into one column per (possibly nested) key. The new
    columns are named ``"<column>_<subcolumn>"`` and the original JSON column
    is dropped.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file, passed to ``pd.read_csv``.
    columns : iterable of str, optional
        Names of the JSON-encoded columns to flatten. Defaults to the
        module-level ``json_columns`` list.

    Returns
    -------
    pandas.DataFrame
        The non-JSON columns in file order, followed by the flattened columns
        of each JSON column in the order given by ``columns``.
    """
    if columns is None:
        columns = json_columns
    columns = list(columns)

    # 'fullVisitorId' is forced to string so identifiers are never parsed as numbers.
    df = pd.read_csv(
        csv_path,
        converters={column: json.loads for column in columns},
        dtype={'fullVisitorId': 'str'},
    )

    flattened = []
    for column in columns:
        # Passing a plain list gives the normalized frame a fresh 0..n-1 index,
        # which lines up with the default index produced by read_csv.
        column_as_df = json_normalize(df[column].tolist())
        column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]
        flattened.append(column_as_df)

    # One concat instead of a drop+merge per column avoids recopying the frame each pass.
    return pd.concat([df.drop(columns=columns)] + flattened, axis=1)

In [3]:
# Training CSV location.
# NOTE(review): this is an absolute path on one machine; it will not resolve elsewhere.
TRAIN_CSV_PATH = 'C:/Users/USER/OneDrive/바탕 화면/내일배움캠프 데이터 분석/train.csv/train.csv'
train = load_dataframe(TRAIN_CSV_PATH)

In [4]:
# Preview the first ten rows of the flattened frame.
train.head(n=10)

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device_browser,device_browserVersion,...,trafficSource_adwordsClickInfo.criteriaParameters,trafficSource_isTrueDirect,trafficSource_referralPath,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adContent,trafficSource_campaignCode
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385,Chrome,not available in demo dataset,...,not available in demo dataset,,,,,,,,,
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147,Firefox,not available in demo dataset,...,not available in demo dataset,,,,,,,,,
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386,Chrome,not available in demo dataset,...,not available in demo dataset,,,,,,,,,
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213,UC Browser,not available in demo dataset,...,not available in demo dataset,,,,,,,,,
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600,Chrome,not available in demo dataset,...,not available in demo dataset,True,,,,,,,,
5,Organic Search,20160902,2938943183656635653,2938943183656635653_1472807194,Not Socially Engaged,1472807194,1,1472807194,Chrome,not available in demo dataset,...,not available in demo dataset,,,,,,,,,
6,Organic Search,20160902,1905672039242460897,1905672039242460897_1472817241,Not Socially Engaged,1472817241,1,1472817241,Chrome,not available in demo dataset,...,not available in demo dataset,,,,,,,,,
7,Organic Search,20160902,537222803633850821,537222803633850821_1472812602,Not Socially Engaged,1472812602,1,1472812602,Chrome,not available in demo dataset,...,not available in demo dataset,,,,,,,,,
8,Organic Search,20160902,4445454811831400414,4445454811831400414_1472805784,Not Socially Engaged,1472805784,1,1472805784,Internet Explorer,not available in demo dataset,...,not available in demo dataset,,,,,,,,,
9,Organic Search,20160902,9499785259412240342,9499785259412240342_1472812272,Not Socially Engaged,1472812272,1,1472812272,Firefox,not available in demo dataset,...,not available in demo dataset,,,,,,,,,


In [5]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 55 columns):
 #   Column                                             Non-Null Count   Dtype 
---  ------                                             --------------   ----- 
 0   channelGrouping                                    903653 non-null  object
 1   date                                               903653 non-null  int64 
 2   fullVisitorId                                      903653 non-null  object
 3   sessionId                                          903653 non-null  object
 4   socialEngagementType                               903653 non-null  object
 5   visitId                                            903653 non-null  int64 
 6   visitNumber                                        903653 non-null  int64 
 7   visitStartTime                                     903653 non-null  int64 
 8   device_browser                                     903653 non-null  object
 9   devi

In [6]:
# Gather every column name of the flattened frame into a plain list and print it.
columns = list(train.columns)
print(columns)

['channelGrouping', 'date', 'fullVisitorId', 'sessionId', 'socialEngagementType', 'visitId', 'visitNumber', 'visitStartTime', 'device_browser', 'device_browserVersion', 'device_browserSize', 'device_operatingSystem', 'device_operatingSystemVersion', 'device_isMobile', 'device_mobileDeviceBranding', 'device_mobileDeviceModel', 'device_mobileInputSelector', 'device_mobileDeviceInfo', 'device_mobileDeviceMarketingName', 'device_flashVersion', 'device_language', 'device_screenColors', 'device_screenResolution', 'device_deviceCategory', 'geoNetwork_continent', 'geoNetwork_subContinent', 'geoNetwork_country', 'geoNetwork_region', 'geoNetwork_metro', 'geoNetwork_city', 'geoNetwork_cityId', 'geoNetwork_networkDomain', 'geoNetwork_latitude', 'geoNetwork_longitude', 'geoNetwork_networkLocation', 'totals_visits', 'totals_hits', 'totals_pageviews', 'totals_bounces', 'totals_newVisits', 'totals_transactionRevenue', 'trafficSource_campaign', 'trafficSource_source', 'trafficSource_medium', 'trafficSo

In [7]:
# Distinct values found in the 'channelGrouping' column.
unique_values = train.loc[:, 'channelGrouping'].unique()
print(unique_values)

['Organic Search' 'Referral' 'Paid Search' 'Affiliates' 'Direct' 'Display'
 'Social' '(Other)']


In [8]:
# Distinct values found in the 'device_browser' column.
unique_values = train.loc[:, 'device_browser'].unique()
print(unique_values)

['Chrome' 'Firefox' 'UC Browser' 'Internet Explorer' 'Safari' 'Edge'
 'Opera Mini' 'Opera' 'BlackBerry' 'Safari (in-app)' 'Coc Coc'
 'Mozilla Compatible Agent' 'ADM' 'MRCHROME' 'Amazon Silk' 'YaBrowser'
 'Android Webview' 'Puffin' 'Nokia Browser' 'Maxthon' 'Nintendo Browser'
 'Android Browser' 'Lunascape' 'IE with Chrome Frame' 'ThumbSniper'
 'LYF_LS_4002_12' 'Mozilla' 'osee2unifiedRelease' 'NokiaE52-1' 'Iron'
 '[Use default User-agent string] LIVRENPOCHE' '(not set)'
 'LYF_LS_4002_11' 'M5' 'Android Runtime' 'Apple-iPhone7C2' 'SeaMonkey'
 'Konqueror' 'Seznam' 'Changa 99695759' 'no-ua' 'MQQBrowser' 'Nichrome'
 'HTC802t_TD' 'DASH_JR_3G' 'DoCoMo' 'subjectAgent: NoticiasBoom' 'YE'
 'User Agent' '0' 'Hisense M20-M_LTE' 'Reddit' 'TCL P500M' 'CSM Click']


In [9]:
# Distinct values found in the 'trafficSource_source' column.
unique_values = train.loc[:, 'trafficSource_source'].unique()
print(unique_values)

['google' 'baidu' 'googleweblight.com' 'sites.google.com' 'bing'
 'siliconvalley.about.com' 'analytics.google.com' 'Partners' '(direct)'
 'mall.googleplex.com' 'google.com' 'google.co.in' 'qiita.com' 'google.pl'
 'mail.google.com' 'hangouts.google.com' 'dealspotr.com'
 'optimize.google.com' 'arstechnica.com' 'dfa' 'docs.google.com'
 'sashihara.jp' 'google.gatewaycdi.com' 'phandroid.com' 'images.google'
 'youtube.com' 'l.facebook.com' 'facebook.com' 'quora.com'
 'm.facebook.com' 'reddit.com' 'google.co.jp' 'aol' 'moma.corp.google.com'
 'blog.golang.org' 'yahoo' 't.co' 'nordic.businessinsider.com'
 'duckduckgo.com' 'm.baidu.com' 'tpc.googlesyndication.com'
 'googleads.g.doubleclick.net'
 '0.shared.bow.cat2.ads-bow.vu.borg.google.com:9856' 'online-metrics.com'
 'cv.il3.ub.edu' 'gophergala.com' 'plus.google.com' 's0.2mdn.net'
 'keep.google.com' 'svirkar.mtv.corp.google.com:8888'
 'connect.googleforwork.com' 'google.com.pk' 'support.google.com'
 'gdeals.googleplex.com' 'google.com.au' 'int.

In [10]:
# Count the missing values in each column.
train.isna().sum()

channelGrouping                                           0
date                                                      0
fullVisitorId                                             0
sessionId                                                 0
socialEngagementType                                      0
visitId                                                   0
visitNumber                                               0
visitStartTime                                            0
device_browser                                            0
device_browserVersion                                     0
device_browserSize                                        0
device_operatingSystem                                    0
device_operatingSystemVersion                             0
device_isMobile                                           0
device_mobileDeviceBranding                               0
device_mobileDeviceModel                                  0
device_mobileInputSelector              

In [11]:
# Distinct values found in the 'device_browserSize' column.
unique_values = train.loc[:, 'device_browserSize'].unique()
print(unique_values)

['not available in demo dataset']
