# 라이브러리 선언

In [1]:
import pandas as pd
import numpy as np

In [2]:
selloutData = pd.read_csv("../dataset/kopo_channel_seasonality_new.csv")

In [3]:
customerData = pd.read_csv("../dataset/customerdata.csv")

# 1. 데이터 조작하기

#### 1-1. 분석 할 데이터의 칼럼 조회하는 방법

In [4]:
selloutData[["REGIONID","PRODUCT"]].head(3)

Unnamed: 0,REGIONID,PRODUCT
0,A60,PRODUCT4
1,A60,PRODUCT59
2,A60,PRODUCT34


#### 1-2. 데이터프레임 생성(데이터를 새로운 변수에 저장하기)

In [5]:
# Store the data in a new variable as an independent copy.
# pd.DataFrame(existing_frame) does not copy by default, so the new variable
# could share data with selloutData; .copy() keeps later edits to one frame
# from showing up in the other.
selloutData2 = selloutData.copy()
selloutData2.head(3)

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY
0,A60,PRODUCT4,201402,71.0
1,A60,PRODUCT59,201402,22275.0
2,A60,PRODUCT34,201402,4463.0


#### 1-3. 데이터를 조건을 설정하여 조회하기 ( ※ inplace 적용이 안되기 때문에 조회를 한 후 변수에 대입하여야한다.)

In [6]:
condition1 = customerData[(customerData.EMI==3) & (customerData.DEVICECOUNT>5)]
condition1.head(3)

Unnamed: 0,CUSTID,AVERAGEPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE
0,A13566,4273.9,3,6.4,1.679181,Big-Screen-lover
27,F17131,3179.708359,3,6.5,1.1,Early-bird
33,G16437,3787.328898,3,6.7,1.0,Early-bird


In [7]:
condition2 = customerData.query('EMI == 3 & DEVICECOUNT > 5')
condition2.head(3)

Unnamed: 0,CUSTID,AVERAGEPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE
0,A13566,4273.9,3,6.4,1.679181,Big-Screen-lover
27,F17131,3179.708359,3,6.5,1.1,Early-bird
33,G16437,3787.328898,3,6.7,1.0,Early-bird


#### 1-4. 데이터프레임 타입변환

##### - 데이터프레임 칼럼의 자료형을 확인할 때 : 데이터프레임명.dtypes

In [8]:
customerData.dtypes

CUSTID           object
AVERAGEPRICE    float64
EMI               int64
DEVICECOUNT     float64
PRODUCTAGE      float64
CUSTTYPE         object
dtype: object

##### - 데이터프레임의 자료형 변환

In [9]:
customerData["DEVICECOUNT"] = customerData["DEVICECOUNT"].astype(float)

In [10]:
customerData.dtypes  #object타입은 string타입을 의미한다

CUSTID           object
AVERAGEPRICE    float64
EMI               int64
DEVICECOUNT     float64
PRODUCTAGE      float64
CUSTTYPE         object
dtype: object

### 실습 문제 1

#### 문제 1-1. kopo_customerdata 데이터에서 gender == male 이면서 email > 0 인 데이터를 추출하시오

##### - 데이터프레임에 데이터 저장

In [11]:
kopo_customerData = pd.read_csv("../dataset/kopo_customerdata.csv")
kopo_customerData.head()

Unnamed: 0,CUSTOMERCODE,STATENAME,ST,GENDER,DOB,GENDER1,EMAIL,FEST_CNT,TOTAL_AMOUNT,AC_AMOUNT,AV_AMOUNT,HA_AMOUNT,EMI_CE_AMOUNT,IS_HIGHEND,CNT_VIST,GAP_VISIT
0,1503989,State2,2,Male,0,1,1,0,30300,0,30300,0,0,0,1,0
1,1190338,State2,2,Male,0,1,0,1,46500,0,46500,0,0,0,1,0
2,1424715,State2,2,Female,0,2,0,0,22700,0,0,22700,0,0,1,0
3,2483305,State2,2,Male,0,1,0,1,31400,0,31400,0,0,0,1,0
4,1178802,State2,2,Male,0,1,0,1,44900,0,0,44900,0,0,1,0


##### - 추출한 데이터 중 필요한 데이터를 정제

In [12]:
kopo_customerData["GENDER"] = kopo_customerData.GENDER.astype("str")

In [13]:
kopo_customerData = kopo_customerData[(kopo_customerData.GENDER.str[0:4] == "Male") & \
                                (kopo_customerData.EMAIL > 0)]

In [14]:
kopo_customerData.head(3)

Unnamed: 0,CUSTOMERCODE,STATENAME,ST,GENDER,DOB,GENDER1,EMAIL,FEST_CNT,TOTAL_AMOUNT,AC_AMOUNT,AV_AMOUNT,HA_AMOUNT,EMI_CE_AMOUNT,IS_HIGHEND,CNT_VIST,GAP_VISIT
0,1503989,State2,2,Male,0,1,1,0,30300,0,30300,0,0,0,1,0
16,1154228,State2,2,Male,0,1,1,0,18200,0,0,18200,0,0,1,0
81,393246,State2,2,Male,0,1,1,1,18200,0,0,18200,0,0,1,0


#### 문제 1-2. kopo_product_volume 데이터에서 PRODUCTGROUP == ST0002 이면서 VOLUME > 300000 데이터를 추출하시오

##### -데이터프레임에 데이터 저장

In [15]:
kopo_product_volume_data = pd.read_csv("../dataset/kopo_product_volume.csv")
kopo_product_volume_data.head(3)

Unnamed: 0,REGIONID,PRODUCTGROUP,YEARWEEK,VOLUME
0,A01,ST0001,201415,810144
1,A01,ST0002,201415,128999
2,A01,ST0001,201418,671464


##### - 추출한 데이터 중 필요한 데이터를 정제

In [16]:
kopo_product_volume_data = kopo_product_volume_data[(kopo_product_volume_data.PRODUCTGROUP == "ST0002") & \
                                                     (kopo_product_volume_data.VOLUME > 300000)]

In [17]:
kopo_product_volume_data.head(5)

Unnamed: 0,REGIONID,PRODUCTGROUP,YEARWEEK,VOLUME
80,A01,ST0002,201452,422468
86,A01,ST0002,201451,356271
90,A01,ST0002,201449,669711
94,A01,ST0002,201447,378339
102,A01,ST0002,201448,1400339


### 실습문제 2

#### 문제 2-1. [데이터타입 통합] kopo_channel_seasonality_new.csv 자료에서 QTY컬럼 -> 실수(float), 이외컬럼 ->문자(str)로 변경하시오

##### - 위에서 불러왔던 데이터 사용, 출력하여 컬럼 확인

In [18]:
selloutData.head(0)

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY


In [19]:
selloutData.dtypes

REGIONID     object
PRODUCT      object
YEARWEEK      int64
QTY         float64
dtype: object

In [20]:
selloutData["REGIONID"] = selloutData["REGIONID"].astype(str)
selloutData["PRODUCT"] = selloutData["PRODUCT"].astype(str)
selloutData["YEARWEEK"] = selloutData["YEARWEEK"].astype(str)
selloutData["QTY"] = selloutData["QTY"].astype(float)

In [21]:
selloutData.dtypes

REGIONID     object
PRODUCT      object
YEARWEEK     object
QTY         float64
dtype: object

#### 2-1. 원하는 컬럼 조회하기

In [22]:
# how to?
# 데이터 프레임명 [['컬럼명1','컬럼명2']]
# 데이터 프레임명.filter(items=['컬럼명1','컬럼명2'])

In [23]:
customerData.head(0)

Unnamed: 0,CUSTID,AVERAGEPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE


##### - 데이터프레임 리스트 활용하기

In [24]:
customerData_ColumnTest = customerData[['CUSTID','EMI']]
customerData_ColumnTest.head(3)

Unnamed: 0,CUSTID,EMI
0,A13566,3
1,A14219,2
2,A15312,2


##### - filter 함수 활용하기

In [25]:
customerData_ColumnTest2 = customerData.filter(items=['CUSTID','EMI'])
customerData_ColumnTest2.head(3)

Unnamed: 0,CUSTID,EMI
0,A13566,3
1,A14219,2
2,A15312,2


### 실습문제 3

#### 문제 3-1. kopo_customerdata 테이블에서 customercode, statename, gender, total_amount 컬럼만 추출하세요

##### - 데이터를 로드하여 컬럼 확인

In [26]:
kopo_customerData.head(0)

Unnamed: 0,CUSTOMERCODE,STATENAME,ST,GENDER,DOB,GENDER1,EMAIL,FEST_CNT,TOTAL_AMOUNT,AC_AMOUNT,AV_AMOUNT,HA_AMOUNT,EMI_CE_AMOUNT,IS_HIGHEND,CNT_VIST,GAP_VISIT


##### - 원하는 컬럼 재정의하여 데이터프레임으로 추출

In [27]:
kopo_customerData_NeedColumn = kopo_customerData[["CUSTOMERCODE","STATENAME","GENDER","TOTAL_AMOUNT"]]

In [28]:
kopo_customerData_NeedColumn.head(3)

Unnamed: 0,CUSTOMERCODE,STATENAME,GENDER,TOTAL_AMOUNT
0,1503989,State2,Male,30300
16,1154228,State2,Male,18200
81,393246,State2,Male,18200


#### 3-1. 인덱스를 활용하여 조회하기 (iloc)

In [29]:
# how to?
# 데이터프레임명.iloc[ : , : ]
# 콤마(,)를 기준으로 앞은 행 인덱스 뒤는 열인덱스 ( : ) 만 활용 시 전체
# [n1:n2]는 n1부터 n2-1까지 (끝 인덱스 n2는 포함되지 않음)
# [n1,n2]는 n1과 n2만

In [30]:
dfc1 = customerData.iloc[0:1,:]
dfc1

Unnamed: 0,CUSTID,AVERAGEPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE
0,A13566,4273.9,3,6.4,1.679181,Big-Screen-lover


### 실습문제 4

#### 문제 4-1. kopo_customerdata 테이블에서 0~5 번째 행의 statename, gender를 선택하세요(인덱스 활용)

In [31]:
kopo_customerData.head(10)

Unnamed: 0,CUSTOMERCODE,STATENAME,ST,GENDER,DOB,GENDER1,EMAIL,FEST_CNT,TOTAL_AMOUNT,AC_AMOUNT,AV_AMOUNT,HA_AMOUNT,EMI_CE_AMOUNT,IS_HIGHEND,CNT_VIST,GAP_VISIT
0,1503989,State2,2,Male,0,1,1,0,30300,0,30300,0,0,0,1,0
16,1154228,State2,2,Male,0,1,1,0,18200,0,0,18200,0,0,1,0
81,393246,State2,2,Male,0,1,1,1,18200,0,0,18200,0,0,1,0
206,2532568,State2,2,Male,1,1,1,0,48000,48000,0,0,0,0,1,0
216,487928,State2,2,Male,0,1,1,0,72300,0,27900,44400,0,0,1,0
226,2452240,State2,2,Male,1,1,1,1,22500,0,0,22500,0,0,1,0
299,1295854,State2,2,Male,1,1,1,2,59900,0,25100,34800,0,0,4,655
308,554757,State2,2,Male,0,1,1,0,22100,0,0,22100,0,0,1,0
330,761018,State2,2,Male,1,1,1,0,21200,0,0,21200,0,0,1,0
367,2191152,State2,2,Male,0,1,1,1,55700,0,55700,0,0,0,1,0


In [32]:
kopo_customerData_indexTest = kopo_customerData.iloc[0:5,[1,3]]
kopo_customerData_indexTest

Unnamed: 0,STATENAME,GENDER
0,State2,Male
16,State2,Male
81,State2,Male
206,State2,Male
216,State2,Male


#### 3-2. 인덱스를 활용하여 조회하기 (loc)

In [33]:
# how to?
# 데이터프레임명.loc[:,["COLUMNS","COLUMNS"]]
# 앞에는 행 인덱스 뒤에는 열의 STRING 이름을 입력
# 열은 리스트(["COL1","COL2"])로 나열하거나 라벨 범위("COL1":"COL3", 끝 라벨 포함)로 지정할 수 있음

In [34]:
customerData.head(0)

Unnamed: 0,CUSTID,AVERAGEPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE


In [35]:
dfc2 = customerData.loc[0:2,["CUSTID","AVERAGEPRICE","DEVICECOUNT"]]
dfc2

Unnamed: 0,CUSTID,AVERAGEPRICE,DEVICECOUNT
0,A13566,4273.9,6.4
1,A14219,3642.44195,4.0
2,A15312,3653.884565,5.0


#### 4-1. 이상 데이터 정제하기

In [36]:
# how to?
# 데이터프레임명['컬럼명'] = np.where(조건식, 조건식에 충족한다면 들어갈 값, 충족하지 않는다면 들어갈 값)

In [37]:
customerData["PRODUCTAGE_NEW"] = np.where(customerData["PRODUCTAGE"]<1,1,customerData["PRODUCTAGE"])
customerData.head(3)

Unnamed: 0,CUSTID,AVERAGEPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW
0,A13566,4273.9,3,6.4,1.679181,Big-Screen-lover,1.679181
1,A14219,3642.44195,2,4.0,2.682023,Sleeping-dog,2.682023
2,A15312,3653.884565,2,5.0,3.208202,Sleeping-dog,3.208202


### 실습문제 5

#### 문제 5-1. PRODUCTAGE가 1보다 작으면 1, 2보다 작으면 2, 3보다 작으면 3, 이외는 5라고 새로운 컬럼 PRODUCTAGE_NEW를 생성하시오

In [38]:
# Bucket PRODUCTAGE by the first upper bound it is strictly below:
# <1 -> 1, <2 -> 2, <3 -> 3; rows matching none of the conditions get 5.
# np.select uses the first condition that is True for each row, which is the
# same precedence the nested np.where chain expressed.
customerData["PRODUCTAGE_NEW"] = np.select(
    [customerData["PRODUCTAGE"] < bound for bound in (1, 2, 3)],
    [1, 2, 3],
    default=5,
)
customerData.head()

Unnamed: 0,CUSTID,AVERAGEPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW
0,A13566,4273.9,3,6.4,1.679181,Big-Screen-lover,2
1,A14219,3642.44195,2,4.0,2.682023,Sleeping-dog,3
2,A15312,3653.884565,2,5.0,3.208202,Sleeping-dog,5
3,A16605,3713.211107,2,6.6,0.9,Early-bird,1
4,B10634,3391.074215,2,4.2,2.453656,Sleeping-dog,3


### 실습문제 6

#### 문제 6-1. [불량 데이터 처리] kopo_channel_seasonality_new.csv 자료에서 QTY 마이너스 값을 0으로 변경한 자료를 QTY_NEW 컬럼에 생성하세요

In [39]:
selloutData["QTY_NEW"] = np.where(selloutData["QTY"]<0,0,selloutData["QTY"])

In [40]:
len(selloutData)

124658

#### 문제 6-2. [불량 데이터 처리] kopo_channel_seasonality_new.csv 자료에서 YEARWEEK를 YEAR, WEEK 컬럼으로 분리한 뒤 WEEK가 52 이하인 데이터만 남기시오 (53주차 제거)

In [41]:
selloutData["YEAR"] = selloutData.YEARWEEK.astype(str).str[0:4]
selloutData.head(1)

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,QTY_NEW,YEAR
0,A60,PRODUCT4,201402,71.0,71.0,2014


In [42]:
selloutData["WEEK"] = selloutData.YEARWEEK.astype(str).str[4:]
selloutData.head(1)

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,QTY_NEW,YEAR,WEEK
0,A60,PRODUCT4,201402,71.0,71.0,2014,2


In [43]:
refineSelloutData = selloutData[selloutData.WEEK.astype(int) <= 52]
refineSelloutData.head()

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,QTY_NEW,YEAR,WEEK
0,A60,PRODUCT4,201402,71.0,71.0,2014,2
1,A60,PRODUCT59,201402,22275.0,22275.0,2014,2
2,A60,PRODUCT34,201402,4463.0,4463.0,2014,2
3,A60,PRODUCT47,201402,0.0,0.0,2014,2
4,A60,PRODUCT56,201402,23.0,23.0,2014,2


In [44]:
len(refineSelloutData)

123864

#### 4-2. 이상 데이터 정제하기

In [46]:
# 함수 활용하여 이상 데이터 정제

In [50]:
def refining(df):
    """Map a record's PRODUCTAGE onto the bucket labels 1, 2, 3 or 5.

    Args:
        df: A single record that supports ``df['PRODUCTAGE']`` lookup. The
            notebook passes one row at a time via ``DataFrame.apply`` with
            ``axis=1``, so this is a row rather than a whole frame.

    Returns:
        The first of 1, 2, 3 that PRODUCTAGE is strictly less than, or 5
        when it is less than none of them.
    """
    age = df['PRODUCTAGE']
    # Bounds are checked in ascending order; each bound doubles as its label.
    for upper_bound in (1, 2, 3):
        if age < upper_bound:
            return upper_bound
    return 5

customerData["PRODUCTAGE_NEW2"] = customerData.apply(refining, axis=1)

In [51]:
customerData.head()

Unnamed: 0,CUSTID,AVERAGEPRICE,EMI,DEVICECOUNT,PRODUCTAGE,CUSTTYPE,PRODUCTAGE_NEW,PRODUCTAGE_NEW2
0,A13566,4273.9,3,6.4,1.679181,Big-Screen-lover,2,2
1,A14219,3642.44195,2,4.0,2.682023,Sleeping-dog,3,3
2,A15312,3653.884565,2,5.0,3.208202,Sleeping-dog,5,5
3,A16605,3713.211107,2,6.6,0.9,Early-bird,1,1
4,B10634,3391.074215,2,4.2,2.453656,Sleeping-dog,3,3
