# 쇼핑조 화이팅

## 라이브러리 호출

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Select a font family for matplotlib so Korean text in plots renders
# (the font named below must be available to matplotlib).
from matplotlib import rc
rc('font', family='NanumGothic')

# Suppress all warning messages for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

## 데이터 불러오기

In [3]:
# Read the raw workbook once; `df_origin` is kept as an untouched snapshot
# while `df` is the working frame that the cleaning steps below modify.
df_origin = pd.read_excel('./data/Online Retail.xlsx')
df = df_origin.copy()

### CustomerID가 결측인 행 삭제

In [4]:
# Keep only rows that have a CustomerID, then renumber the index from 0 so
# later positional/label operations work on a contiguous index.
df = df.dropna(subset=['CustomerID']).reset_index(drop=True)

# Snapshot of the frame right after the missing-CustomerID rows were removed.
df_NaN = df.copy()

### 취소된 주문 삭제

In [5]:
# Length of each StockCode when rendered as text.
Stock_len = df['StockCode'].astype(str).str.len()

# A row is removed when its InvoiceNo starts with 'C' AND its StockCode is
# 5 or 6 characters long. Rows with an InvoiceNo starting with 'C' but a
# StockCode of any other length are NOT removed by this step.
invoice_starts_with_c = df['InvoiceNo'].astype(str).str.startswith('C')
code_len_is_5_or_6 = Stock_len.isin([5, 6])
df = df[~(invoice_starts_with_c & code_len_is_5_or_6)]

# Snapshot of the frame after this filter.
df_C = df.copy()

In [6]:
#df = df_C.copy()

### 관리자 정보 제거

In [7]:
# StockCodes that mark bookkeeping entries rather than purchased products.
# All of them are removed in a single pass instead of one scan per code.
NON_PRODUCT_STOCK_CODES = [
    'M',             # Manual: administrative entry, not interpretable
    'D',             # Discount
    'CRUK',          # Cancer Research UK: discount record
    'C2',            # Carriage: shipping fee
    'DOT',           # DOTCOM POSTAGE: shipping fee
    'POST',          # POSTAGE: shipping fee
    'BANK CHARGES',  # Bank Charges: not a purchase
]
df = df.drop(df[df['StockCode'].isin(NON_PRODUCT_STOCK_CODES)].index)

### Description 하나로 통일

In [8]:
# Number of distinct Description values per StockCode before unification.
SC_Dec_count_before = df[['StockCode','Description']].groupby('StockCode').nunique()
print('통일 전',SC_Dec_count_before.value_counts())

# Every StockCode that carries more than one distinct Description is
# rewritten to use the Description of its first row. Using `> 1` rather than
# an explicit list of counts means codes with any number of variants are
# covered, not only those with exactly 2, 3 or 4.
codes_with_variants = SC_Dec_count_before.index[SC_Dec_count_before['Description'] > 1]
for stock_code in codes_with_variants:
    rows_for_code = df['StockCode'] == stock_code  # computed once, used for read and write
    df.loc[rows_for_code, 'Description'] = df.loc[rows_for_code, 'Description'].iloc[0]

# Recount to confirm each StockCode now maps to a single Description.
SC_Dec_count_after = df[['StockCode','Description']].groupby('StockCode').nunique()
print('통일 후',SC_Dec_count_after.value_counts())

통일 전 Description
1              3447
2               196
3                15
4                 2
Name: count, dtype: int64
통일 후 Description
1              3660
Name: count, dtype: int64


### 6자리 StockCode의 마지막자리 알파벳 삭제

In [9]:
# Boolean mask of rows whose StockCode is exactly 6 characters as text.
TF_Stock_6 = df['StockCode'].astype(str).str.len().eq(6)

# Drop the final character of those codes. Note the slice removes the last
# character whatever it is; it does not check that it is a letter.
df.loc[TF_Stock_6, 'StockCode'] = df.loc[TF_Stock_6, 'StockCode'].str[:-1]

# Show how many rows each StockCode has after the change.
df['StockCode'].value_counts()

StockCode
85099    2941
85123    2035
22423    1724
84879    1408
47566    1397
         ... 
90041       1
20667       1
90102       1
77079       1
23843       1
Name: count, Length: 3196, dtype: int64

### StockCode 자릿수 통일

In [10]:
# Replace the two remaining non-5-character StockCodes with 5-digit integers:
# '15056BL' becomes 15056 and 'PADS' becomes 10000.
for old_code, new_code in (('15056BL', 15056), ('PADS', 10000)):
    df.loc[df['StockCode'] == old_code, 'StockCode'] = new_code

In [11]:
# Tally how many rows have a StockCode of each text length.
df['StockCode'].astype(str).map(len).value_counts()

StockCode
5    396379
Name: count, dtype: int64

In [12]:
# Rows whose InvoiceNo starts with 'C' still remain for the former '15056BL'
# code (refund records), so show them and then remove them.
invoice_starts_with_c = df['InvoiceNo'].astype(str).str.startswith('C')
leftover_cancellations = df[invoice_starts_with_c]
display(leftover_cancellations)
df = df.drop(leftover_cancellations.index)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
25232,C539576,15056,EDWARDIAN PARASOL BLACK,-6,2010-12-20 12:25:00,5.95,14911.0,EIRE
101107,C549050,15056,EDWARDIAN PARASOL BLACK,-2,2011-04-06 10:17:00,5.95,13767.0,United Kingdom
155529,C555879,15056,EDWARDIAN PARASOL BLACK,-1,2011-06-07 15:31:00,4.95,14292.0,United Kingdom
203011,C561810,15056,EDWARDIAN PARASOL BLACK,-120,2011-07-29 15:12:00,4.95,14145.0,United Kingdom
212415,C562773,15056,EDWARDIAN PARASOL BLACK,-1,2011-08-09 12:18:00,5.95,17841.0,United Kingdom


## 데이터 확인

In [13]:
# Show the first and last rows of the cleaned frame.
for preview in (df.head(), df.tail()):
    display(preview)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
406824,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
406825,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680.0,France
406826,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
406827,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
406828,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


In [14]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
df.info()

# Convert the identifier columns to integers.
for id_column in ('InvoiceNo', 'StockCode', 'CustomerID'):
    df[id_column] = df[id_column].astype(int)

# Report again to confirm the new dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 396374 entries, 0 to 406828
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    396374 non-null  object        
 1   StockCode    396374 non-null  object        
 2   Description  396374 non-null  object        
 3   Quantity     396374 non-null  int64         
 4   InvoiceDate  396374 non-null  datetime64[ns]
 5   UnitPrice    396374 non-null  float64       
 6   CustomerID   396374 non-null  float64       
 7   Country      396374 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 396374 entries, 0 to 406828
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    396374 non-null  int32         
 1   StockCode    396374 non-null  int32         
 2 

In [15]:
# Show summary statistics for the cleaned frame as the cell's output.
df.describe()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID
count,396374.0,396374.0,396374.0,396374,396374.0,396374.0
mean,560618.913239,30762.967889,13.046353,2011-07-11 00:04:48.847805440,2.867954,15301.603824
min,536365.0,10000.0,1.0,2010-12-01 08:26:00,0.0,12346.0
25%,549235.0,22027.0,2.0,2011-04-07 11:16:00,1.25,13975.0
50%,561893.0,22666.0,6.0,2011-07-31 14:39:00,1.95,15159.0
75%,572092.0,23271.0,12.0,2011-10-20 14:41:00,3.75,16803.0
max,581587.0,90214.0,80995.0,2011-12-09 12:50:00,649.5,18287.0
std,13106.400653,20606.305554,180.732689,,4.264554,1709.976485


In [16]:
#df.to_csv('./data/Online Retail_df.csv', index = False)