### < 모델 생성 후 고려해볼 것들>
1. 모델을 여러가지 사용해보기 ex) 앙상블, XGBoost, Light GBM, LSTM
2. 이상치를 중앙값으로 할지, 최대치로 할지
3. 시계열 데이터를 어떻게 처리할지 ex) 4분기, 2분기 등
4. 판매량, 판매 가격의 데이터 쏠림을 전처리 할지 말지
-> 팀당 하루에 5번 제출 가능

In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# 1. 데이터 불러오기

In [4]:
data_dir = os.getenv('HOME') + '/kaggle/predict_future_sales/data'

items_path = os.path.join(data_dir, 'items.csv')
item_category_path = os.path.join(data_dir, 'item_categories.csv')
shops_path = os.path.join(data_dir, 'shops.csv')
train_path = os.path.join(data_dir, 'sales_train.csv')
test_path = os.path.join(data_dir, 'test.csv')

In [5]:
items = pd.read_csv(items_path)
categories = pd.read_csv(item_category_path)
shops = pd.read_csv(shops_path)
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

<br><br><br>

# 2. 데이터 분석

In [6]:
data = pd.read_csv(train_path)
sub = pd.read_csv(test_path)
print('train data dim : {}'.format(data.shape))
print('test data dim : {}'.format(sub.shape))

train data dim : (2935849, 6)
test data dim : (214200, 3)


학습데이터는 100만개, 테스트 데이터는 21만개로 이루어져 있음. 
test 데이터의 컬럼이 3개인것을 볼 수 있는데, 제외된 3개의 컬럼 중 하나는 우리가 마주어야 할 집의 가격 `price`일 것임

In [7]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [8]:
categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [9]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [10]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [11]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


## 3. Feature Engineering

#### 3.1 데이터 결측치 확인

In [13]:
train.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

#### 3.2 중복 제거

In [14]:
sum(train.duplicated()), sum(test.duplicated())

(6, 0)

In [15]:
train[train.duplicated() == True]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
76962,05.01.2013,0,54,20130,149.0,1.0
1435367,23.02.2014,13,50,3423,999.0,1.0
1496766,23.03.2014,14,21,3423,999.0,1.0
1671873,01.05.2014,16,50,3423,999.0,1.0
1866340,12.07.2014,18,25,3423,999.0,1.0
2198566,31.12.2014,23,42,21619,499.0,1.0


In [16]:
train.drop_duplicates(inplace=True)

In [17]:
#중복 재확인
sum(train.duplicated())

0

#### 3.3 아이템 대분류 만들기(은지님 추가)

#### 3.4 지역별 가게 대분류 만들기

In [18]:
shop_data_path = join(data_dir, 'shops.csv')
shops = pd.read_csv(shop_data_path)
pd.DataFrame(shops.shop_name.values.reshape(-1, 4))

NameError: name 'join' is not defined

In [None]:
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops.head(10)

In [None]:
# 문자 데이터를 모두 수치 데이터로 Encoding
shops.city.value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder
city_encoder = LabelEncoder()
shops['city_code'] = city_encoder.fit_transform(shops['city'])

In [19]:
shops.head(10)

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4
5,"Вологда ТРЦ ""Мармелад""",5
6,"Воронеж (Плехановская, 13)",6
7,"Воронеж ТРЦ ""Максимир""",7
8,"Воронеж ТРЦ Сити-Парк ""Град""",8
9,Выездная Торговля,9


#### 3.4 이상치 제거(진오님)