# Bigmart Sales Prediction

- 회귀 모델로 분석하기

### 필요한 패키지 임포트 하기

In [123]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### 데이터 불러오기

In [124]:
df = pd.read_csv('Train.csv', sep=',')
df.head(3)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27


##### - 결측치 처리

In [125]:
df.shape

(8523, 12)

In [126]:
# 결측치 확인 -> 결측치 많은 편 아니므로 대체할 값 선정
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

- Item_Weight 결측치 -> 평균 무게로 대체

In [127]:
# Item_Weight의 평균값 결측치에 삽입하기
df.Item_Weight.fillna(df.Item_Weight.mean(), inplace=True)
# 변경 후 결측값 확인
df.Item_Weight.isna().sum()

0

- Outlet_Size -> 최빈값 데이터로 대체

In [128]:
# Outlet_Size 컬럼 데이터 별 개수 세기
df.Outlet_Size.value_counts() # 최빈값 : High 
df.Outlet_Size.fillna('High', inplace=True)

In [129]:
# 전체 결측치 다시 확인하기
df.isna().sum().sum()

0

##### - 카테고리 값인 컬럼 숫자로 변환

- 카테고리 값인 컬럼
    - Item_Fat_Content / Item_Type / Outlet_Size / Outlet_Location_Type / Outlet_Type

##### 1. 카테고리 값인 컬럼 데이터 종류 분석해서 일정하게 맞추기

In [130]:
df.Item_Fat_Content.value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [131]:
# Low Fat == low fat, LF / Regular == reg 
df['Item_Fat_Content'].replace(['low fat', 'LF', 'reg'], ['Low Fat', 'Low Fat', 'Regular'], inplace=True)
# 데이터 변경 적용되었는지 확인
df.Item_Fat_Content.value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

##### 2. 카테고리 값인 컬럼 인코딩 하기

In [132]:
# Label Encoder 로 변환 -> 객체 생성
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [133]:
# 데이터 인코딩
df.Item_Fat_Content = le.fit_transform(df.Item_Fat_Content)
df.Item_Type = le.fit_transform(df.Item_Type)
df.Outlet_Size = le.fit_transform(df.Outlet_Size)
df.Outlet_Location_Type = le.fit_transform(df.Outlet_Location_Type)
df.Outlet_Type = le.fit_transform(df.Outlet_Type)
df.head(3)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,0,0.016047,4,249.8092,OUT049,1999,1,0,1,3735.138
1,DRC01,5.92,1,0.019278,14,48.2692,OUT018,2009,1,2,2,443.4228
2,FDN15,17.5,0,0.01676,10,141.618,OUT049,1999,1,0,1,2097.27


In [134]:
# 인코딩 상태 확인하기
print(np.unique(df.Item_Fat_Content, return_counts=True)) # Low Fat(0) / Regular(1)
print(np.unique(df.Item_Type, return_counts=True))
print(np.unique(df.Outlet_Size, return_counts=True)) # Hight(0) / Medium(1) / Small(2)
print(np.unique(df.Outlet_Location_Type, return_counts=True)) # Tier 3(0) / Tier 2(1) / Tier1(2)
print(np.unique(df.Outlet_Type, return_counts=True)) # Grocery Store(0) / Supermarket Type1(1) / Supermarket Type2(2) / Supermarket Type3(3)

(array([0, 1]), array([5517, 3006], dtype=int64))
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]), array([ 648,  251,  110,  649,  682,  856, 1232,  214,  520,  910,  425,
        169,   64, 1200,  445,  148], dtype=int64))
(array([0, 1, 2]), array([3342, 2793, 2388], dtype=int64))
(array([0, 1, 2]), array([2388, 2785, 3350], dtype=int64))
(array([0, 1, 2, 3]), array([1083, 5577,  928,  935], dtype=int64))


##### 3. 컬럼 (Outlet_Establishment_Year -> Outlet_Age) 조작하기

- 아울렛이 얼마나 오래되었는지 나타내는 컬럼으로 바꾸기

In [135]:
df['Outlet_Establishment_Year'] = df.Outlet_Establishment_Year.apply(lambda year : 2021 - year)

In [138]:
df.rename(columns={'Outlet_Establishment_Year': 'Outlet_Age'}, inplace=True)
df.head(3)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Age,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,0,0.016047,4,249.8092,OUT049,22,1,0,1,3735.138
1,DRC01,5.92,1,0.019278,14,48.2692,OUT018,12,1,2,2,443.4228
2,FDN15,17.5,0,0.01676,10,141.618,OUT049,22,1,0,1,2097.27


### 학습 / 테스트 데이터셋 분리하기

##### 주제1. Item_Fat_Content와 Item_Outlet_Sales 간의 관계

- 상품의 지방 함유량에 따른 매출액 추이

In [146]:
fat = df.Item_Fat_Content.values.reshape(-1,1)    # X 값
sale_amount = df.Item_Outlet_Sales.values   # target(label) 값
X.shape, y.shape

((8523, 11), (8523,))

In [147]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    fat, sale_amount, test_size=0.2, random_state=2021
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6818, 1), (1705, 1), (6818,), (1705,))

In [148]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

-0.0007070937387958498

### 회귀선 그리기