# 가장 좋게 나온 값 : 평균 0.6398254837541293
**배깅(특성값추가)**

In [32]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "training_linear_models"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [33]:
#csv 파일 로드
data = pd.read_csv("winequality-red.csv")
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [34]:
#품질과 그 외 것들 분리
train_set= data.iloc[:,:-1]
quality_set = data.iloc[:,-1]
train_set

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [35]:
x_train = train_set.iloc[:-299,:]
x_test = train_set.iloc[-299:,:]

y_train = quality_set.iloc[:-299]
y_test = quality_set.iloc[-299:]

y_train

0       5
1       5
2       5
3       6
4       5
       ..
1295    5
1296    5
1297    6
1298    6
1299    3
Name: quality, Length: 1300, dtype: int64

In [36]:
data["sulpH"] = data["sulphates"]/ data["pH"]

In [37]:
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,sulpH
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0.159544
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0.212500
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0.199387
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,0.183544
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0.159544
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,0.168116
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,0.215909
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,0.219298
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,0.198880


In [38]:
#품질과 그 외 것들 분리
train_set2= data.drop(data.columns[[11]], axis='columns')
train_set2

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,sulpH
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0.159544
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0.212500
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0.199387
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0.183544
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0.159544
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,0.168116
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,0.215909
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,0.219298
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,0.198880


**Decision Tree 방식 사용**

In [23]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_set, quality_set)

DecisionTreeRegressor(random_state=42)

In [24]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, train_set, quality_set,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [25]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [0.81394103 0.78660664 0.92870878 0.88388348 0.95197164 0.95524866
 1.00933146 0.92195445 1.         0.83553169]
Mean: 0.9087177819592572
Standard deviation: 0.07275516136855807


**LinerRegression 방식 사용**

In [12]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(train_set, quality_set)

LinearRegression()

In [16]:
from sklearn.model_selection import cross_val_score
lin_scores = cross_val_score(lin_reg, train_set, quality_set, cv=10, scoring="neg_root_mean_squared_error")
lin_rmse_scores = np.sqrt(-lin_scores)

display_scores(lin_rmse_scores)

Scores: [0.82840094 0.79528004 0.8276756  0.81059034 0.78340509 0.8506474
 0.79549967 0.80952084 0.77556054 0.83577095]
Mean: 0.8112351415216074
Standard deviation: 0.022963011524826448


In [28]:
from sklearn.linear_model import LinearRegression

lin_reg2 = LinearRegression()
lin_reg2.fit(train_set2, quality_set)

LinearRegression()

In [30]:
from sklearn.model_selection import cross_val_score
lin_scores2 = cross_val_score(lin_reg2, train_set2, quality_set, cv=10, scoring="neg_root_mean_squared_error")
lin_rmse_scores2 = np.sqrt(-lin_scores2)

display_scores(lin_rmse_scores2)

Scores: [0.79727974 0.78615896 0.83059434 0.8110955  0.77720634 0.84956872
 0.79309008 0.80520181 0.78323488 0.8307009 ]
Mean: 0.8064131266317227
Standard deviation: 0.02263149520787265


**RandomForest 방식 사용**

In [17]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(train_set, quality_set)

RandomForestRegressor(random_state=42)

In [18]:
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_reg, train_set, quality_set,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [0.61371614 0.58156631 0.68472075 0.66384722 0.5649209  0.72317443
 0.66583876 0.65855239 0.61977667 0.70264281]
Mean: 0.6478756369545334
Standard deviation: 0.04894232181514159


**RandomForest(특성값 추가)**

In [26]:
from sklearn.ensemble import RandomForestRegressor

forest_reg2 = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg2.fit(train_set2, quality_set)

RandomForestRegressor(random_state=42)

In [27]:
from sklearn.model_selection import cross_val_score

forest_scores2 = cross_val_score(forest_reg2, train_set2, quality_set,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores2 = np.sqrt(-forest_scores2)
display_scores(forest_rmse_scores2)

Scores: [0.61186038 0.58803221 0.68006663 0.65943157 0.56597317 0.73313795
 0.68155612 0.66253585 0.61965615 0.70127602]
Mean: 0.6503526048420449
Standard deviation: 0.04996167938143674


**배깅(회귀)**

In [17]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

bag_reg = BaggingRegressor(
    DecisionTreeRegressor(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, random_state=42)
bag_reg.fit(train_set, quality_set)

BaggingRegressor(base_estimator=DecisionTreeRegressor(random_state=42),
                 max_samples=100, n_estimators=500, random_state=42)

In [19]:
from sklearn.model_selection import cross_val_score

bag_reg_scores = cross_val_score(bag_reg, train_set, quality_set,
                                scoring="neg_mean_squared_error", cv=10)

In [25]:
bag_rmse_scores = np.sqrt(-bag_reg_scores)
display_scores(bag_rmse_scores)

Scores: [0.61126457 0.61880981 0.67423928 0.65187217 0.55413775 0.72181351
 0.6336805  0.66082643 0.59736538 0.69757373]
Mean: 0.6421583131623663
Standard deviation: 0.047131030418272504


# 가장 좋게 나온 값 : 평균 0.6398254837541293
**Bagging(특성값추가)**

In [39]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

bag_reg2 = BaggingRegressor(
    DecisionTreeRegressor(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, random_state=42)
bag_reg2.fit(train_set2, quality_set)

BaggingRegressor(base_estimator=DecisionTreeRegressor(random_state=42),
                 max_samples=100, n_estimators=500, random_state=42)

In [40]:
from sklearn.model_selection import cross_val_score

bag_reg2_scores = cross_val_score(bag_reg2, train_set2, quality_set,
                                scoring="neg_mean_squared_error", cv=10)
bag_rmse2_scores = np.sqrt(-bag_reg2_scores)
display_scores(bag_rmse2_scores)

Scores: [0.61585246 0.61475818 0.66504099 0.65154785 0.55414143 0.71977884
 0.6387439  0.65237989 0.59300451 0.69300677]
Mean: 0.6398254837541293
Standard deviation: 0.045705408131916134


**배깅 - 랜덤포레스트**

In [26]:
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor

ranbag_reg = BaggingRegressor(
    RandomForestRegressor(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, random_state=42)
ranbag_reg.fit(train_set, quality_set)

BaggingRegressor(base_estimator=RandomForestRegressor(random_state=42),
                 max_samples=100, n_estimators=500, random_state=42)

In [27]:
from sklearn.model_selection import cross_val_score

ranbag_reg_scores = cross_val_score(ranbag_reg, train_set, quality_set,
                                scoring="neg_mean_squared_error", cv=10)
ranbag_rmse_scores = np.sqrt(-ranbag_reg_scores)
display_scores(ranbag_rmse_scores)

Scores: [0.62012892 0.62853312 0.67700285 0.661216   0.56071881 0.73331528
 0.63656077 0.66466002 0.60771308 0.70492037]
Mean: 0.6494769219834815
Standard deviation: 0.04724177173369781


**부스팅 이용**

In [52]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=100, random_state=42)
gbrt.fit(x_train, y_train)

errors = [mean_squared_error(y_test, y_pred)
          for y_pred in gbrt.staged_predict(x_test)]
bst_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators, random_state=42)
gbrt_best.fit(x_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=83, random_state=42)

In [53]:
from sklearn.model_selection import cross_val_score

gbrt_best_reg_scores = cross_val_score(gbrt_best, train_set, quality_set,
                                scoring="neg_mean_squared_error", cv=10)
gbrt_best_rmse_scores = np.sqrt(-gbrt_best_reg_scores)
display_scores(gbrt_best_rmse_scores)

Scores: [0.62441137 0.61983536 0.67064762 0.64678759 0.56980679 0.71253568
 0.6737587  0.65472259 0.58330476 0.70374656]
Mean: 0.6459557008763588
Standard deviation: 0.04483653992478154


**extratree 사용 (기존 값 / 특성 값)**

In [19]:
from sklearn.ensemble import ExtraTreesRegressor

ext_reg = ExtraTreesRegressor(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
ext_reg.fit(train_set, quality_set)

ExtraTreesRegressor(max_leaf_nodes=16, n_estimators=500, n_jobs=-1,
                    random_state=42)

In [20]:
from sklearn.model_selection import cross_val_score

ext_scores = cross_val_score(ext_reg, train_set, quality_set,
                                scoring="neg_mean_squared_error", cv=10)
ext_rmse_scores = np.sqrt(-ext_scores)
display_scores(ext_rmse_scores)

Scores: [0.62558421 0.6344394  0.68999255 0.66409461 0.56133124 0.73557433
 0.64750922 0.6575551  0.60784708 0.70787589]
Mean: 0.6531803627497443
Standard deviation: 0.04786445822819484


In [49]:
from sklearn.ensemble import ExtraTreesRegressor

ext_reg2 = ExtraTreesRegressor(n_estimators=500, random_state=42)
ext_reg2.fit(train_set2, quality_set)

ExtraTreesRegressor(n_estimators=500, random_state=42)

In [50]:
from sklearn.model_selection import cross_val_score

ext_scores2 = cross_val_score(ext_reg2, train_set2, quality_set,
                                scoring="neg_mean_squared_error", cv=10)
ext_rmse_scores2 = np.sqrt(-ext_scores2)
display_scores(ext_rmse_scores2)

Scores: [0.62530764 0.60287909 0.67853548 0.6667471  0.56356541 0.73532372
 0.6759338  0.64942759 0.61172866 0.69645205]
Mean: 0.6505900557000496
Standard deviation: 0.047924736161568816
