In [37]:
from sklearn import datasets
from sklearn.preprocessing import PolynomialFeatures # 다항식 회귀 tool
from sklearn.model_selection import train_test_split # 데이터 분할
from sklearn.linear_model import LinearRegression  # 선형 회귀
from sklearn.linear_model import LogisticRegression # 로지스틱 회귀
from sklearn.metrics import mean_squared_error
import pandas as pd

In [17]:
# Load scikit-learn's bundled Boston housing dataset, then print its text
# description followed by the shape of the feature matrix.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the environment pins a release that still ships it.
boston_dataset = datasets.load_boston()
print(boston_dataset.DESCR, boston_dataset.data.shape, sep="\n")

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [None]:
# Show the column labels that accompany the Boston feature matrix
# (the dataset object allows key access as well as attribute access).
print(boston_dataset["feature_names"])

In [21]:
# Wrap the raw Boston feature matrix in a DataFrame whose columns carry the
# dataset's own feature names, then print it.
x = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
)
print(x)

        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1    0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2    0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3    0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4    0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
501  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0   
502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0   
503  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0   
504  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0   
505  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0   

     PTRATIO       B  LSTAT  
0       15.3  396.90   4.98  
1       17.8  396.90   9.14  
2       1

In [None]:
# Keep only the NOX column as the single predictor.
# The frame is rebuilt from `boston_dataset` rather than sliced out of whatever
# `x` currently holds, so this cell produces the same result regardless of
# which other cells have reassigned `x` beforehand.
x = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)[['NOX']]
print(x)

In [26]:
# Place the Boston target values in a one-column DataFrame named MEDV and
# print it.
y = pd.DataFrame({'MEDV': boston_dataset.target})
print(y)

     MEDV
0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
..    ...
501  22.4
502  20.6
503  23.9
504  22.0
505  11.9

[506 rows x 1 columns]


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same on every run. All four pieces are then printed on one print call.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)
print(x_train, x_test, y_train, y_test, sep=" ")

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator
# itself, so construction and fitting are chained), then print its predictions
# for the held-out rows.
model = LinearRegression().fit(x_train, y_train)
y_test_prediction = model.predict(x_test)
print(y_test_prediction)

In [27]:
# Use NOX and AGE as the two predictors.
# An earlier cell narrows `x` to the NOX column only, so slicing `x` here would
# raise a KeyError for 'AGE' when the notebook is run top to bottom. Rebuild
# the frame from `boston_dataset` so the selection never depends on the
# current contents of `x`.
x = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)[['NOX', 'AGE']]
print(x)

       NOX   AGE
0    0.538  65.2
1    0.469  78.9
2    0.469  61.1
3    0.458  45.8
4    0.458  54.2
..     ...   ...
501  0.573  69.1
502  0.573  76.7
503  0.573  91.0
504  0.573  89.3
505  0.573  80.8

[506 rows x 2 columns]


In [31]:
# Re-split with the current `x`/`y` (20% test, fixed seed), fit a linear
# regression on the training part, and print the learned coefficients followed
# by the intercept.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=5, test_size=0.2
)
model = LinearRegression().fit(x_train, y_train)
print(model.coef_, model.intercept_, sep="\n")

[[-25.89408203  -0.04866708]]
[40.22513315]


In [None]:
# Predict the held-out rows, print the predictions, then print the square root
# of the mean squared error between the true targets and those predictions.
y_test_prediction = model.predict(x_test)
print(
    y_test_prediction,
    mean_squared_error(y_test, y_test_prediction) ** 0.5,
    sep="\n",
)

In [None]:
# Expand the full Boston feature matrix with degree-2 polynomial terms and
# print the shape of the expanded matrix.
polynomial_transformer = PolynomialFeatures(degree=2)
polynomial_data = polynomial_transformer.fit_transform(boston_dataset.data)
print(polynomial_data.shape)

In [None]:
# Build readable names for the polynomial columns from the original feature
# names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the transformer provides it and fall
# back to the old method otherwise. The result is converted to a list so the
# variable keeps the type the old method returned.
if hasattr(polynomial_transformer, "get_feature_names_out"):
    polynomial_feature_names = list(
        polynomial_transformer.get_feature_names_out(boston_dataset.feature_names)
    )
else:
    polynomial_feature_names = polynomial_transformer.get_feature_names(boston_dataset.feature_names)
print(polynomial_feature_names)

In [None]:
# Turn the polynomial feature matrix into a DataFrame labelled with the
# generated polynomial feature names, then print it.
x = pd.DataFrame(
    data=polynomial_data,
    columns=polynomial_feature_names,
)
print(x)

In [None]:
# Split the current `x`/`y` (20% test, fixed seed), fit a linear regression on
# the training part, and print the predictions for the held-out rows.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=5, test_size=0.2
)
model = LinearRegression().fit(x_train, y_train)
y_test_prediction = model.predict(x_test)
print(y_test_prediction)

In [None]:
# Print the square root of the mean squared error of the predictions on the
# held-out rows.
print(mean_squared_error(y_true=y_test, y_pred=y_test_prediction) ** 0.5)

In [None]:
# Degree-2 polynomial regression on scikit-learn's bundled diabetes dataset:
# expand the features, split 80/20 with a fixed seed, fit a linear regression,
# and report the error on the held-out rows.
diabetes_dataset = datasets.load_diabetes()

polynomial_transformer = PolynomialFeatures(2)
polynomial_data = polynomial_transformer.fit_transform(diabetes_dataset.data)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the transformer provides it and fall
# back to the old method otherwise. list() keeps the old return type.
if hasattr(polynomial_transformer, "get_feature_names_out"):
    polynomial_feature_names = list(
        polynomial_transformer.get_feature_names_out(diabetes_dataset.feature_names)
    )
else:
    polynomial_feature_names = polynomial_transformer.get_feature_names(diabetes_dataset.feature_names)
x = pd.DataFrame(polynomial_data, columns=polynomial_feature_names)

y = pd.DataFrame(diabetes_dataset.target, columns=['diabetes'])


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)
model = LinearRegression()
model.fit(x_train, y_train)
y_test_predict = model.predict(x_test)

# Despite its name, `mse` holds the ROOT mean squared error (note the ** 0.5).
# The variable name is kept so any later cell that reads `mse` still works.
mse = mean_squared_error(y_test, y_test_predict)**0.5
print(mse)

In [32]:
# Load scikit-learn's bundled iris dataset and print its text description
# (the dataset object allows key access as well as attribute access).
iris_data = datasets.load_iris()
print(iris_data["DESCR"])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [33]:
# Wrap the iris measurements in a DataFrame labelled with the dataset's
# feature names, then print it.
x = pd.DataFrame(
    data=iris_data.data,
    columns=iris_data.feature_names,
)
print(x)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                6.7               3.0                5.2               2.3
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               3.0                5.1               1.8

[150 rows x 4 columns]


In [34]:
# Place the iris class labels in a one-column DataFrame named 'class' and
# print it.
y = pd.DataFrame({'class': iris_data.target})
print(y)

     class
0        0
1        0
2        0
3        0
4        0
..     ...
145      2
146      2
147      2
148      2
149      2

[150 rows x 1 columns]


In [35]:
# Hold out 20% of the iris rows with a fixed seed, then flatten the training
# labels from a one-column DataFrame into a 1-D array for the classifier.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=5, test_size=0.2
)
y_train = y_train.to_numpy().ravel()

In [42]:
# Fit a logistic-regression classifier on the iris training split, then print
# its predicted classes for the held-out rows and its mean accuracy on them.
# The 'saga' solver shuffles the data, so it is stochastic; random_state is
# pinned (same seed as the train/test split) so reruns give the same model.
# max_iter is raised to 3500 iterations to give the solver room to converge.
model = LogisticRegression(solver='saga', max_iter=3500, random_state=5)
model.fit(x_train, y_train)
print(model.predict(x_test))
print(model.score(x_test, y_test))

[1 2 2 0 2 1 0 2 0 1 1 2 2 2 0 0 2 2 0 0 1 2 0 1 1 2 1 1 1 2]
0.9666666666666667
