# Домашнее задание по теме "Работа с переменными"

In [29]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, r2_score

## Получение данных и загрузка их в dataframe

In [2]:
housing = fetch_california_housing()
print(housing.data.shape, housing.target.shape)

(20640, 8) (20640,)


In [3]:
print(fetch_california_housing().DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [25]:
df = pd.DataFrame(housing.data, columns=[housing.feature_names])
df['MedHCost'] = housing.target

In [26]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHCost
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [6]:
df.shape

(20640, 9)

## Проверка данных на наличие пропусков

In [7]:
df.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
MedHCost      0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   (MedInc,)      20640 non-null  float64
 1   (HouseAge,)    20640 non-null  float64
 2   (AveRooms,)    20640 non-null  float64
 3   (AveBedrms,)   20640 non-null  float64
 4   (Population,)  20640 non-null  float64
 5   (AveOccup,)    20640 non-null  float64
 6   (Latitude,)    20640 non-null  float64
 7   (Longitude,)   20640 non-null  float64
 8   (MedHCost,)    20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


Все признаки содержат числовые значения. Пропусков в наборе данных нет.

## Обучение модели линейной регрессии и вычисление метрик

Зададим функцию, которая будет разбивать датафрейм на обучающую и тестовую подвыборки, обучать модель линейной регрессии и выподить на экран значения mse и r2

In [63]:
def get_scores(df, random_seed=5):
    X = df[housing.feature_names]
    y = df[['MedHCost']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    train_mse = mean_squared_error(y_train, train_pred)
    test_mse = mean_squared_error(y_test, test_pred)
    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)
    score_dict = {"Train_mse":[train_mse], "Train_r2":[train_r2], "Test_mse":[test_mse], "Test_r2":[test_r2]}
    return score_dict

In [65]:
score_df = pd.DataFrame(get_scores(df))
score_df

Unnamed: 0,Train_mse,Train_r2,Test_mse,Test_r2
0,0.52144,0.604792,0.536343,0.611257
