In [6]:
import numpy as np
import pandas as pd

from pydataset import data

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, TweedieRegressor, LassoLars
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.preprocessing import PolynomialFeatures

from scipy import stats
from wrangle import split_data

In [10]:
def X_y_split(df, target):
    '''
    This function takes in a dataframe and a target variable
    Then it returns the X_train, y_train, X_validate, y_validate, X_test, y_test
    and a print statement with the shape of the new dataframes
    '''  
    train, validate, test = split_data(df)

    X_train = train.drop(columns= target)
    y_train = train[target]

    X_validate = validate.drop(columns= target)
    y_validate = validate[target]

    X_test = test.drop(columns= target)
    y_test = test[target]
        
    # Have function print datasets shape
    print(f'''
    X_train -> {X_train.shape}
    X_validate -> {X_validate.shape}
    X_test -> {X_test.shape}''') 
    
    return X_train, y_train, X_validate, y_validate, X_test, y_test

1. Select a dataset with a continuous target variable.

In [4]:
df = data('swiss')

In [5]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [8]:
train, validate, test = split_data(df)
train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Broye,83.8,70.2,16,7,92.85,23.6
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Echallens,68.3,72.6,18,2,24.2,21.2
Aubonne,66.9,67.5,14,7,2.27,19.1


In [9]:
train.shape

(27, 6)

In [11]:
X_train, y_train, X_validate, y_validate, X_test, y_test = X_y_split(df, 'Infant.Mortality')
X_train.head()


    X_train -> (27, 5)
    X_validate -> (10, 5)
    X_test -> (10, 5)


Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic
St Maurice,65.0,75.9,9,9,99.06
Porrentruy,76.1,35.3,9,7,90.57
Le Locle,72.7,16.7,22,13,11.22
Echallens,68.3,72.6,18,2,24.2
Sierre,92.2,84.6,3,3,99.46


In [12]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [13]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)

In [22]:
y_train.head()

St Maurice    17.8
Porrentruy    26.6
Le Locle      18.9
Echallens     21.2
Sierre        16.3
Name: Infant.Mortality, dtype: float64

In [15]:
X_val_scaled = pd.DataFrame(scaler.transform(X_validate), index=X_validate.index, columns=X_validate.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [27]:
r, p = stats.pearsonr(train['Fertility'], train['Infant.Mortality'])
r,p

(0.47506727879086624, 0.012275249386784558)

3. Work through all of the steps outlined in the lesson, from setting the baseline to selected a model and evaluating the final model on your test data.

In [16]:
best = SelectKBest(f_regression, k=3)
best.fit(X_train, y_train)

In [17]:
X_train.columns[best.get_support()]

Index(['Agriculture', 'Examination', 'Education'], dtype='object')

In [20]:
k_columns = X_train.columns[best.get_support()].to_list()


k_columns

['Agriculture', 'Examination', 'Education']

In [29]:
train['baseline_mean'] = train['Infant.Mortality'].mean()
train['baseline_median'] = train['Infant.Mortality'].median()

# scores:
rmse_mean = mean_squared_error(train['Infant.Mortality'],
                               train['baseline_mean'], squared=False)
rmse_med = mean_squared_error(train['Infant.Mortality'],
                               train['baseline_median'], squared=False)

rmse_mean, rmse_med

(3.1248506137133485, 3.1327422610349025)

In [32]:
y_validate['baseline_mean'] = train['Infant.Mortality'].mean()

In [34]:
metric_df = pd.DataFrame(
[
    {
        'model': 'baseline_mean',
        'rmse': mean_squared_error(validate['Infant.Mortality'], validate.baseline_mean),
        'r^2': explained_variance_score(validate['Infant.Mortality'], validate.baseline_mean)
    
    }
])


AttributeError: 'DataFrame' object has no attribute 'baseline_mean'