In [15]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression

## Load Data

In [2]:
# Load updated df
df = pd.read_csv("../Data/df_updated.csv")

In [3]:
df.shape

(5265, 33)

In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
Skin symptoms,1,1,1,1,1
Season born,1.0,2.0,2.0,1.0,4.0
Heating system_house,1.0,2.0,1.0,2.0,1.0
House _now,,2.0,3.0,4.0,3.0
House_before,,3.0,1.0,,1.0
Clean_house,2.0,2.0,2.0,2.0,2.0
Puppy_dewormed,1.0,1.0,1.0,1.0,1.0
Puppy_vaccinated,1.0,1.0,1.0,1.0,1.0
Dam_dewormed_prebirth,,1.0,1.0,1.0,1.0


In [5]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [24]:
df.rename(columns = {'Skin symptoms':'Skin_symptoms'}, inplace = True)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5265 entries, 0 to 5264
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Skin_symptoms             5265 non-null   int64  
 1   Season born               5140 non-null   float64
 2   Heating system_house      4762 non-null   float64
 3   House _now                5238 non-null   float64
 4   House_before              3930 non-null   float64
 5   Clean_house               5239 non-null   float64
 6   Puppy_dewormed            4998 non-null   float64
 7   Puppy_vaccinated          5080 non-null   float64
 8   Dam_dewormed_prebirth     3524 non-null   float64
 9   Dam_vaccinated_prebirth   5265 non-null   int64  
 10  Gender                    5124 non-null   float64
 11  Over 50% white            4991 non-null   float64
 12  Other animals             5265 non-null   int64  
 13  Other dogs                5265 non-null   int64  
 14  Born in 

In [34]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Skin_symptoms'), 
                                                    df.Skin_symptoms, test_size=0.3, 
                                                    random_state=47)

In [36]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3685, 31), (1580, 31), (3685,), (1580,))

In [37]:
names_list = ['Breed_En', 'Classification', 'Category', 'Height_low_inches', 'Height_high_inches', 'Weight_low_lbs', 'Weight_high_lbs']
names_train = X_train[names_list]
names_test = X_test[names_list]
X_train.drop(columns=names_list, inplace=True)
X_test.drop(columns=names_list, inplace=True)
X_train.shape, X_test.shape

((3685, 24), (1580, 24))

In [38]:
X_train.dtypes

Season born                 float64
Heating system_house        float64
House _now                  float64
House_before                float64
Clean_house                 float64
Puppy_dewormed              float64
Puppy_vaccinated            float64
Dam_dewormed_prebirth       float64
Dam_vaccinated_prebirth       int64
Gender                      float64
Over 50% white              float64
Other animals                 int64
Other dogs                    int64
Born in owner family          int64
Does the dog have a yard    float64
Bcs_under 2 mo              float64
Outside_under 2 mo          float64
Outside a day_under 5mo     float64
Skin symptoms_dam           float64
Age                         float64
Vet diagnosis               float64
Obey                        float64
Reps_lower                  float64
Reps_upper                  float64
dtype: object

In [39]:
X_test.dtypes

Season born                 float64
Heating system_house        float64
House _now                  float64
House_before                float64
Clean_house                 float64
Puppy_dewormed              float64
Puppy_vaccinated            float64
Dam_dewormed_prebirth       float64
Dam_vaccinated_prebirth       int64
Gender                      float64
Over 50% white              float64
Other animals                 int64
Other dogs                    int64
Born in owner family          int64
Does the dog have a yard    float64
Bcs_under 2 mo              float64
Outside_under 2 mo          float64
Outside a day_under 5mo     float64
Skin symptoms_dam           float64
Age                         float64
Vet diagnosis               float64
Obey                        float64
Reps_lower                  float64
Reps_upper                  float64
dtype: object

In [40]:
# Filling NaN with Median given that most features are categorical. 
X_defaults_median = X_train.median()
X_defaults_median

Season born                  2.0
Heating system_house         2.0
House _now                   3.0
House_before                 2.0
Clean_house                  2.0
Puppy_dewormed               1.0
Puppy_vaccinated             1.0
Dam_dewormed_prebirth        1.0
Dam_vaccinated_prebirth      1.0
Gender                       2.0
Over 50% white               2.0
Other animals                1.0
Other dogs                   2.0
Born in owner family         1.0
Does the dog have a yard     2.0
Bcs_under 2 mo               1.0
Outside_under 2 mo           1.0
Outside a day_under 5mo      3.0
Skin symptoms_dam            1.0
Age                          3.0
Vet diagnosis                0.0
Obey                        70.0
Reps_lower                  16.0
Reps_upper                  25.0
dtype: float64

In [41]:
X_tr = X_train.fillna(X_defaults_median)
X_te = X_test.fillna(X_defaults_median)

In [42]:
scaler = StandardScaler()
scaler.fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)

In [43]:
lm = LinearRegression().fit(X_tr_scaled, y_train)

In [44]:
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)

In [45]:
median_r2 = r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)
median_r2

(0.20226750905842694, 0.2063213394085398)

In [46]:
median_mae = mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)
median_mae

(0.23068343691919407, 0.2375006205373552)

In [47]:
median_mse = mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)
median_mse

(0.11434391388122164, 0.1202039748081484)

In [None]:
#Maybe will have better results with different K 

In [58]:
pipe15 = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler(),
    SelectKBest(f_regression, k=15),
    LinearRegression()
)

In [59]:
pipe15.fit(X_train, y_train)

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler()),
                ('selectkbest',
                 SelectKBest(k=15,
                             score_func=<function f_regression at 0x7fa69b9658b0>)),
                ('linearregression', LinearRegression())])

In [60]:
y_tr_pred = pipe15.predict(X_train)
y_te_pred = pipe15.predict(X_test)


In [61]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.200298550874637, 0.20988947768059785)

In [62]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(0.2309896409294554, 0.2368465672666747)

In [64]:
#Maybe a different K 
pipe5 = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler(),
    SelectKBest(f_regression, k=5),
    LinearRegression()
)

In [65]:
pipe5.fit(X_train, y_train)

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler()),
                ('selectkbest',
                 SelectKBest(k=5,
                             score_func=<function f_regression at 0x7fa69b9658b0>)),
                ('linearregression', LinearRegression())])

In [66]:
y_tr_pred = pipe5.predict(X_train)
y_te_pred = pipe5.predict(X_test)

In [67]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.193449311213014, 0.20649283368222515)

In [68]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(0.23225022989397118, 0.23875435517887328)

### With cross validation 

In [69]:
cv_results = cross_validate(pipe15, X_train, y_train, cv=5)

In [70]:
cv_scores = cv_results['test_score']
cv_scores

array([0.1325385 , 0.20462363, 0.12863176, 0.23745546, 0.19882285])

In [71]:
np.mean(cv_scores), np.std(cv_scores)

(0.18041444182788924, 0.04278360406664876)

In [72]:
np.round((np.mean(cv_scores) - 2 * np.std(cv_scores), np.mean(cv_scores) + 2 * np.std(cv_scores)), 2)

array([0.09, 0.27])

### GridSearchCV

In [73]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'simpleimputer', 'standardscaler', 'linearregression', 'simpleimputer__add_indicator', 'simpleimputer__copy', 'simpleimputer__fill_value', 'simpleimputer__missing_values', 'simpleimputer__strategy', 'simpleimputer__verbose', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'linearregression__copy_X', 'linearregression__fit_intercept', 'linearregression__n_jobs', 'linearregression__normalize', 'linearregression__positive'])

In [74]:
k = [k+1 for k in range(len(X_train.columns))]
grid_params = {'selectkbest__k': k}

In [75]:
lr_grid_cv = GridSearchCV(pipe, param_grid=grid_params, cv=5, n_jobs=-1)

In [76]:
lr_grid_cv.fit(X_train, y_train)

ValueError: Invalid parameter selectkbest for estimator Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())]). Check the list of available parameters with `estimator.get_params().keys()`.

## Summary 
I have questions  