https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.metrics import r2_score, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.linear_model import LinearRegression


In [2]:
pd.set_option('precision', 3)

In [3]:
#version 3
def statsmodels_train_test_split(df, stratify=None, **kwargs):

    if stratify is None:
        y, X = df.iloc[:,0], df.drop(columns=df.columns[0])
        X_train, X_test, y_train, y_test = train_test_split(X,y, **kwargs)
    else:
        y, X = stratify, df.drop(columns = stratify.name)
        X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y, **kwargs)
    
    return pd.concat([X_train, y_train], axis=1), pd.concat([X_test, y_test], axis=1)

In [4]:
from sklearn.base import BaseEstimator, RegressorMixin
class SMWrapper(BaseEstimator, RegressorMixin):
    """ A universal sklearn-style wrapper for statsmodels regressors """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept
    def fit(self, X, y):
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
    def predict(self, X):
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)

In [5]:
df = sns.load_dataset('iris')
df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [6]:
#let's do linear regression with statsmodels first

In [18]:
train, test = statsmodels_train_test_split(df, random_state=3)

In [19]:
formula = 'sepal_length ~ sepal_width + petal_length + petal_width + C(species)'
model = smf.ols(formula=formula, data=train)
fitted_model = model.fit()
fitted_model.summary()

0,1,2,3
Dep. Variable:,sepal_length,R-squared:,0.874
Model:,OLS,Adj. R-squared:,0.868
Method:,Least Squares,F-statistic:,146.7
Date:,"Thu, 17 Sep 2020",Prob (F-statistic):,5.79e-46
Time:,12:02:03,Log-Likelihood:,-15.098
No. Observations:,112,AIC:,42.2
Df Residuals:,106,BIC:,58.51
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.4518,0.326,7.515,0.000,1.805,3.099
C(species)[T.versicolor],-0.6399,0.280,-2.283,0.024,-1.196,-0.084
C(species)[T.virginica],-0.8006,0.384,-2.085,0.040,-1.562,-0.039
sepal_width,0.4194,0.102,4.097,0.000,0.216,0.622
petal_length,0.8750,0.074,11.792,0.000,0.728,1.022
petal_width,-0.5994,0.165,-3.643,0.000,-0.926,-0.273

0,1,2,3
Omnibus:,0.452,Durbin-Watson:,2.016
Prob(Omnibus):,0.798,Jarque-Bera (JB):,0.608
Skew:,-0.093,Prob(JB):,0.738
Kurtosis:,2.691,Cond. No.,105.0


In [20]:
%precision %.2f
r2_score(test['sepal_length'],fitted_model.predict(test))  #0.8308454946286852 with random_state=3

0.8308454946286852

In [10]:
# Do it with sklearn

In [21]:
# Change our dataset to dummies!

df_dummies = pd.get_dummies(df, drop_first=True)

In [22]:
formula = 'sepal_length ~ sepal_width + petal_length + petal_width + C(species)'

In [23]:
y = df_dummies[formula.split('~')[0].strip()]
X = df_dummies.drop(columns=formula.split('~')[0].strip())

In [24]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   sepal_width         150 non-null    float64
 1   petal_length        150 non-null    float64
 2   petal_width         150 non-null    float64
 3   species_versicolor  150 non-null    uint8  
 4   species_virginica   150 non-null    uint8  
dtypes: float64(3), uint8(2)
memory usage: 3.9 KB


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=3)

In [34]:
X_train.shape, X_test.shape

((112, 5), (38, 5))

In [35]:
y_train.shape, y_test.shape

((112,), (38,))

In [28]:
linear_model = LinearRegression(n_jobs=-1).fit(X_train, y_train)

In [38]:
linear_model.predict(y_test.values.reshape(1,-1))

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 5 is different from 38)

In [29]:
r2_score(y_test,linear_model.predict(y_train))

ValueError: Expected 2D array, got 1D array instead:
array=[6.4 6.2 6.4 5.8 4.6 5.1 6.1 6.4 5.1 6.  5.8 5.4 4.8 5.  5.8 7.  6.1 4.9
 6.3 5.1 6.5 5.7 5.  4.8 6.8 5.9 5.4 4.8 5.1 6.4 6.3 4.9 6.7 5.7 4.7 5.1
 7.4 7.7 5.8 5.7 5.2 6.3 5.1 5.5 6.4 6.7 6.9 7.1 5.9 7.7 4.8 6.3 4.3 5.7
 6.6 6.4 6.2 5.2 5.5 5.4 6.1 6.4 6.  4.9 7.2 5.1 6.7 6.1 6.7 6.3 5.6 6.1
 5.7 7.6 5.6 6.5 6.5 5.2 6.  6.3 5.  5.6 5.4 4.9 5.3 5.7 6.3 5.5 5.  6.9
 5.  6.  5.9 6.2 6.5 6.5 5.6 5.5 7.2 5.  4.7 5.6 5.5 5.  5.  6.  6.  7.3
 5.1 5.1 7.9 4.9].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.