https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.metrics import r2_score, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.linear_model import LinearRegression


In [3]:
#version 3
def statsmodels_train_test_split(df, stratify=None, **kwargs):
    """Split a DataFrame row-wise into train and test DataFrames.

    Thin wrapper around sklearn's ``train_test_split`` that keeps every column
    (response and predictors) together in a single frame per split, which is
    the layout ``smf.ols(formula=..., data=...)`` consumes.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set to split.
    stratify : array-like, optional
        Labels forwarded to ``train_test_split(stratify=...)``. Any array-like
        is accepted; it no longer has to be a named column of ``df``.
    **kwargs
        Forwarded unchanged to ``train_test_split`` (e.g. ``random_state``,
        ``test_size``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``. For backward compatibility the column order matches
        the previous implementation: without ``stratify`` the first column of
        ``df`` is moved to the end; with a ``stratify`` Series whose name is a
        column of ``df`` that column is moved to the end. Otherwise the
        original column order is kept.
    """
    # Column that earlier versions of this helper emitted last.
    trailing = df.columns[0] if stratify is None else getattr(stratify, "name", None)

    # Splitting the frame itself selects rows without separating and
    # re-concatenating X and y.
    train, test = train_test_split(df, stratify=stratify, **kwargs)

    if trailing is not None and trailing in df.columns:
        ordered = [col for col in df.columns if col != trailing] + [trailing]
        train, test = train[ordered], test[ordered]

    return train, test

In [4]:
# https://stackoverflow.com/questions/41045752/using-statsmodel-estimations-with-scikit-learn-cross-validation-is-it-possible/48949667#48949667
from sklearn.base import BaseEstimator, RegressorMixin
import statsmodels.api as sm
class SMWrapper(BaseEstimator, RegressorMixin):
    """A universal sklearn-style wrapper for statsmodels regressors.

    Parameters
    ----------
    model_class : callable
        statsmodels model class, instantiated as ``model_class(y, X)``
        (e.g. ``sm.OLS``).
    fit_intercept : bool, default=True
        When true, an intercept column is added to ``X`` with
        ``sm.add_constant`` before both fitting and predicting.

    Attributes
    ----------
    model_ : object
        The instantiated statsmodels model (set by ``fit``).
    results_ : object
        Whatever ``model_.fit()`` returned (set by ``fit``).
    """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def _design(self, X):
        """Return ``X`` with an intercept column added when ``fit_intercept`` is set."""
        if self.fit_intercept:
            # has_constant='add' forces the intercept column. The default
            # ('skip') leaves X untouched when it already holds a constant
            # non-zero column -- e.g. when predicting a single row -- so the
            # design matrix would end up one column short of the fitted params.
            return sm.add_constant(X, has_constant='add')
        return X

    def fit(self, X, y):
        """Fit ``model_class`` on ``(y, X)`` and return ``self``."""
        self.model_ = self.model_class(y, self._design(X))
        self.results_ = self.model_.fit()
        # sklearn estimators return self from fit so calls can be chained
        # and the estimator composes with pipelines / meta-estimators.
        return self

    def predict(self, X):
        """Predict with the fitted results, using the same intercept handling as ``fit``."""
        return self.results_.predict(self._design(X))

In [5]:
# Load the iris dataset through seaborn and preview its first ten rows.
df = sns.load_dataset('iris')
df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [6]:
#let's do linear regression with statsmodels first

In [7]:
# Regression formula reused by the cells below: sepal_length (left of '~') is the response;
# C(species) asks the formula parser to treat species as a categorical predictor.
formula = 'sepal_length ~ sepal_width + petal_length + petal_width + C(species)'

In [None]:
# Split the rows into train/test frames, fit OLS on the training rows through the
# formula API, then score R^2 of the predictions on the held-out rows.
train, test = statsmodels_train_test_split(df, random_state=3)
model = smf.ols(formula=formula, data=train).fit()

r2_statsmodels = r2_score(test['sepal_length'],model.predict(test))  #0.8308454946286852 with random_state=3
r2_statsmodels

In [None]:
# Do it with sklearn
# One-hot encode the categorical column(s), dropping the first level of each.
# dtype=float keeps the indicator columns numeric regardless of the pandas
# default (bool in pandas 2.x), so the same frame can be fed to both sklearn
# and statsmodels without an object-dtype design matrix.
df_dummies = pd.get_dummies(df, drop_first=True, dtype=float)
df_dummies.columns

In [None]:
# formula = 'sepal_length ~ sepal_width + petal_length + petal_width + C(species)' #same as above
# Take the response name from the left-hand side of the formula, separate target and
# features from the dummy-encoded frame, and split with the same random_state as the
# statsmodels cell above.
LHS = formula.split('~')[0].strip() # 'sepal_length'
y = df_dummies[LHS]
X = df_dummies.drop(columns=LHS) # every remaining column: the numeric measurements plus the species dummy columns
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=3)

# Fit ordinary least squares with sklearn and score R^2 on the held-out rows.
linear_model = LinearRegression(n_jobs=-1).fit(X_train, y_train)
r2_sklearn = r2_score(y_test,linear_model.predict(X_test))
r2_sklearn

In [None]:
# Compare with an absolute tolerance instead of rounding both sides: two nearly
# identical values that straddle a rounding boundary round to different numbers.
np.isclose(r2_statsmodels, r2_sklearn, rtol=0, atol=1e-5)

So now we know that statsmodels and scikit-learn give the same R-squared, to within rounding error.
What about 10-fold cross-validation?

In [None]:
# start with sklearn

In [None]:
# Mean R^2 over 10 folds. Cross-validate on the training split (not the held-out
# test split) so the score is comparable with the statsmodels cross-validation,
# which also runs on X_train / y_train, and so the test rows stay unseen.
cross_val_sklearn_r2 = np.mean(cross_val_score(linear_model, X_train, y_train, scoring='r2', cv=10, n_jobs=-1))
cross_val_sklearn_r2

In [None]:
# NOTE(review): IPython `??` source lookup left over from development; it is not valid
# plain Python and can be dropped from the finished notebook.
sm.OLS??

In [8]:
# Re-derive the target (y) and feature matrix (X) from the dummy-encoded frame.
# NOTE(review): this cell needs `df_dummies` (and `formula`) to exist already; the recorded
# output below is the NameError raised when it was run before the dummy-encoding cell.
LHS = formula.split('~')[0].strip() # 'sepal_length'
y = df_dummies[LHS]
X = df_dummies.drop(columns=LHS) # every remaining column: the numeric measurements plus the species dummy columns
# X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=3)


NameError: name 'df_dummies' is not defined

In [None]:
# Summarise the feature matrix's columns and dtypes.
X.info()

In [None]:
# Shapes of the full feature matrix and target.
X.shape, y.shape

In [None]:
# Shapes of the training split.
X_train.shape, y_train.shape

In [None]:
# Shapes of the held-out test split.
X_test.shape, y_test.shape

In [11]:
# One-hot encode species (dropping the first level) as floats: sm.OLS needs a
# purely numeric design matrix, and mixing float columns with bool indicators
# (the pandas 2.x get_dummies default) yields an object-dtype array it rejects.
df_dummies = pd.get_dummies(df, drop_first=True, dtype=float)

# Response name comes from the left-hand side of the formula.
LHS = formula.split('~')[0].strip() # 'sepal_length'
y = df_dummies[LHS]
X = df_dummies.drop(columns=LHS)
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=3)

# NOTE: `model` is rebound here to the statsmodels model *class*; in the earlier
# formula-API cell the same name held fitted results.
model = sm.OLS

# Per-fold R^2 from 10-fold cross-validation of the wrapped statsmodels OLS
# on the training split.
cross_val_statsmodels_r2 = cross_val_score(SMWrapper(model),
                                           X_train,
                                           y_train,
                                           scoring='r2',
                                           cv=10,
                                           n_jobs=-1
                                           )
cross_val_statsmodels_r2

array([0.71260406, 0.87968716, 0.80601214, 0.87399924, 0.88859355,
       0.867954  , 0.75752086, 0.82381163, 0.86767947, 0.89468072])

In [None]:
# NOTE(review): IPython `??` source lookup left over from development; it is not valid
# plain Python and can be dropped from the finished notebook.
cross_validate??