In [22]:
####################### SWITCH FOR PLOTTING ########################
# Set to false to produce a large number of exploratory plots
# When False, only a few images are produced.
#

SHOW_ALL_PLOTS = False

#
#
#####################################################################

from collections import Counter
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import rc
import numpy as np
import os
import pandas as pd
from scipy.constants import convert_temperature
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, LassoCV,  ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, mean_squared_log_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR, SVR

from coinflip.classes import FigureUtilities, Mappers

from pprint import pprint
import warnings

def set_fig_size(width=18,height=10):
    fig,ax=plt.subplots(1,1)
    fig.set_size_inches(width,height)
    return fig,ax
    

def set_font_size(size=18):
    font = {'family' : 'verdana',
            'size'   : size}
    rc('font', **font)
    
def make_big(f=18,w=18,h=10):
    set_font_size(size=f)
    fig, ax = set_fig_size(width=w,height=h)
    return fig, ax

from time import time
  
def timer_func(func):
    # This function shows the execution time of 
    # the function object passed
    def wrap_func(*args, **kwargs):
        t1 = time()
        result = func(*args, **kwargs)
        t2 = time()
        print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func




def get_df(drop_index=False, sort=False):
    df = pd.read_csv('training_data.csv').drop(columns=['Unnamed: 0'])
    if sort:
        df = df.sort_values(by='datetime')
    if drop_index:
        return df.reset_index(drop=True)
    return df

def get_dd(drop_index=True, sort=True):
    df = get_df(drop_index=drop_index, sort=sort).drop(columns=['atemp'])
    return df[list(df.columns[1:])+[df.columns[0]]]

def do_little_eda():
    df = get_df()
    make_big()
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
    corr = df.corr()
    
    make_big()
    sns.heatmap(corr[['atemp','temp']], annot=True, cmap='coolwarm')
    plt.show()
    
    make_big()
    sns.boxplot(x='atemp', y='temp', data=df)
    plt.xticks(rotation=70)
    
    make_big()
    sns.pairplot(df, hue='season')
    
def get_dt(dt):
    return datetime.datetime.fromisoformat(dt)

class Mappers:
    _season_mapper = {
        1: 'spring',
        2: 'summer',
        3: 'fall',
        4: 'winter'
    }

    _weekday_mapper = {
        0: 'monday',
        1: 'tuesday',
        2: 'wednesday',
        3: 'thurday',
        4: 'friday',
        5: 'saturday',
        6: 'sunday'  
    }
        
    def season_mapper(s_ord):
        return Mappers._season_mapper[s_ord]
    
    
    def weekday_mapper(w_ord):
        return Mappers._weekday_mapper[w_ord]
    
    
    def temp_mapper(temp=None, 
                    low=12, 
                    high=29):
        if temp is None:
            raise ValueError(f'Function agurment temp must be a number, got {temp}.')
        if temp<low//2:
            return 'freezing'
        elif temp<low:
            return 'cold'
        elif temp>=high*1.33:
            return 'hades'
        elif temp <= high:
            return 'nice'
        else:
            return 'hot'

@timer_func
def get_dx_old():
    
    dd = get_dd()
    get_datetime = get_dt
    from scipy.constants import convert_temperature
    dx = dd.copy()
    
    dx['tempcat'] = list(map(Mappers.temp_mapper, dx.temp))
    dts = dx['datetime'].copy()
    dx['hour'] = dts.apply(func= lambda x: get_dt(x).hour)
    dx['weekday'] = dts.apply(func= lambda x: get_datetime(x).weekday())
    dx['weekday'] = dx.weekday.apply(func=lambda x: Mappers.weekday_mapper(x))
    dx['year'] = dts.apply(func= lambda x: (get_datetime(x).year - 2011)%2011+1)
    dx['month'] = dts.apply(func=lambda x: get_datetime(x).month )
    dx['approx_temp'] = dx.temp.apply(func=lambda x: round(x,1))
    low_temp_F, high_temp_F=49,78
    dx['temp_cat'] = dx.temp.apply(func=lambda x : Mappers.temp_mapper(temp=x,low=convert_temperature(low_temp_F,'f','c'),high=convert_temperature(high_temp_F,'f','c')))
    dx['season'] = dx.season.apply(func=lambda x: Mappers.season_mapper(x))
    return dx


def format_title(**kwargs):
    return ';'.join([ f'{k}={v}' for k,v in kwargs.items() ])
    
dx = get_dx_old()

def do_plot(x,y,hue,data=dx):
    dx_=data.copy()
    fig, ax = make_big()
    sns.boxplot(x=dx_[x],y=dx_[y],hue=dx_[hue])
    plt.xticks(rotation=70)
    plt.title(format_title(y=y, x=x, hue=hue))
    plt.legend(loc=4)
    plt.show()


def read_input(infile):    
    return pd.read_csv(infile).drop(columns='Unnamed: 0').sort_values(by='datetime').reset_index(drop=True)

    
@timer_func
def get_dx_new(verbose=False):

    low_temp_F, high_temp_F=49,78
    dd = get_dd()
    dx = dd.copy()
    
    get_datetime   = get_dt
    weekday_mapper = Mappers.weekday_mapper
    season_mapper  = Mappers.season_mapper
    temp_mapper    = Mappers.temp_mapper
    fromisoformat  = datetime.datetime.fromisoformat
    
    dx['tempcat'] = list(map(Mappers.temp_mapper, dx.temp))
    dts = dx['datetime'].copy()
    dx['weekday'] = dts.apply(func=lambda x: weekday_mapper(get_datetime(x).weekday()))
    dx['weekday2'] = [ weekday_mapper(fromisoformat(dx.iloc[idx].datetime).weekday()) for idx in range(len(dx)) ]
    
    dx['year'] = dts.apply(func= lambda x: (get_datetime(x).year - 2011)%2011+1)
    dx['month'] = dts.apply(func=lambda x: get_datetime(x).month )
    dx['hour'] = [ get_dt(x).hour for x in dx.datetime ]
    dx['approx_temp'] = dx.temp.apply(func=lambda x: round(x,1))
    dx['temp_cat'] = dx.temp.apply(func=lambda x : Mappers.temp_mapper(temp=x,low=convert_temperature(low_temp_F,'f','c'),high=convert_temperature(high_temp_F,'f','c')))
    dx['season'] = dx.season.apply(func=lambda x: Mappers.season_mapper(x))

    dx['hr_workingday'] = [ str(dx.iloc[idx].hour) + ','+str(dx.iloc[idx].workingday) for idx in range(len(dx))]
    dx['hr_tempcat'] = [ str(dx.iloc[idx].hour) + ','+str(dx.iloc[idx].tempcat) for idx in range(len(dx))]                                                                         
    dx['hr_season'] = [ str(dx.iloc[idx].hour) + ','+str(dx.iloc[idx].season) for idx in range(len(dx))]
    dx['hr_month'] = [ str(dx.iloc[idx].hour) + ',' + str(dx.iloc[idx].month) for idx in range(len(dx))]
    dx['hr_weekday'] = [ str(dx.iloc[idx].hour) + ',' +str(dx.iloc[idx].weekday) for idx in range(len(dx))]
    dx['hr_weekday_season'] =  [ str(dx.iloc[idx].hr_weekday)+','+str(dx.iloc[idx].season) for idx in range(len(dx))]
    dx['hr_workingday_season'] = [ str(dx.iloc[idx].hr_workingday)+','+str(dx.iloc[idx].season) for idx in range(len(dx)) ]
    
    dx['yr_month'] = [ str(get_dt(x).year) + ','+str(get_dt(x).month) for x in dx.datetime ]
    dx['yr_season'] = [ str(get_dt(dx.iloc[idx].datetime).year) + ','+str(dx.iloc[idx].season) for idx in range(len(dx)) ]
    dx['yr_season_weekday']= [ row.yr_season+','+str(row.weekday) for idx, row in dx.iterrows() ] 
    
    dx['season_weekday'] = [ ','.join(x.split(',')[1:]) for x in dx.yr_season_weekday ]
    dx['yr_season_weekday_hr'] = [ row.yr_season_weekday+','+str(row.hour) for idx,row in dx.iterrows() ]
  

    if verbose:
        print(dx[['datetime','year','hour','month','hr_season','yr_month','yr_season']])
        print()
        print('feature name,','number of disctint values')
        print('set of distinct values')
        print('='*80)
        for time_var in [#'datetime',
                         'season',
                         'year',
                         'hour',
                         'month',
                         'hr_season',
                         'yr_month',
                         'yr_season']:
            print()
            print(time_var,len(list(sorted(list(set(dx[time_var]))))))
            print(list(sorted(list(set(dx[time_var])))))

    return dx.reset_index(drop=True)
    

def process_input(infile, 
                  kind='train',
                  verbose=False):
    print('Processing:', infile, 'kind:', kind, 'filename:', infile)
                                  
    if kind not in ('train', 'test'):
        raise ValueError(f'kind must be set to either "train" or "test"; got {kind}')
        
    
    def get_dt(dt):
        return datetime.datetime.fromisoformat(dt)
    
    dx = get_dx_new()

    if kind == 'train':
        y_vars = [
            'casual', 
            'registered', 
            'count']
    elif kind == 'test':
        y_vars = []

    x_continuous = [
            'temp',
            'humidity',
            'windspeed']

    drop_timevars = [
            'datetime',
            'index',
            'season',
            'year',
            'hour',
            'month']

    x_categorical = [ col for col in dx.columns if col not in y_vars+x_continuous+drop_timevars ] 

    if verbose:
        print('Y:', y_vars)
        print('X_cont:', x_continuous)
        print('X_cat:', x_categorical)
        print('All:', dx.columns)
        
    d_y = dx[y_vars]
    d_xcat = dx[x_categorical]
    d_xcont = dx[x_continuous]
    
    return d_y, d_xcat, d_xcont
    
    
def get_d_y(**kwargs):
    return process_input(**kwargs)[0]

        
# preprocessors
def get_vectorized(data,
                   sparse=False,
                   filter_data=False):
    if filter_data:
        data = data[filter_data]
        
    v = DictVectorizer(sparse=sparse)
    return v, v.fit_transform(data.to_dict('records'))
        
        
def get_dict_vectorized(data, sparse=False, filter_data=False):
    if filter_data:
        data = data[filter_data]
    return DictVectorizer(sparse=sparse).fit_transform(data.to_dict('records'))

        
def get_minmax_scaled(data:pd.DataFrame):
    data = data.copy()
    scaler = MinMaxScaler()
    return scaler, pd.DataFrame(columns = data.columns,
                 data = scaler.fit_transform(data.to_numpy()))
        

def get_dummies(data, drop_first=False, filter_data=False):
    if filter_data:
        data = data[filter_data]
    return pd.get_dummies(data, drop_first=drop_first)




def get_x_train(infile='training_data.csv'):
    '''
    def get_x_train(inp='training_data.csv')->np.array
    '''
    train_y, train_xcat, train_xcont = process_input(infile=infile,
                                                     kind='train')
    vectorizer,xvectorized = get_vectorized(train_xcat)
    scaler, xscaled = get_minmax_scaled(data=train_xcont)
    return np.concatenate((xvectorized, xscaled), 
                          axis=1)

def get_x_test(infile='test_data.csv'):
    test_xcat, test_xcont = process_input('test_data.csv',
                                      kind='test')
    vectorizer,xvectorized = get_vectorized(test_xcat)
    scaler, xscaled = get_minmax_scaled(data=test_xcont)
    return np.concatenate((xvectorized, xscaled), 
                          axis=1)
    

def get_y_train(infile='training_data.csv',
                minmaxscale=False):
    
    train_y, train_xcat, train_xcont = process_input(infile=infile,
                                                     kind='train')
    return train_y


def get_train_test_split(minmax_y=False,
                         shuffle=False):
    
    X_TRAIN = get_x_train()
    Y_TRAIN = get_y_train()
    print(X_TRAIN.shape)
    print(Y_TRAIN.shape)

    X_TEST = get_x_test()
    print(X_TEST.shape)

    if minmax_y:
        yscaler, y = get_minmax_scaled(Y_TRAIN)
    else:
        y_scaler, y = None, Y_TRAIN
    X = X_TRAIN
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=shuffle)

    if minmax_y:
        return X_train, X_test, y_train, y_test, y_scaler
    
    return X_train, X_test, y_train, y_test


# hues = ['hour','temp_cat','weekday','month']
# xs = ['workingday',
#       'weekday',
#       'temp_cat',
#       'temp',
#       #'month',
#       'holiday',
#       'temp_cat']



# plots = []


# for h in hues:
#     for x in xs:
#         for y in ['casual', 'registered']:
#             if h==x:
#                 continue
#             this = {
#                 'y':y,
#                 'x':x,
#                 'hue':h}
#             plots.append(this)
            

# for i, plot in enumerate(plots):
#     print(plot)
#  #   do_plot(**plots[i])
    
# for dep_var in ['count','registered','casual']:
#     for wd in (0,1):
#         d = dx[dx.workingday==wd]
#         fig, ax = make_big()
#         plt.scatter(range(len(d.index)), d[dep_var])
#         plt.ylabel('count')
#         plt.xlabel('Time (sample index of date sorted data)')
#         plt.title(f'workingday={wd}, dep_var={dep_var}')
#         plt.show()

# d = dx[dx.workingday==1]
# fig, ax = make_big()
# plt.scatter(range(len(d.index)), d['count'])
# plt.ylabel('count')
# plt.xlabel('Sample Index (sorted by date)')
# plt.show()
# plt.hist(d.temp_cat)
# plt.title('temp categories')


# fig, ax=make_big()
# sns.boxplot(y='count',x='season', hue='year', data=dx[dx.workingday==1])
# plt.show()

# # sns.boxplot(y='count',x='workingday',data=dd)
# # plt.show()


# from scipy.constants import convert_temperature

# def get_datetime(x):
#     return datetime.datetime.fromisoformat(x)


# season_mapper = {
#     1: 'spring',
#     2: 'summer',
#     3: 'fall',
#     4: 'winter'
# }

# weekday_mapper = {
#     0: 'monday',
#     1: 'tuesday',
#     2: 'wednesday',
#     3: 'thurday',
#     4: 'friday',
#     5: 'saturday',
#     6: 'sunday'
    
# }


    
# def temp_mapper(temp=None, 
#                 low=12, 
#                 high=29):

#     if temp is None:
#         raise ValueError(f'temp must be a number. got {temp}')

#     if temp<low//2:
#         return 'freezing'
#     elif temp<low:
#         return 'cold'
#     elif temp>=high*1.33:
#         return 'hades'
#     elif temp <= high:
#         return 'nice'
#     else:
#         return 'hot'
    
    
# import os
# os.getcwd()

# import time

# get_dx_new()

Function 'get_dx_old' executed in 0.0937s
Function 'get_dx_new' executed in 9.4779s


Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,casual,registered,count,...,hr_season,hr_month,hr_weekday,hr_weekday_season,hr_workingday_season,yr_month,yr_season,yr_season_weekday,season_weekday,yr_season_weekday_hr
0,spring,0,0,1,9.84,81,0.0000,3,13,16,...,"0,spring",01,"0,saturday","0,saturday,spring","0,0,spring",20111,"2011,spring","2011,spring,saturday","spring,saturday","2011,spring,saturday,0"
1,spring,0,0,1,9.84,75,0.0000,0,1,1,...,"4,spring",41,"4,saturday","4,saturday,spring","4,0,spring",20111,"2011,spring","2011,spring,saturday","spring,saturday","2011,spring,saturday,4"
2,spring,0,0,1,9.02,80,0.0000,2,0,2,...,"6,spring",61,"6,saturday","6,saturday,spring","6,0,spring",20111,"2011,spring","2011,spring,saturday","spring,saturday","2011,spring,saturday,6"
3,spring,0,0,1,8.20,86,0.0000,1,2,3,...,"7,spring",71,"7,saturday","7,saturday,spring","7,0,spring",20111,"2011,spring","2011,spring,saturday","spring,saturday","2011,spring,saturday,7"
4,spring,0,0,1,9.84,75,0.0000,1,7,8,...,"8,spring",81,"8,saturday","8,saturday,spring","8,0,spring",20111,"2011,spring","2011,spring,saturday","spring,saturday","2011,spring,saturday,8"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8703,winter,0,1,1,15.58,50,23.9994,23,546,569,...,"18,winter",1812,"18,wednesday","18,wednesday,winter","18,1,winter",201212,"2012,winter","2012,winter,wednesday","winter,wednesday","2012,winter,wednesday,18"
8704,winter,0,1,1,15.58,50,26.0027,7,329,336,...,"19,winter",1912,"19,wednesday","19,wednesday,winter","19,1,winter",201212,"2012,winter","2012,winter,wednesday","winter,wednesday","2012,winter,wednesday,19"
8705,winter,0,1,1,14.76,57,15.0013,10,231,241,...,"20,winter",2012,"20,wednesday","20,wednesday,winter","20,1,winter",201212,"2012,winter","2012,winter,wednesday","winter,wednesday","2012,winter,wednesday,20"
8706,winter,0,1,1,13.94,61,15.0013,4,164,168,...,"21,winter",2112,"21,wednesday","21,wednesday,winter","21,1,winter",201212,"2012,winter","2012,winter,wednesday","winter,wednesday","2012,winter,wednesday,21"


In [707]:
dx = get_dx_new()

Function 'get_dx_new' executed in 9.4153s


In [759]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
# OneHotEncoder?
cols = dx.columns

ohec = OneHotEncoder()
tr_oh_dx = ohec.fit_transform(dx)


# trdx

orec = OrdinalEncoder()
tr_oh_dx = orec.fit_transform(dx)

pca = PCA(n_components=3)
transformed_data = PCA.fit(tr_oh_dx)



array([[1.000e+00, 0.000e+00, 0.000e+00, ..., 9.000e+00, 9.000e+00,
        2.160e+02],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 9.000e+00, 9.000e+00,
        2.340e+02],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 9.000e+00, 9.000e+00,
        2.360e+02],
       ...,
       [3.000e+00, 0.000e+00, 1.000e+00, ..., 5.500e+01, 2.700e+01,
        1.333e+03],
       [3.000e+00, 0.000e+00, 1.000e+00, ..., 5.500e+01, 2.700e+01,
        1.334e+03],
       [3.000e+00, 0.000e+00, 1.000e+00, ..., 5.500e+01, 2.700e+01,
        1.335e+03]])

## from sklearn.decomposition import PCA
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder


pca = PCA(n_components=3)
data = dx[['hour','temp','humidity']].to_numpy()
transformed_data = pca.fit_transform(data)
import matplotlib.pyplot as plt

# Create a scatter plot of the first two principal components
plt.scatter(transformed_data[:, 0], transformed_data[:, 1])
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.show()

In [735]:
pca.get_covariance()

array([[ 47.88628656,   7.48095422, -37.52975449],
       [  7.48095422,  60.35689838,  -9.30601252],
       [-37.52975449,  -9.30601252, 372.99341809]])

In [772]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA

 
class OrdinalEncoderFactory:
    def __init__(self, data):
        self.encoder = OrdinalEncoder()
        self.data = data
        self.transformed_data = self.encoder.fit_transform(self.data)
        self.columns = data.columns
        
class OneHotEncoderFactory:
    def __init__(self, data):
        self.encoder = OneHotEncoder()
        self.data = data
        self.transformed_data = self.encoder.fit_transform(self.data)
        self.columns
        
class PCAFractory(PCA):
    
    def __init__(self, 
                 n_components=3, 
                 data=None,
                 encoder=OrdinalEncoder()):
        super().__init__(n_components=n_components)
        self.data = data
        self.n_components = n_components
        self.pca = PCA(n_components=self.n_components)
        self.encoder = encoder
        self.encoded_data = self.encoder.fit_transform(self.data)
        self.transformed_data = self.pca.fit_transform(self.encoded_data)
        

    def scatter_components(self, comp1=0, comp2=1):
        plt.scatter(transformed_data[:, comp1], transformed_data[:, comp2])
        plt.xlabel(f"Principal Component {comp1}")
        plt.ylabel(f"Principal Component {comp2}")
        plt.show()

    def biplot(self):
        # Create a biplot
        def draw_vector(v0, v1, ax=None):
            ax = ax or plt.gca()
            arrowprops=dict(arrowstyle='->',
                            linewidth=2,
                            shrinkA=0, shrinkB=0)
            ax.annotate('', v1, v0, arrowprops=arrowprops)

        # plot data
        plt.scatter(transformed_data[:, 0], transformed_data[:, 1], alpha=0.2)
        for length, vector in zip(pca.explained_variance_ratio_, pca.components_):
            v = vector * 3 * np.sqrt(length)
            draw_vector(pca.mean_, pca.mean_ + v)
        plt.axis('equal')
    
   
        
        
# oc=OrdinalEncoder()
# data = dx[['hour','hr_weekday', 'hr_weekday_season']]
# cols = data.columns


# transformed_data = oc.fit_transform(dx[['hour','hr_weekday', 'hr_weekday_season']])
# data.to_numpy()



In [776]:


make_pipeline?

[0;31mSignature:[0m [0mmake_pipeline[0m[0;34m([0m[0;34m*[0m[0msteps[0m[0;34m,[0m [0mmemory[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Construct a Pipeline from the given estimators.

This is a shorthand for the Pipeline constructor; it does not require, and
does not permit, naming the estimators. Instead, their names will be set
to the lowercase of their types automatically.

Parameters
----------
*steps : list of estimators.

memory : str or object with the joblib.Memory interface, default=None
    Used to cache the fitted transformers of the pipeline. By default,
    no caching is performed. If a string is given, it is the path to
    the caching directory. Enabling caching triggers a clone of
    the transformers before fitting. Therefore, the transformer
    instance given to the pipeline cannot be inspected
    directly. Use the attribute ``named_steps`` or ``steps`

In [42]:
dx = get_dx_new()
dy = dx[['casual','registered','count']]
dd = dx.drop(columns=list(dy.columns))
y = dy['casual']
X = pd.get_dummies(dd)

Function 'get_dx_new' executed in 10.0052s


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.preprocessing import  OrdinalEncoder, OneHotEncoder, LabelEncoder, PolynomialFeatures, Normalizer
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline


models = {
    'ridge': RidgeCV,
    'elasticnet': ElasticNetCV,
    'lasso': LassoCV
}



@timer_func
def main(dep_var, modelname, pipelines=None):
    
    if pipelines is None:
        pipelines=dict()
    
    label = modelname+','+dep_var
    m = models[modelname]
    model = m()
    print(label)

    pipeline = make_pipeline(MinMaxScaler(), model)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

    pipeline.fit(X_train, y_train)
    y_predict = pipeline.predict(X_test)

    y_true = y_test
    
    fig, ax = make_big()
    plt.scatter(y_true,y_predict, label=label)

    maxx = max(max(y_true),max(y_true))
    plt.plot(range(maxx),range(maxx), color='k',linewidth=3)

    plt.title(f'Results')
    plt.xlabel('y_true')
    plt.ylabel('y_predict')

    plt.legend()
    plt.show()
    
    fig, ax = make_big()
    plt.plot(y_true,y_predict, label=label)
    
    return pipelines

    
pipelines = dict()
for dep_var in ['casual', 'registered']:
    for modelname in models.keys():  
        pipelines = main(dep_var, modelname, pipelines=pipelines)

In [798]:
dx.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'humidity',
       'windspeed', 'casual', 'registered', 'count', 'datetime', 'tempcat',
       'weekday', 'weekday2', 'year', 'month', 'hour', 'approx_temp',
       'temp_cat', 'hr_workingday', 'hr_tempcat', 'hr_season', 'hr_month',
       'hr_weekday', 'hr_weekday_season', 'hr_workingday_season', 'yr_month',
       'yr_season', 'yr_season_weekday', 'season_weekday',
       'yr_season_weekday_hr'],
      dtype='object')

In [None]:


class PCAresult(indata):
    oc = OrdinalEncoder()
    cols = indata.columns
    


In [703]:
from time import time

@timer_func
def time_get_dx_new():
    return get_dx_new()
    
DX = time_get_dx_new()


def gen_poly_interaction(*cols, data=DX):
    print('using func')
    if data is None:
        data = get_dx_new()
    return [ ','.join([ str(data.iloc[idx][col]) for col in cols ]) for idx in range(len(data)) ]


start = 0
DX['hr_wkd1'] = gen_poly_interaction('hour','weekday')

print('using apply')
DX['hr_wkd2'] = [ ','.join(list(map(str,[row.hour,row.weekday]))) for idx, row in dx.iterrows() ]

print(all(DX['hr_weekday'] == DX['hr_wkd1']))
print(all(DX['hr_wkd1'] == DX['hr_wkd2']))



Function 'time_get_dx_new' executed in 6.6193s
using func
using apply
True
True


In [704]:
DX

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,casual,registered,count,...,hr_season,hr_weekday,yr_month,yr_season,yr_season_weekday,season_weekday,yr_season_weekday_hr,season_weekday_hr,hr_wkd1,hr_wkd2
0,spring,0,0,1,9.84,81,0.0000,3,13,16,...,"0,spring","0,saturday",20111,"2011,spring","2011,spring,saturday","spring,saturday","2011,spring,saturday,0","spring,saturday,0","0,saturday","0,saturday"
1,spring,0,0,1,9.84,75,0.0000,0,1,1,...,"4,spring","4,saturday",20111,"2011,spring","2011,spring,saturday","spring,saturday","2011,spring,saturday,4","spring,saturday,4","4,saturday","4,saturday"
2,spring,0,0,1,9.02,80,0.0000,2,0,2,...,"6,spring","6,saturday",20111,"2011,spring","2011,spring,saturday","spring,saturday","2011,spring,saturday,6","spring,saturday,6","6,saturday","6,saturday"
3,spring,0,0,1,8.20,86,0.0000,1,2,3,...,"7,spring","7,saturday",20111,"2011,spring","2011,spring,saturday","spring,saturday","2011,spring,saturday,7","spring,saturday,7","7,saturday","7,saturday"
4,spring,0,0,1,9.84,75,0.0000,1,7,8,...,"8,spring","8,saturday",20111,"2011,spring","2011,spring,saturday","spring,saturday","2011,spring,saturday,8","spring,saturday,8","8,saturday","8,saturday"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8703,winter,0,1,1,15.58,50,23.9994,23,546,569,...,"18,winter","18,wednesday",201212,"2012,winter","2012,winter,wednesday","winter,wednesday","2012,winter,wednesday,18","winter,wednesday,18","18,wednesday","18,wednesday"
8704,winter,0,1,1,15.58,50,26.0027,7,329,336,...,"19,winter","19,wednesday",201212,"2012,winter","2012,winter,wednesday","winter,wednesday","2012,winter,wednesday,19","winter,wednesday,19","19,wednesday","19,wednesday"
8705,winter,0,1,1,14.76,57,15.0013,10,231,241,...,"20,winter","20,wednesday",201212,"2012,winter","2012,winter,wednesday","winter,wednesday","2012,winter,wednesday,20","winter,wednesday,20","20,wednesday","20,wednesday"
8706,winter,0,1,1,13.94,61,15.0013,4,164,168,...,"21,winter","21,wednesday",201212,"2012,winter","2012,winter,wednesday","winter,wednesday","2012,winter,wednesday,21","winter,wednesday,21","21,wednesday","21,wednesday"


In [700]:
# gen_poly_interaction('hour','weekday')

In [701]:
old = get_dx_old()
new = get_dx_new()

for col in old.columns:
    not(col in new.columns) and print(f'Missing column "{col}" in new.columns.')
    not(all(new[col] == old[col])) and print(f'Column "{col}" has some different values in old, new')
    

NameError: name 'idx' is not defined

In [683]:
all(new['hr_tempcat_'] == new['hr_tempcat'])

True

In [679]:
# old['year'].values[:100]

In [656]:
old9,new9 = old.iloc[9],new.iloc[9]
#

# new9[list(old.columns)] == old9

for col in old.columns:
    print(col,new9[col], old9[col])
    
dict(new9)

season spring spring
holiday 0 0
workingday 0 0
weather 2 2
temp 18.86 18.86
humidity 72 72
windspeed 19.0012 19.0012
casual 35 35
registered 71 71
count 106 106
datetime 2011-01-01 14:00:00 2011-01-01 14:00:00
tempcat nice nice
hour 14 14
weekday friday friday
year 1 1
month 7 7
approx_temp 28.7 28.7
temp_cat nice nice


{'season': 'spring',
 'holiday': 0,
 'workingday': 0,
 'weather': 2,
 'temp': 18.86,
 'humidity': 72,
 'windspeed': 19.0012,
 'casual': 35,
 'registered': 71,
 'count': 106,
 'datetime': '2011-01-01 14:00:00',
 'tempcat': 'nice',
 'weekday': 'friday',
 'weekday2': 'saturday',
 'year': 1,
 'month': 7,
 'approx_temp': 28.7,
 'temp_cat': 'nice',
 'hour': 14,
 'hr_workingday': '14,0',
 'hr_tempcat': '14,nice',
 'hr_season': '14,spring',
 'yr_month': '2011,1',
 'yr_season': '2011,spring',
 'yr_season_weekday': '2011,spring,friday',
 'season_weekday': 'spring,friday',
 'hr_weekday': '14,friday',
 'yr_season_weekday_hr': '2011,spring,friday,14',
 'season_weekday_hr': 'spring,friday,14'}

In [657]:
for i in range(len(old)):
    o = int(old.hour.values[i])
    n = int(new.hour.values[i])
    o - n and print(i, o, n, o - n)


0 16 0 16
1 10 4 6
2 5 6 -1
3 2 7 -5
4 16 8 8
5 1 10 -9
6 12 11 1
7 9 12 -3
8 16 13 3
10 17 15 2
13 22 20 2
14 20 21 -1
15 1 22 -21
16 16 0 16
17 0 1 -1
18 14 2 12
19 13 3 10
20 5 4 1
21 9 6 3
22 13 7 6
23 0 8 -8
24 2 9 -7
25 15 11 4
26 6 12 -6
27 6 13 -7
28 8 14 -6
29 1 16 -15
30 11 17 -6
32 19 20 -1
33 11 21 -10
34 18 22 -4
35 7 23 -16
37 23 4 19
39 22 6 16
40 6 9 -3
41 5 11 -6
42 4 12 -8
43 12 14 -2
45 13 16 -3
46 14 17 -3
47 16 18 -2
48 20 19 1
49 21 20 1
50 16 22 -6
51 10 23 -13
52 18 0 18
53 5 1 4
54 3 2 1
55 21 4 17
56 8 5 3
58 16 9 7
59 14 10 4
60 0 11 -11
61 23 12 11
62 1 13 -12
63 13 14 -1
64 4 16 -12
65 10 17 -7
66 12 18 -6
67 17 19 -2
68 8 21 -13
69 19 22 -3
70 8 23 -15
71 18 0 18
72 18 1 17
73 0 2 -2
74 0 4 -4
75 23 5 18
76 0 6 -6
77 5 7 -2
78 22 8 14
79 13 9 4
80 14 11 3
81 2 12 -10
82 22 13 9
83 16 14 2
84 11 15 -4
86 18 19 -1
87 14 20 -6
88 13 21 -8
89 9 22 -13
90 4 0 4
91 22 1 21
92 19 2 17
93 1 6 -5
94 16 8 8
95 2 9 -7
96 23 10 13
97 16 11 5
98 14 12 2
99 16 14 2
100 

In [560]:
def f(*args):
    return tuple(args)


f(1,2,3,99)[-1]

99

In [27]:
class Singleton:
    _instance = None
    def __new__(cls):
        if not cls._instance:
            cls._instance = super().__new__(cls)
        return cls._instance
    @classmethod
    def get_instance(cls):
        return cls()


In [39]:
x=Singleton
y=Singleton()
x._instance == x()

False

In [542]:
class InitCounter:
    where = 'global'
    init_count=0
    def __eq__(self, other):
        return self.__dict__ == other.__dict__

class MyClass(InitCounter):

    init_count = 0
    ic = InitCounter
    
    def __init__(self, arg1=0, arg2=9):
        self.arg1 = arg1
        self.arg2 = arg2
        self.increment_init_count()
        
    @classmethod
    def increment_init_count(cls):
        cls.init_count += 1
        cls.ic.init_count += 1
        
    @classmethod
    def from_dict(cls, data):
        arg1 = data['a']
        arg2 = data['b']
        return cls(arg1, arg2)
    
    def fromDict(data):
        arg1 = data['a']
        arg2 = data['b']
        return MyClass(arg1, arg2)
    
    def __eq__(self, other):
        return self.__dict__ == other.__dict__
    
    def proof():
        return MyClass(1,2) == MyClass.from_dict({'a':1,'b':2}) == MyClass.fromDict({'a':1,'b':2})

4

In [323]:
# MyClass.InitCounter

-100

In [25]:
class SelfReferential:
    a = 'A'
    b = 'B'

    @classmethod
    def get_a(cls):
        return cls.a
    
    def __init__(self,a=None,b=None):
        self.a = a or 'A'
        self.b = b or 'B'
        
#     def get_a(self):
#         return self.a
   
    @classmethod
    def get_b(self):
        return self.b
    
#     def get_funccall_values():
#         return [ SelfReferential.get_a(), SelfReferential.get_b() ]

In [26]:
SelfReferential.get_a()

TypeError: get_a() missing 1 required positional argument: 'cls'

In [569]:
get_datetime = get_dt

weekday_mapper = Mappers.weekday_mapper
temp_mapper = Mappers.temp_mapper
season_mapper = Mappers.season_mapper

dx = get_dd()
dts = dx['datetime'].copy()
dx['hour'] = dts.apply(func= lambda x: get_datetime(x).hour)
dx['weekday'] = dts.apply(func= lambda x: get_datetime(x).weekday())
dx['weekday'] = dx.weekday.apply(func=lambda x: weekday_mapper(x))
dx['year'] = dts.apply(func= lambda x: (get_datetime(x).year - 2011)%2011+1)
dx['month'] = dts.apply(func=lambda x: get_datetime(x).month )
# dx['temp_C'] = dx.temp
# dx['temp_F'] = dx.temp.apply(func=lambda x: convert_temperature(x, 'c', 'f'))
dx['approx_temp'] = dx.temp.apply(func=lambda x: round(x,1))

# temperature categories
low_temp_F, high_temp_F=49,78


dx['temp_cat'] = dx.temp.apply(func=lambda x : temp_mapper(temp=x,
                                                              low=convert_temperature(low_temp_F,'f','c'),
                                                              high=convert_temperature(high_temp_F,'f','c'))
                                 )
dx['season'] = dx.season.apply(func=lambda x: season_mapper(x))



def format_title(**kwargs):
    return ';'.join([ f'{k}={v}' for k,v in kwargs.items() ])
    
# def do_plot(x,y,hue):
#     dx=df.copy()
#     make_big()
#     sns.boxplot(x=dx[x],y=dx[y],hue=dx[hue])
#     plt.xticks(rotation=70)
#     plt.title(format_title(y=y, x=x, hue=hue))
#     plt.legend(loc=4)
#     plt.show()

    

# hues = ['hour','temp_cat', 'weekday']
# xs = ['workingday','weekday','temp_cat','month','holiday','temp_cat']



# plots1 = []


# for h in hues:
#     for x in xs:
#         for y in ['casual', 'registered']:
#             if h==x:
#                 continue
#             this = {
#                 'y':y,
#                 'x':x,
#                 'hue':h}
#             plots1.append(this)
    
# if SHOW_ALL_PLOTS:
#     print(f'Plotting...')
#     pprint(plots1)
#     [ do_plot(**plots1[i]) for i in range(len(plots1)) ]
    
# dx = get_dd()

dx['hr_season'] = [ str(get_dt(dx.iloc[idx].datetime).time().hour) + ','+str(dx.iloc[idx].season) for idx in range(len(dx))]
dx['yr_month'] = [ str(get_dt(x).year) + ','+str(get_dt(x).month) for x in dx.datetime ]
dx['yr_season'] = [ str(get_dt(dx.iloc[idx].datetime).year) + ','+str(dx.iloc[idx].season) for idx in range(len(dx)) ]

dx['weekday'] = [ weekday_mapper(datetime.datetime.fromisoformat(dx.iloc[idx].datetime).weekday()) for idx in range(len(dx)) ]
dx['yr_season_weekday']= [ row.yr_season+','+str(row.weekday) for idx, row in dx.iterrows() ] 
dx['season_weekday'] = [ ','.join(x.split(',')[1:]) for x in dx.yr_season_weekday ]

dx['hr_weekday'] = [ ','.join(list(map(str,[row.hour,row.weekday]))) for idx, row in dx.iterrows() ]
dx['yr_season_weekday_hr'] = [ row.yr_season_weekday+','+str(row.hour) for idx,row in dx.iterrows() ]

dx['season_weekday_hr'] =  [ ','.join(x.split(',')[1:]) for x in dx['yr_season_weekday_hr'] ]
dx['workday_hr'] = [ ','.join([str(row.workingday), str(row.hour)]) for idx, row in dx.iterrows() ]

print(dx[['datetime','year','hour','month','hr_season','yr_month','yr_season', 'weekday']])
print()
print('feature name,','number of disctint values')
print('set of distinct values')
print('='*80)
for time_var in [#'datetime',
                 'season',
                 'year',
                 'hour',
                 'month',
                 'hr_season',
                 'yr_month',
                 'yr_season',
                 'weekday']:
    print()
    print(time_var,len(list(sorted(list(set(dx[time_var]))))))
    print(list(sorted(list(set(dx[time_var])))))
# dx



dx = dx.reset_index(drop=True)

y_vars = [
        'casual', 
        'registered', 
        'count']

x_continuous = [
        'temp',
        'humidity',
        'windspeed']

drop_timevars = [
        'datetime',
        'index',
        'season',
        'year',
        'hour',
        'month']

x_categorical = [ col for col in dx.columns if col not in y_vars+x_continuous+drop_timevars ] 

print('Y:', y_vars)
print('X_cont:', x_continuous)
print('X_cat:', x_categorical)
print('All:', dx.columns)

d_y = dx[y_vars]
d_xcat = dx[x_categorical]
d_xcont = dx[x_continuous]



x_cont_scaler = MinMaxScaler()
x_cont_scaled = x_cont_scaler.fit_transform(d_xcont.to_numpy())
d_xcont_scaled = pd.DataFrame(columns=d_xcont.columns,
                              data=x_cont_scaled)

d_xcont_scaled
check = pd.DataFrame(columns=d_xcont_scaled.columns,
             data=x_cont_scaler.inverse_transform(d_xcont_scaled.to_numpy()))

d_ycont_scaled = d_y

d_ycont_scaled



                 datetime  year  hour  month  hr_season yr_month    yr_season  \
0     2011-01-01 00:00:00     1     0      1   0,spring   2011,1  2011,spring   
1     2011-01-01 04:00:00     1     4      1   4,spring   2011,1  2011,spring   
2     2011-01-01 06:00:00     1     6      1   6,spring   2011,1  2011,spring   
3     2011-01-01 07:00:00     1     7      1   7,spring   2011,1  2011,spring   
4     2011-01-01 08:00:00     1     8      1   8,spring   2011,1  2011,spring   
...                   ...   ...   ...    ...        ...      ...          ...   
8703  2012-12-19 18:00:00     2    18     12  18,winter  2012,12  2012,winter   
8704  2012-12-19 19:00:00     2    19     12  19,winter  2012,12  2012,winter   
8705  2012-12-19 20:00:00     2    20     12  20,winter  2012,12  2012,winter   
8706  2012-12-19 21:00:00     2    21     12  21,winter  2012,12  2012,winter   
8707  2012-12-19 22:00:00     2    22     12  22,winter  2012,12  2012,winter   

        weekday  
0      sa

Unnamed: 0,casual,registered,count
0,3,13,16
1,0,1,1
2,2,0,2
3,1,2,3
4,1,7,8
...,...,...,...
8703,23,546,569
8704,7,329,336
8705,10,231,241
8706,4,164,168


In [None]:
X_train, X_test, y_train, y_test = get_train_test_split()

In [None]:
import sys
from sklearn.linear_model import LinearRegression

def make_models(minmax_y=False,
                preserve_order=False):
    '''
    Uses local vars to produce an output dataframe.
    References pre-defined train/test split.
    '''
    
    _records_dict, _records = dict(), []

    shuffle=not(preserve_order)
    
    if not minmax_y:
        X_train, X_test, y_train, y_test = get_train_test_split(minmax_y=minmax_y, shuffle=shuffle)
    else:
        X_train, X_test, y_train, y_test, y_scaler = get_train_test_split(minmax_y=minmax_y, shuffle=shuffle)

    outputs,model_outputs = [],dict()
    ar = np.array
    models = {
        'ridge': RidgeCV, 
        #'svm lin': LinearSVR,
        #'svm': SVR,
       # 'linear_reg': LinearRegression,
        'elasticnet': ElasticNetCV
    }
    
    dep_vars = y_test.columns

    
    for dep_var in ['casual', 'registered']:
        for modelname, m in models.items():
            label = modelname+','+dep_var
            model = m()
         
            if modelname == 'lasso':
                model = LassoCV()
            elif modelname == 'elasticnet':
                model = ElasticNetCV()

            y_true = y_test[dep_var]
            y_tr = y_train[dep_var]

            try:
         
                print(f'fitting {label}')
                model.fit(X_train, y_tr)
                y_pred = model.predict(X_test)

                ys = y_true, y_pred
                
                e1,e2,e3,e4 = mean_absolute_error(*ys),mean_absolute_percentage_error(*ys),mean_squared_error(*ys),np.nan#ar([_**0.5 for _ in mean_squared_error(*ys)])

                #e1,e2,e3,e4 = mean_absolute_error(*ys),mean_absolute_percentage_error(*ys),mean_squared_error(*ys),mean_squared_log_error(*ys)#ar([_**0.5 for _ in mean_squared_error(*ys)])

                # label = modelname+','+dep_var

                try:
                    coefs = model.coef_
                except:
                    coefs = np.nan

                if modelname == 'svm':
                    coefs = np.nan


                this_output = {'label': label,
                               'modelname': modelname,
                               'model': model,
                               'dep_var': dep_var,
                               'strmodel': str(model),
                               'coefs': coefs,
                               'mean_absolute_error': float(e1),
                               'mean_absolute_percentage_error': float(e2),
                               'mean_squared_error': float(e3),
                               'mean_squared_log_error': float(e4),
                               'y_true': y_true,
                               'y_pred': y_pred,
                               'error': ''}

                # raw_model_outputs[modelname+','+dep_var] = (results)
                _records_dict[label]=this_output
                _records.append(this_output)

                # print(this_output)
    #             print(*results)
            except:
                print(sys.exc_info())
                keys_ = ['modelname',
                         'model',
                         'dep_var',
                         'strmodel',
                         'coefs',
                         'mean_absolute_error',
                         'mean_absolute_percentage_error',
                         'mean_squared_error',
                         'y_true',
                         'y_pred',
                         'error']
                this_output = { _:np.nan for _ in keys_ }
                this_output['error'] = str(sys.exc_info())
                _records_dict[label] = this_output
                _records.append(this_output)
                
    dm = pd.DataFrame.from_records(_records)
    dm['y_true']=dm.y_true.apply(lambda x: x.values)
    dm['ytr']=dm.y_true.apply(lambda x: ','.join(list(map(str, x))))
    dm['y_pred']=dm.y_pred.apply(lambda x: np.array(list(map(lambda y: round(y,1), x))))
    dm['ypr']=dm.y_pred.apply(lambda x: ','.join(list(map(str, x))))

    return dm, _records_dict, _records

In [None]:
dm, model_outputs, records = make_models(preserve_order=True)

print(dm.columns)

dm['rms_error'] = dm.mean_squared_error
dm = dm[~pd.isnull(dm.rms_error)]
errors = dm[['model','rms_error', 'mean_absolute_error','mean_squared_error']]
errors.rms_error = [ np.sqrt(e) for e in errors.rms_error ]
print(errors.head(4))
print(dm.head(4))
dm