Pipelines for model trainng @author Tim Copeland

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import average_precision_score

In [2]:
# Create a class to select numerical columns
class DataFrameSelector(BaseEstimator, TransformerMixin):
        
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        #add year_qnum column
        #X['year_qnum'] = X['date'].apply(lambda x: int(str(x[0:4]) + str(X['qnum'])))
        
        #ret_X = X[X['year_qnum'] < target]
        return X

SGDClassifier with both hinge loss and log loss 

In [3]:
surprise_earn_df = pd.read_csv('AFE_data/Training DFs_new/surprise_df1.csv')
full_dist_df     = pd.read_csv('AFE_data/Training DFs_new/surprise_df2.csv')

In [4]:
full_dist_df.columns

Index(['Unnamed: 0', 'TICKER', 'quarternum', 'analyst', 'forecast_value',
       'year', 'earn_value', 'std', 'Analyst_Counts', 'Max', 'Min',
       'the max_forecast-min_forecast', 'delta_return', 'Y', 'Y_up', 'Y_down',
       'quantile_0', 'quantile_10', 'quantile_20', 'quantile_30',
       'quantile_40', 'quantile_50', 'quantile_60', 'quantile_70',
       'quantile_80', 'quantile_90'],
      dtype='object')

In [5]:
#drop y_up and y_down for now
full_dist_df = full_dist_df.drop(['Y_up', 'Y_down'], axis=1)

In [6]:
#drop redundant index col
surprise_earn_df = surprise_earn_df.drop(surprise_earn_df.columns[0], axis=1)
full_dist_df = full_dist_df.drop(full_dist_df.columns[0], axis=1)

In [7]:
#replace inf's with nan's
#surprise_earn_df = surprise_earn_df.replace(np.inf, np.nan)
#full_dist_df     = full_dist_df.replace(np.inf, np.nan)

In [8]:
surprise_earn_df.head()

Unnamed: 0,surprise_median_fa,surprise_mean_fa,TICKER,year,quarternum,Skew,Kurtosis,delta_return,Y
0,10.392305,9.814955,ABBV,2015,0,-0.016738,0.83,-0.0381,0
1,10.392305,9.814955,ABBV,2015,0,-0.016738,0.83,-0.0381,0
2,10.392305,9.814955,ABBV,2015,0,-0.016738,0.83,-0.0381,0
3,0.0,0.0,ABBV,2016,2,0.0,1.26,0.0145,0
4,0.0,0.0,ABBV,2016,2,0.0,1.26,0.0145,0


In [9]:
full_dist_df.head()

Unnamed: 0,TICKER,quarternum,analyst,forecast_value,year,earn_value,std,Analyst_Counts,Max,Min,...,quantile_0,quantile_10,quantile_20,quantile_30,quantile_40,quantile_50,quantile_60,quantile_70,quantile_80,quantile_90
0,ABBV,0,171887,0.83,2015,0.89,0.005774,3,0.84,0.83,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
1,ABBV,0,109306,0.84,2015,0.89,0.005774,3,0.84,0.83,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
2,ABBV,0,84063,0.83,2015,0.89,0.005774,3,0.84,0.83,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
3,ABBV,2,105436,1.29,2016,1.26,0.042426,2,1.29,1.23,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
4,ABBV,2,130921,1.23,2016,1.26,0.042426,2,1.29,1.23,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5


In [10]:
#add a year + qnum column
to_str = lambda x: str(x)
to_int = lambda x: int(x)

surprise_earn_df['year_qnum'] = surprise_earn_df['year'].apply(to_str) + surprise_earn_df['quarternum'].apply(to_str)
surprise_earn_df['year_qnum'] = surprise_earn_df['year_qnum'].apply(to_int)

full_dist_df['year_qnum']     = full_dist_df['year'].apply(to_str) + full_dist_df['quarternum'].apply(to_str)
full_dist_df['year_qnum']     = full_dist_df['year_qnum'].apply(to_int)

In [17]:
#generates X and y trainng and test sets for a given df and year_qnum
class analyst_forecasting_model:
    #df is either the full_dist_df or the surprise_earn_df
    def __init__(self,df, predictor):
        self.df               = df
        self.predictor        = predictor
        
        self.grid_fit         = [] #contains grid search fit paramaters for the quarter
        #self.grid_fit_up      = []
        #self.grid_fit_down    = []
        
        self.predictions      = [] #contains grid search best fit predictions for the quarter
        #self.predictions_up   = []
        #self.predictions_down = []
        
        self.missed           = [] #contains sum of missed predictions / total # predictions for the quarter
        #self.missed_up        = []
        #self.missed_down      = []
        
        self.pr_curve         = [] #precision recall curve
    
        #pre processes data 
        #self.full_pipeline = self.create_full_pipeline_with_predictor()
        
        self.full_pipeline = Pipeline([        
            ('selector', DataFrameSelector()),
            ('imputer1', Imputer(strategy="median")),
            ('imputer2', Imputer(missing_values = np.inf, strategy="median")),
            ('imputer3', Imputer(missing_values = -np.inf, strategy="median")),
            ('std_scaler', StandardScaler())
        ])
        
    def run_all_quarters(self):
        #need to skip first quarter because we don't have trainng data for it
        for i in self.df['year_qnum'].unique()[1:-1]: 
            self.train_predict(i)
    
    def data_sets(self, year_qnum):
        df = self.df
        
        #train and test data
        X_train      = df[df['year_qnum'] < year_qnum]
        y_train      = X_train['Y']
        #y_train_up   = X_train['Y_up']
        #y_train_down = X_train['Y_down']

        #X_train = X_train.drop(['Y', 'Y_up', 'Y_down', 'year', 'TICKER', 'year_qnum'], axis=1)
        X_train = X_train.drop(['Y', 'year', 'TICKER', 'year_qnum'], axis=1)

        X_test      = df[df['year_qnum'] == year_qnum]
        y_test      = X_test['Y']
        #y_test_up   = X_test['Y_up']
        #y_test_down = X_test['Y_down']
        
        #X_test = X_test.drop(['Y', 'Y_up', 'Y_down', 'year', 'TICKER', 'year_qnum'], axis=1)
        X_test = X_test.drop(['Y', 'year', 'TICKER', 'year_qnum'], axis=1)
        
        #return X_train, y_train, y_train_up, y_train_down, \
        #       X_test, y_test, y_test_up, y_test_down
        
        return X_train, y_train, X_test, y_test
    
    def train_predict(self,year_qnum):
        #X_train, y_train, y_train_up, y_train_down, X_test, y_test, y_test_up, y_test_down \
        #            = self.data_sets(year_qnum)
        
        X_train, y_train, X_test, y_test = self.data_sets(year_qnum)
        
        param_grid = {
            'classify__penalty':['l2', 'l1'],
            'classify__alpha':[0.0001,0.0001*10,0.0001*100.],
            'classify__l1_ratio':[.15,.3,.5,.7],
        }
        #--train + test
        print(self.full_pipeline.fit_transform(X_train))
                
        grid_search = GridSearchCV(self.predictor, cv=3, n_jobs=1, param_grid=param_grid)
        grid_search.fit(self.full_pipeline.fit_transform(X_train), y_train)
        self.grid_fit.append(grid_search)
    
        
        predict = grid_search.predict(X_test)
        self.predictions.append(predict)
        
        y_score = classifier.decision_function(X_test)
        average_precision = average_precision_score(y_test, y_score)
        self.pr_curve.append(average_precision)
        
        miss    = sum(np.abs(y_test - predict))/len(y_test)
        self.missed.append(miss)
        
        
        
        """
        #--up train and test
        grid_search = GridSearchCV(self.predictor, cv=3, n_jobs=1, param_grid=param_grid)
        grid_search.fit(self.full_pipeline.fit_transform(X_train), y_train_up)
        self.grid_fit.append(grid_search)
    
        
        predict = grid_search.predict(X_train)
        self.predictions.append(predict_up)
        
        missup    = sum(np.abs(y_test_up - predict))/len(y_test_up)
        self.missed.append(miss_up)
        
        #down train and test
        grid_search = GridSearchCV(self.predictor, cv=3, n_jobs=1, param_grid=param_grid)
        grid_search.fit(self.full_pipeline.fit_transform(X_train), y_train_down)
        self.grid_fit.append(grid_search)
    
        
        predict = grid_search.predict(X_test_down)
        self.predictions.append(predict_down)
        
        missup    = sum(np.abs(y_test_down - predict))/len(y_test_down)
        self.missed.append(miss_up)
        """
                
    def run_describe_predictors(self,predictors, X, y):
        res = pd.DataFrame()
        for name, predictor in predictors.items():
            pipe = self.create_full_pipeline_with_predictor(predictor)
            scores = cross_val_score(pipe, X, y,scoring='neg_mean_squared_error', cv=5) #5-fold cross-validation
            print(pd.Series(scores).describe())
            print('')
            res[str(name)] = pd.Series(scores).describe()
        return res


    def create_full_pipeline_with_predictor(self):
        pipe = Pipeline([
            ("preparation", self.full_pipeline),
            ("SGDClassifier", self.predictor)
        ])
        return pipe

    def run_describe_predictors(self,predictors, X, y):
        res = pd.DataFrame()
        for name, predictor in predictors.items():
            pipe = create_full_pipeline_with_predictor(predictor)
            scores = cross_val_score(pipe, X, y,scoring='neg_mean_squared_error', cv=5) #5-fold cross-validation
            print(pd.Series(scores).describe())
            print('')
            res[str(name)] = pd.Series(scores).describe()
        return res

In [18]:
predictor = {'SGD_log':  SGDClassifier(loss = 'log')}
a = analyst_forecasting_model(surprise_earn_df, predictor)
a.run_all_quarters()

[[ 1.00055742  0.9325602  -1.26912758 -0.09905152  0.27003852 -0.61147229]
 [ 1.00055742  0.9325602  -1.26912758 -0.09905152  0.27003852 -0.61147229]
 [ 1.00055742  0.9325602  -1.26912758 -0.09905152  0.27003852 -0.61147229]
 ..., 
 [ 0.31468851  0.30978217 -1.26912758 -0.05589352 -0.56705026  0.39748833]
 [-0.05534561 -0.04522838 -1.26912758 -0.05589352 -0.57533827  0.31126531]
 [-0.05534561 -0.04522838 -0.3753188  -0.05589352 -0.34327406  0.54270606]]


TypeError: estimator should be an estimator implementing 'fit' method, {'SGD_log': SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)} was passed

In [None]:
predictors = {'SGD_hinge':SGDClassifier()}
a = analyst_forecasting_model(surprise_earn_df, predictor)
a.run_all_quarters()