Pipelines for model trainng @author Tim Copeland

In [20]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [21]:
target = '20151'
# Create a class to select numerical columns
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ommit = ommit
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        #add year_qnum column
        X['year_qnum'] = X['date'].apply(lambda x: int(str(x[0:4]) + str(X['qnum'])))
        
        ret_X = X[X['year_qnum'] < target]
        return ret_X

SGDClassifier with both hinge loss (SVM) and log loss (logistic regression)

In [22]:
#pre processes data for regression
full_pipeline = Pipeline([        
        ('selector', DataFrameSelector()),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])

def create_full_pipeline_with_predictor(predictor):
    pipe = Pipeline([
        ("preparation", full_pipeline),
        ("linear", predictor)
    ])
    return pipe

def run_describe_predictors(predictors, X, y):
    res = pd.DataFrame()
    for name, predictor in predictors.items():
        pipe = create_full_pipeline_with_predictor(predictor)
        scores = cross_val_score(pipe, X, y,scoring='neg_mean_squared_error', cv=5) #5-fold cross-validation
        print(pd.Series(scores).describe())
        print('')
        res[str(name)] = pd.Series(scores).describe()
    return res

In [75]:
surprise_earn_df = pd.read_csv('AFE_data/Training DFs/surprise_df1.csv')
full_dist_df     = pd.read_csv('AFE_data/Training DFs/surprise_df2.csv')

In [76]:
#drop redundant index col
surprise_earn_df = surprise_earn_df.drop(surprise_earn_df.columns[0], axis=1)
full_dist_df = full_dist_df.drop(full_dist_df.columns[0], axis=1)

In [77]:
surprise_earn_df.head()

Unnamed: 0,surprise_median_fa,surprise_mean_fa,TICKER,year,quarternum,Skew,Kurtosis,delta_return,Y
0,10.392305,9.814955,ABBV,2015,0,-0.016738,0.83,-0.0381,0
1,10.392305,9.814955,ABBV,2015,0,-0.016738,0.83,-0.0381,0
2,10.392305,9.814955,ABBV,2015,0,-0.016738,0.83,-0.0381,0
3,0.0,0.0,ABBV,2016,2,0.0,1.26,0.0145,0
4,0.0,0.0,ABBV,2016,2,0.0,1.26,0.0145,0


In [78]:
full_dist_df.head()

Unnamed: 0,TICKER,quarternum,analyst,forecast_value,year,earn_value,std,Analyst_Counts,the max_forecast-min_forecast,delta_return,Y
0,AA,3,3700,4.3125,1996,0.5775,0.602728,3,1.125,57.375,0
1,AA,3,281,4.125,1996,0.5775,0.602728,3,1.125,57.375,0
2,AA,3,4088,3.1875,1996,0.5775,0.602728,3,1.125,57.375,0
3,AA,3,662,3.0,1997,0.9375,0.478768,3,0.9375,78.5625,0
4,AA,3,472,3.9375,1997,0.9375,0.478768,3,0.9375,78.5625,0


In [79]:
#add a year + qnum column
to_str = lambda x: str(x)
to_int = lambda x: int(x)

surprise_earn_df['year_qnum'] = surprise_earn_df['year'].apply(to_str) + surprise_earn_df['quarternum'].apply(to_str)
surprise_earn_df['year_qnum'] = surprise_earn_df['year_qnum'].apply(to_int)

full_dist_df['year_qnum']     = full_dist_df['year'].apply(to_str) + full_dist_df['quarternum'].apply(to_str)
full_dist_df['year_qnum']     = full_dist_df['year_qnum'].apply(to_int)

In [84]:
#generates X and y trainng and test sets for a given df and year_qnum
class analyst_forecasting_model:
    #df is either the full_dist_df or the surprise_earn_df
    def __init__(self,df, predictor):
        self.df               = df
        self.predictor        = predictor
        
        self.grid_fit         = [] #contains grid search fit paramaters for the quarter
        self.predictions      = [] #contains grid search best fit predictions for the quarter
        self.missed           = [] #contains sum of missed predictions / total # predictions for the quarter
    
        #pre processes data 
        self.full_pipeline = Pipeline([        
            #('selector', DataFrameSelector()),
            ('imputer', Imputer(strategy="median")),
            ('std_scaler', StandardScaler())
        ])
        
    def run_all_quarters(self):
        #need to skip first quarter because we don't have trainng data for it
        for i in self.df['year_qnum'].unique()[1:-1]: 
            self.train_predict(i)
    
    def data_sets(self, year_qnum):
        df = self.df
        
        X_train = df[df['year_qnum'] < year_qnum]
        y_train = X_train['Y']
        X_train = X_train.drop(['Y', 'year', 'TICKER', 'year_qnum'], axis=1)

        X_test = df[df['year_qnum'] == year_qnum]
        y_test = X_test['Y']
        X_test = X_test.drop(['Y', 'year', 'TICKER', 'year_qnum'], axis=1)
        return X_train, y_train, X_test, y_test
    
    def train_predict(self,year_qnum):
        X_train, y_train, X_test, y_test = self.data_sets(year_qnum)
        
        param_grid = {
            'classify__penalty':['l2', 'l1'],
            'classify__alpha':[0.0001,0.0001*10,0.0001*100.],
            'classify__l1_ratio':[.15,.3,.5,.7],
        }
        
        print(self.full_pipeline.fit_trainsform(X_train))
        
        grid_search = GridSearchCV(self.predictor, cv=3, n_jobs=1, param_grid=param_grid)
        grid_search.fit(self.full_pipeline.fit_transform(X_train), y_train)
        self.grid_fit.append(grid_search)
    
        
        predict = grid_search.predict(X_test)
        self.predictions.append(predict)
        
        miss    = sum(np.abs(y_test - predict))/len(y_test)
        self.missed.append(miss)
                
    def run_describe_predictors(self,predictors, X, y):
        res = pd.DataFrame()
        for name, predictor in predictors.items():
            pipe = self.create_full_pipeline_with_predictor(predictor)
            scores = cross_val_score(pipe, X, y,scoring='neg_mean_squared_error', cv=5) #5-fold cross-validation
            print(pd.Series(scores).describe())
            print('')
            res[str(name)] = pd.Series(scores).describe()
        return res
    

    def create_full_pipeline_with_predictor(self,predictor):
        pipe = Pipeline([
            ("preparation", full_pipeline),
            ("linear", predictor)
        ])
        return pipe

    def run_describe_predictors(self,predictors, X, y):
        res = pd.DataFrame()
        for name, predictor in predictors.items():
            pipe = create_full_pipeline_with_predictor(predictor)
            scores = cross_val_score(pipe, X, y,scoring='neg_mean_squared_error', cv=5) #5-fold cross-validation
            print(pd.Series(scores).describe())
            print('')
            res[str(name)] = pd.Series(scores).describe()
        return res

In [85]:
predictor = {'SGD_log':  SGDClassifier(loss = 'log')}
a = analyst_forecasting_model(surprise_earn_df, predictor)
a.run_all_quarters()

AttributeError: 'Pipeline' object has no attribute 'fit_trainsform'

In [None]:
predictors = {'SGD_hinge':SGDClassifier()}
a = analyst_forecasting_model(surprise_earn_df, predictor)
a.run_all_quarters()