In [1]:
import pandas as pd
from plotly.graph_objects import *
import plotly.express as px
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer

In [2]:
df = pd.read_csv('/Users/dianaterraza/Desktop/NLP/Data/airline-passengers.csv')

In [3]:
df.head()

Unnamed: 0,Month,Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121


### Add some features 

In [4]:
df['date'] = pd.to_datetime(df['Month'])

In [5]:
df['Year'] = df['date'].dt.year
df['Month'] = df['date'].dt.month

target = 'Passengers'
features = ['Year', 'Month']

In [6]:
df.head()

Unnamed: 0,Month,Passengers,date,Year
0,1,112,1949-01-01,1949
1,2,118,1949-02-01,1949
2,3,132,1949-03-01,1949
3,4,129,1949-04-01,1949
4,5,121,1949-05-01,1949


In [7]:
offset = 12 
lags = 24 
columns = ['Passengers']
for lag in range (offset, lags + offset +1):
    lagged_columns = [f'{col}--{lag}' for col in columns]
    df[lagged_columns] = df[columns].shift(lag)
    features += lagged_columns

In [8]:
df.head()

Unnamed: 0,Month,Passengers,date,Year,Passengers--12,Passengers--13,Passengers--14,Passengers--15,Passengers--16,Passengers--17,...,Passengers--27,Passengers--28,Passengers--29,Passengers--30,Passengers--31,Passengers--32,Passengers--33,Passengers--34,Passengers--35,Passengers--36
0,1,112,1949-01-01,1949,,,,,,,...,,,,,,,,,,
1,2,118,1949-02-01,1949,,,,,,,...,,,,,,,,,,
2,3,132,1949-03-01,1949,,,,,,,...,,,,,,,,,,
3,4,129,1949-04-01,1949,,,,,,,...,,,,,,,,,,
4,5,121,1949-05-01,1949,,,,,,,...,,,,,,,,,,


In [9]:
features

['Year',
 'Month',
 'Passengers--12',
 'Passengers--13',
 'Passengers--14',
 'Passengers--15',
 'Passengers--16',
 'Passengers--17',
 'Passengers--18',
 'Passengers--19',
 'Passengers--20',
 'Passengers--21',
 'Passengers--22',
 'Passengers--23',
 'Passengers--24',
 'Passengers--25',
 'Passengers--26',
 'Passengers--27',
 'Passengers--28',
 'Passengers--29',
 'Passengers--30',
 'Passengers--31',
 'Passengers--32',
 'Passengers--33',
 'Passengers--34',
 'Passengers--35',
 'Passengers--36']

### Apply XGBoost

Split the data into train and test

In [10]:
train, test = df[:-12], df[-12:]

mod = XGBRegressor()

param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting rounds
    'max_depth': [3, 5, 7],           # Maximum depth of trees
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate (shrinkage)

}

tscv = TimeSeriesSplit(test_size = 12)
grid_search = GridSearchCV(estimator = mod,
                           param_grid = param_grid,
                           cv = tscv) #for nlp cv = 5

In [11]:
grid_search.fit(train[features],train[target])

In [12]:
grid_search.best_estimator_

In [13]:
preds = grid_search.predict(test[features])
r2_score(test[target], preds)

0.7616739869117737

In [14]:
df[target].rolling(3).mean().shift(12)

0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
          ...    
139    526.333333
140    523.333333
141    476.333333
142    410.666667
143    391.333333
Name: Passengers, Length: 144, dtype: float64

In [15]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range =(-1,1)) #use init
scaler.fit(df[['Passengers']])

In [16]:
# for min max scaler
def fit(self,X):
    self.min = X.min()
    self.max = X.max()
    
def transform(self,X):
  return (X-self.min)/(self.max-self.min)

In [None]:
df[['Passengers']].rolling(3).shift(12)

In [18]:
class ShiftedRollingMeanTransformer (BaseEstimator):
    def __init__(self, window = 3, shift = 12):
        self.window = window
        self.shift = shift
    def fit(self, X, y = None): # we are using based estimator, we have x and y 
        return self
    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X.rolling(self.window_size).mean().shift(self.shift).values
    
    def fit_transform(self, X, y = None):
        return self.transform(X)

Preprocessing data, transformer 
we have to give the tuple 
custome name, 
actual transformer
the list of the columns 

Add passengers into the features 

In [19]:
features += ['Passengers']

In [20]:
features = list(set(features))

In [21]:
mod = XGBRegressor(**grid_search.best_params_)

col_transformer = ColumnTransformer(transformers = [('shifter_rolling_mean',ShiftedRollingMeanTransformer(),['Passengers'] ),
                                                    ('drop','drop','Passengers')],
                                    remainder = 'passthrough',
                                    force_int_remainder_cols=False)

In [22]:
col_transformer

Create the pipeline and also we have to pass a tuples with the size of 2

In [23]:
pipe = Pipeline([('column_transformer', col_transformer),
                 ('estimator', mod)])

In [24]:
pipe

In [28]:
param_grid = {'column_transformer__shifter_rolling_mean__window_size': [3,7,12]}

In [30]:
grid_search = GridSearchCV(pipe,
                           param_grid,
                          cv = tscv )
grid_search.fit(train[features], train[target])

ValueError: Invalid parameter 'window_size' for estimator ShiftedRollingMeanTransformer(). Valid parameters are: ['shift', 'window'].

In [63]:
grid_search.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [64]:
from sklearn.impute import SimpleImputer

In [None]:
col_transformer = ColumnTransformer(transformers = [('imputer', SimpleImputer(), col_list)])

NameError: name 'col_list' is not defined

In [None]:
pipe = Pipeline(steps=[('column_transformer', col_transformer),
                       ('mod', mod)])

param_grid = {'column_transformer__imputer__strategy': ['mean', 'median']}

In [None]:
ColumnTransformer(transformers = [('tfidf', TfidfVectorizer(), ['text_cleaned'])])

param_grid = {'column_transformer__tfidf_n'}