# 3 Data preprocessing - data splitting

<b> Purpose of the action </b> - create two different data sets:
- The base dataset contains the continuous features - it is intended for tree-based models.
- The categorical dataset contains the categorical features created in the previous notebook - it is intended for linear models.

<b> </b>
<b> Action plan </b>:
- Select a validation strategy and split the training data into train and validation sets
- Create raw base and categorical data sets
- Process both sets of data using scaling and encoding
- Save all data sets for future use

## 3.1 Import necessary libraries and modules

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from data_preprocessing import calc_smooth_mean
from preprocessing_pipelines import DataFrameSelector, TwoColumnScaler

## 3.2 Read data

In [2]:
# Load the stage-2 train and test frames; the first CSV column becomes the row index.
stage2_csv_paths = ('./preprocessed_data/train_set_stage2.csv',
                    './preprocessed_data/test_set_stage2.csv')
train_set, test_set = (pd.read_csv(csv_path, index_col=0) for csv_path in stage2_csv_paths)

## 3.3 Select validation strategy
A time-based validation split has been chosen because the goal is to forecast the results of the 2019-2020 season.

In [3]:
# Hold out the last season of the training data (2018-2019) as the validation set.
# The split is purely positional: it takes the final 1/19 of the rows, so it assumes
# the rows are ordered chronologically and that every season contributes the same
# number of rows -- TODO confirm against the notebook that produced the stage-2 files.
n_train_seasons = 19  # total number of seasons in the train set
n_rows_before_split = train_set.shape[0]

# Guard the equal-season-size assumption. Because `train_set` is overwritten below,
# re-running this cell would otherwise silently carve a second slice off the already
# shortened frame; with the row counts shown in this notebook the check fails instead.
assert n_rows_before_split % n_train_seasons == 0, (
    "%d rows cannot be divided evenly into %d seasons - reload the data (section 3.2) "
    "before running this cell again" % (n_rows_before_split, n_train_seasons)
)

break_point = n_rows_before_split // n_train_seasons
validation_set = train_set.iloc[-break_point:]
train_set = train_set.iloc[:-break_point]

In [4]:
# (rows, columns) of the train, validation and test frames, in that order
tuple(split_frame.shape for split_frame in (train_set, validation_set, test_set))

((5940, 84), (330, 84), (190, 84))

## 3.4 Manually assign features to groups

In [5]:
# Display every column label so the features can be assigned to groups by hand in the next cell.
train_set.columns

Index(['HomeTeam', 'AwayTeam', 'HomeTeamWinRatio', 'AwayTeamWinRatio',
       'HomeTeamGoalsScored', 'AwayTeamGoalsScored', 'HomeTeamGoalsLost',
       'AwayTeamGoalsLost', 'HomeTeamShootsMade', 'AwayTeamShootsMade',
       'HomeTeamTargetShootsMade', 'AwayTeamTargetShootsMade',
       'HomeTeamCorners', 'AwayTeamCorners', 'HomeTeamTotalPoints',
       'AwayTeamTotalPoints', 'HomeTeamPointsFromLast1Matches',
       'AwayTeamPointsFromLast1Matches', 'HomeTeamPointsFromLast3Matches',
       'AwayTeamPointsFromLast3Matches', 'HomeTeamPointsFromLast5Matches',
       'AwayTeamPointsFromLast5Matches', 'HomeTeamPointsFromLast10Matches',
       'AwayTeamPointsFromLast10Matches', 'HomeTeamLast1Match',
       'AwayTeamLast1Match', 'HomeTeamLast2Match', 'AwayTeamLast2Match',
       'HomeTeamLast3Match', 'AwayTeamLast3Match', 'HomeTeamLast4Match',
       'AwayTeamLast4Match', 'HomeTeamLast5Match', 'AwayTeamLast5Match',
       'HomeTeamWinStreak3', 'HomeTeamWinStreak5', 'HomeTeamLossStreak3',
     

In [6]:
# Hand-made feature groups. Columns ending in "_Cat" are the categorical
# counterparts that the notebook intro says were created in the previous notebook.

# prediction target
target_col = ["FTR"]

# team names
teams_cols = ["HomeTeam", "AwayTeam"]

# win ratios: continuous and categorical variants
teams_ratio_cols = ["HomeTeamWinRatio", "AwayTeamWinRatio"]
teams_ratio_cat_cols = ["HomeTeamWinRatio_Cat", "AwayTeamWinRatio_Cat"]

# NOTE: the variable name is misspelled ("postion") but is kept because other cells use it
last_year_postion_cols = ["HomeTeamLastYearPosition", "AwayTeamLastYearPosition"]

# Running totals, stored as consecutive (home, away) pairs.
# The order matters: section 3.6.1 picks these pairs by list position.
total_cols = [
    "HomeTeamGoalsScored", "AwayTeamGoalsScored",
    "HomeTeamGoalsLost", "AwayTeamGoalsLost",
    "HomeTeamShootsMade", "AwayTeamShootsMade",
    "HomeTeamTargetShootsMade", "AwayTeamTargetShootsMade",
    "HomeTeamCorners", "AwayTeamCorners",
    "HomeTeamTotalPoints", "AwayTeamTotalPoints",
]

# categorical variants of the totals (note: target shots come first here)
total_cat_cols = [
    "HomeTeamTargetShootsMade_Cat", "AwayTeamTargetShootsMade_Cat",
    "HomeTeamGoalsScored_Cat", "AwayTeamGoalsScored_Cat",
    "HomeTeamGoalsLost_Cat", "AwayTeamGoalsLost_Cat",
    "HomeTeamShootsMade_Cat", "AwayTeamShootsMade_Cat",
    "HomeTeamCorners_Cat", "AwayTeamCorners_Cat",
    "HomeTeamTotalPoints_Cat", "AwayTeamTotalPoints_Cat",
]

# results of each of the last five matches
last_matches_results_cols = [
    "HomeTeamLast1Match", "AwayTeamLast1Match",
    "HomeTeamLast2Match", "AwayTeamLast2Match",
    "HomeTeamLast3Match", "AwayTeamLast3Match",
    "HomeTeamLast4Match", "AwayTeamLast4Match",
    "HomeTeamLast5Match", "AwayTeamLast5Match",
]

# points collected over the last 3 / 5 / 10 matches
last_matches_points_cols = [
    "HomeTeamPointsFromLast3Matches", "AwayTeamPointsFromLast3Matches",
    "HomeTeamPointsFromLast5Matches", "AwayTeamPointsFromLast5Matches",
    "HomeTeamPointsFromLast10Matches", "AwayTeamPointsFromLast10Matches",
]

# streak and team-status flags
binary_cols = [
    "HomeTeamWinStreak3", "HomeTeamWinStreak5",
    "HomeTeamLossStreak3", "HomeTeamLossStreak5",
    "AwayTeamWinStreak3", "AwayTeamWinStreak5",
    "AwayTeamLossStreak3", "AwayTeamLossStreak5",
    "IsHomeTeamRegulars", "IsAwayTeamRegulars",
    "IsHomeTeamRookie", "IsAwayTeamRookie",
]

# difference features: continuous variants
diff_cols = [
    "HomeTeamGoalsDifference", "AwayTeamGoalsDifference",
    "TotalGoalsDifference", "DifferenceTotalPoints",
    "Difference1MatchPoints", "Difference3MatchesPoints",
    "Difference5MatchesPoints", "Difference10MatchesPoints",
    "DifferenceInShoots", "DifferenceInTargetShoots",
    "DifferenceInCorners", "DifferenceInLastYearPosition",
]

# difference features: categorical variants
diff_cat_cols = [
    "HomeTeamGoalsDifference_Cat", "AwayTeamGoalsDifference_Cat",
    "TotalGoalsDifference_Cat", "DifferenceTotalPoints_Cat",
    "Difference10MatchesPoints_Cat", "DifferenceInShoots_Cat",
    "DifferenceInTargetShoots_Cat", "DifferenceInCorners_Cat",
]

## 3.5 Split data to base and categorical datasets

### 3.5.1 Create base datasets

In [7]:
# Column order of the base datasets: target first, then the continuous feature groups.
base_features = (
    target_col + binary_cols + teams_ratio_cols + last_matches_points_cols
    + last_matches_results_cols + last_year_postion_cols + diff_cols
    + teams_cols + total_cols
)

# Restrict each split to the base columns.
base_train_set, base_validation_set, base_test_set = (
    split_frame.loc[:, base_features]
    for split_frame in (train_set, validation_set, test_set)
)

# Persist the still-unscaled base datasets.
for base_frame, base_csv_path in (
    (base_train_set, "./preprocessed_data/base_train_set.csv"),
    (base_validation_set, "./preprocessed_data/base_validation_set.csv"),
    (base_test_set, "./preprocessed_data/base_test_set.csv"),
):
    base_frame.to_csv(base_csv_path)

base_train_set.head()

Unnamed: 0,FTR,HomeTeamWinStreak3,HomeTeamWinStreak5,HomeTeamLossStreak3,HomeTeamLossStreak5,AwayTeamWinStreak3,AwayTeamWinStreak5,AwayTeamLossStreak3,AwayTeamLossStreak5,IsHomeTeamRegulars,...,HomeTeamGoalsLost,AwayTeamGoalsLost,HomeTeamShootsMade,AwayTeamShootsMade,HomeTeamTargetShootsMade,AwayTeamTargetShootsMade,HomeTeamCorners,AwayTeamCorners,HomeTeamTotalPoints,AwayTeamTotalPoints
0,1,0,0,0,0,0,0,0,0,0,...,5,8,60,40,35,20,26,18,8,5
1,1,0,0,0,0,0,0,0,0,0,...,11,6,45,50,25,25,35,28,5,10
2,0,0,0,0,0,0,0,0,0,1,...,8,3,30,85,20,45,30,27,7,11
3,0,0,0,0,0,0,0,0,0,0,...,5,8,55,60,30,35,35,22,7,4
4,1,0,0,0,0,0,0,0,0,0,...,9,4,65,50,45,35,41,27,3,10


### 3.5.2 Create categorical datasets

In [8]:
# Column order of the categorical datasets: target, team names, then the feature groups
# (using the "_Cat" variants where the base datasets use the continuous ones).
categorical_features = (
    target_col + teams_cols + binary_cols + teams_ratio_cat_cols + last_matches_points_cols
    + last_matches_results_cols + last_year_postion_cols + diff_cat_cols + total_cat_cols
)

# Restrict each split to the categorical columns.
categorical_train_set, categorical_validation_set, categorical_test_set = (
    split_frame.loc[:, categorical_features]
    for split_frame in (train_set, validation_set, test_set)
)

# Persist the categorical datasets as they are at this point (team names not yet encoded).
for categorical_frame, categorical_csv_path in (
    (categorical_train_set, "./preprocessed_data/categorical_train_set.csv"),
    (categorical_validation_set, "./preprocessed_data/categorical_validation_set.csv"),
    (categorical_test_set, "./preprocessed_data/categorical_test_set.csv"),
):
    categorical_frame.to_csv(categorical_csv_path)

categorical_train_set.head()

Unnamed: 0,FTR,HomeTeam,AwayTeam,HomeTeamWinStreak3,HomeTeamWinStreak5,HomeTeamLossStreak3,HomeTeamLossStreak5,AwayTeamWinStreak3,AwayTeamWinStreak5,AwayTeamLossStreak3,...,HomeTeamGoalsScored_Cat,AwayTeamGoalsScored_Cat,HomeTeamGoalsLost_Cat,AwayTeamGoalsLost_Cat,HomeTeamShootsMade_Cat,AwayTeamShootsMade_Cat,HomeTeamCorners_Cat,AwayTeamCorners_Cat,HomeTeamTotalPoints_Cat,AwayTeamTotalPoints_Cat
0,1,Aston Villa,Bradford,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Charlton,Tottenham,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,Everton,Man United,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,0,Leeds,Ipswich,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,Southampton,Newcastle,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## 3.6 Process base datasets

### 3.6.1 Create processing pipelines for base datasets (scaling, encoding)

In [9]:
# DataFrameSelector and TwoColumnScaler are the custom transformers imported from
# preprocessing_pipelines.py; Pipeline, StandardScaler and OrdinalEncoder come from scikit-learn.

# target column: selected only, no scaling step
select_target_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([*target_col]))
])

# feature groups that are standardised column by column
standard_scaling_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([*binary_cols, *teams_ratio_cols, *last_matches_points_cols,
                                       *last_matches_results_cols, *last_year_postion_cols, *diff_cols])),
    ('standard_scaler', StandardScaler())
])

# team names: ordinal (label) encoding followed by standardisation
ordinal_encoder_pipeline = Pipeline([
    ('select_cols', DataFrameSelector([*teams_cols])),
    ('ordinal_encoder', OrdinalEncoder()),
    ('standard_scaler', StandardScaler())
])


def _make_pair_scaling_pipeline(pair_cols):
    """Build the pipeline used for one (home, away) pair of total columns.

    The pair is selected with DataFrameSelector and passed to TwoColumnScaler wrapping
    a fresh StandardScaler. According to the original author's note, the purpose is to
    bring both features to the same scale while keeping the dependency between them;
    see preprocessing_pipelines.py for the actual TwoColumnScaler behaviour.

    Parameters
    ----------
    pair_cols : sequence of str
        The two column names to select, home column first.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps 'select_cols' and 'two_column_scaler'.
    """
    return Pipeline([
        ('select_cols', DataFrameSelector(list(pair_cols))),
        ('two_column_scaler', TwoColumnScaler(scaler=StandardScaler()))
    ])


# total_cols stores consecutive (home, away) pairs, so each slice below is one pair.
goals_scored_pipeline = _make_pair_scaling_pipeline(total_cols[0:2])
goals_lost_pipeline = _make_pair_scaling_pipeline(total_cols[2:4])
shoot_made_pipeline = _make_pair_scaling_pipeline(total_cols[4:6])
# NOTE: despite its name, this pipeline handles the *target* shots pair (total_cols[6:8]);
# the name is kept because the FeatureUnion cell refers to it.
total_shoot_made_pipeline = _make_pair_scaling_pipeline(total_cols[6:8])
corners_pipeline = _make_pair_scaling_pipeline(total_cols[8:10])
total_points_pipeline = _make_pair_scaling_pipeline(total_cols[10:12])

### 3.6.2 Merge all pipelines in one

In [10]:
# The blocks are listed in the same order as the column groups in `base_features`,
# because section 3.6.4 reuses `base_features` as the labels of the transformed matrix.
base_transformers = [
    ('select_target_pipeline', select_target_pipeline),
    ('standard_scaling_pipeline', standard_scaling_pipeline),
    ('ordinal_encoder_pipeline', ordinal_encoder_pipeline),
    ('goals_scored_pipeline', goals_scored_pipeline),
    ('goals_lost_pipeline', goals_lost_pipeline),
    ('shoot_made_pipeline', shoot_made_pipeline),
    ('total_shoot_made_pipeline', total_shoot_made_pipeline),
    ('corners_pipeline', corners_pipeline),
    ('total_points_pipeline', total_points_pipeline),
]

base_process_pipeline = FeatureUnion(transformer_list=base_transformers)

### 3.6.3 Transform base datasets

In [11]:
# Fit on the training split only, then apply the fitted pipeline to all three splits.
base_pipe = base_process_pipeline.fit(base_train_set)

processed_base_train_set_np, processed_base_validation_set_np, processed_base_test_set_np = (
    base_pipe.transform(base_split)
    for base_split in (base_train_set, base_validation_set, base_test_set)
)

# report the shape of each transformed matrix (train, validation, test)
for processed_base_matrix in (processed_base_train_set_np,
                              processed_base_validation_set_np,
                              processed_base_test_set_np):
    print(processed_base_matrix.shape)

(5940, 59)
(330, 59)
(190, 59)


### 3.6.4 Save datasets to csv files

In [12]:
# Wrap each transformed matrix in a DataFrame labelled with `base_features`.
# The labels are only correct while `base_features` lists its columns in the same
# order as the blocks of `base_process_pipeline`.
processed_base_train_set_df, processed_base_validation_set_df, processed_base_test_set_df = (
    pd.DataFrame(data=processed_base_matrix, columns=base_features)
    for processed_base_matrix in (processed_base_train_set_np,
                                  processed_base_validation_set_np,
                                  processed_base_test_set_np)
)

# write the processed base datasets to disk
for processed_base_frame, processed_base_csv_path in (
    (processed_base_train_set_df, "./preprocessed_data/processed_base_train_set.csv"),
    (processed_base_validation_set_df, "./preprocessed_data/processed_base_validation_set.csv"),
    (processed_base_test_set_df, "./preprocessed_data/processed_base_test_set.csv"),
):
    processed_base_frame.to_csv(processed_base_csv_path)

## 3.7 Process categorical datasets 

### 3.7.1 Transform team names using target mean encoding

In [13]:
# calc_smooth_mean is imported from data_preprocessing.py. For each team-name column it
# receives the train, test and validation frames (in that order) and its three results
# replace that column in the same three frames.
# NOTE: the team-name columns are overwritten in place, so running this cell a second
# time would pass the already-replaced columns back into calc_smooth_mean; re-create the
# categorical datasets (section 3.5.2) first.
for team_name_col in ('HomeTeam', 'AwayTeam'):
    (categorical_train_set[team_name_col],
     categorical_test_set[team_name_col],
     categorical_validation_set[team_name_col]) = calc_smooth_mean(
        categorical_train_set,
        categorical_test_set,
        categorical_validation_set,
        cat_name=team_name_col,
        target='FTR',
        weight=10,
    )

### 3.7.2 Create processing pipelines for categorical datasets (scaling)

In [14]:
# DataFrameSelector is the custom transformer imported from preprocessing_pipelines.py.
# NOTE: both names defined here were already used for the base-dataset pipelines in
# section 3.6.1; from this cell on they refer to the categorical-dataset versions.

# every non-target column of the categorical datasets is standardised
categorical_scaled_cols = (
    teams_cols + binary_cols + teams_ratio_cat_cols
    + last_matches_points_cols + last_matches_results_cols
    + last_year_postion_cols + diff_cat_cols + total_cat_cols
)

# target column: selected only, no scaling step
select_target_pipeline = Pipeline([('select_cols', DataFrameSelector(list(target_col)))])

standard_scaling_pipeline = Pipeline([
    ('select_cols', DataFrameSelector(categorical_scaled_cols)),
    ('standard_scaler', StandardScaler()),
])

### 3.7.3  Merge all pipelines in one

In [15]:
# Target block first, then the scaled feature block. This matches the column order of
# `categorical_features`, which is later reused as the labels of the transformed matrix.
categorical_transformers = [
    ('select_target_pipeline', select_target_pipeline),
    ('standard_scaling_pipeline', standard_scaling_pipeline),
]

cat_process_pipeline = FeatureUnion(transformer_list=categorical_transformers)

### 3.7.4 Transform features

In [16]:
# Fit on the training split only, then apply the fitted pipeline to all three splits.
cat_pipe = cat_process_pipeline.fit(categorical_train_set)

processed_cat_train_set_np, processed_cat_validation_set_np, processed_cat_test_set_np = (
    cat_pipe.transform(categorical_split)
    for categorical_split in (categorical_train_set, categorical_validation_set, categorical_test_set)
)

# report the shape of each transformed matrix (train, validation, test)
for processed_cat_matrix in (processed_cat_train_set_np,
                             processed_cat_validation_set_np,
                             processed_cat_test_set_np):
    print(processed_cat_matrix.shape)

(5940, 55)
(330, 55)
(190, 55)


### 3.7.5 Save datasets to csv files

In [17]:
# Wrap each transformed matrix in a DataFrame labelled with `categorical_features`.
# The labels are only correct while `categorical_features` lists its columns in the
# same order as the blocks of `cat_process_pipeline`.
processed_cat_train_set_df, processed_cat_validation_set_df, processed_cat_test_set_df = (
    pd.DataFrame(data=processed_cat_matrix, columns=categorical_features)
    for processed_cat_matrix in (processed_cat_train_set_np,
                                 processed_cat_validation_set_np,
                                 processed_cat_test_set_np)
)

# write the processed categorical datasets to disk
for processed_cat_frame, processed_cat_csv_path in (
    (processed_cat_train_set_df, "./preprocessed_data/processed_categorical_train_set.csv"),
    (processed_cat_validation_set_df, "./preprocessed_data/processed_categorical_validation_set.csv"),
    (processed_cat_test_set_df, "./preprocessed_data/processed_categorical_test_set.csv"),
):
    processed_cat_frame.to_csv(processed_cat_csv_path)