In [1]:
# Import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore", category = FutureWarning)

# Feature engineering

### Arithmetical features

In [10]:
# Given
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Shared 5-fold splitter; shuffling is seeded so fold membership is reproducible
kf = KFold(n_splits = 5, shuffle = True, random_state = 123)

def get_kfold_rmse(train):
    """Score a small random forest on ``train`` with K-fold cross-validation.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a 'SalePrice' target column. Every column except 'Id',
        'SalePrice', 'RoofStyle' and 'CentralAir' is used as a feature.
        The caller's frame is not modified.

    Returns
    -------
    float
        Mean of the per-fold RMSE values plus their standard deviation,
        rounded to two decimals (so it is a pessimistic summary rather than
        the plain average RMSE).
    """
    # Fill missing values and pick the feature columns once: neither depends
    # on the fold, so there is no reason to redo them on every iteration.
    data = train.fillna(0)
    excluded = ['Id', 'SalePrice', 'RoofStyle', 'CentralAir']
    feats = [x for x in data.columns if x not in excluded]

    rmse_scores = []
    for train_index, test_index in kf.split(data):
        # KFold.split yields *positional* indices, so rows must be taken with
        # .iloc. Using .loc treats them as labels, which raises or silently
        # selects the wrong rows whenever the index is not exactly 0..n-1
        # (e.g. after filtering or concatenating frames).
        fold_train, fold_test = data.iloc[train_index], data.iloc[test_index]

        # Small, seeded forest so repeated calls are comparable
        rf = RandomForestRegressor(n_estimators=10, min_samples_split=10, random_state=123)
        rf.fit(X=fold_train[feats], y=fold_train['SalePrice'])

        # RMSE on the held-out fold
        pred = rf.predict(fold_test[feats])
        fold_mse = mean_squared_error(fold_test['SalePrice'], pred)
        rmse_scores.append(np.sqrt(fold_mse))

    return round(np.mean(rmse_scores) + np.std(rmse_scores), 2)

In [11]:
# Read the house-prices train and test splits from CSV files in the working directory
train, test = (pd.read_csv(csv_path)
               for csv_path in ('house_prices_train.csv', 'house_prices_test.csv'))

In [15]:
# Baseline: cross-validated score before any engineered column is added
print('RMSE before feature engineering:', get_kfold_rmse(train))

# Each entry is (new column name, label to print, recipe that builds the column).
# The recipes are applied in order and accumulate in `train`, so every score
# reflects all features added up to that point.
engineered_features = [
    # Part 1: total area of the house = basement + first floor + second floor
    ('TotalArea', 'RMSE with total area:',
     lambda df: df['TotalBsmtSF'] + df['FirstFlrSF'] + df['SecondFlrSF']),
    # Part 2: garden area = lot area minus the first-floor footprint
    ('GardenArea', 'RMSE with garden area:',
     lambda df: df['LotArea'] - df['FirstFlrSF']),
    # Part 3: total bathrooms = full + half bathrooms (each half bath adds 1)
    ('TotalBath', 'RMSE with number of bathrooms:',
     lambda df: df['FullBath'] + df['HalfBath']),
]

for new_column, report_label, build_column in engineered_features:
    train[new_column] = build_column(train)
    print(report_label, get_kfold_rmse(train))


RMSE before feature engineering: 34413.55
RMSE with total area: 34413.55
RMSE with garden area: 34413.55
RMSE with number of bathrooms: 34506.78


### Date features

In [17]:
# Read the taxi train and test splits from CSV files in the working directory
train, test = map(pd.read_csv, ('taxi_train_chapter_4.csv', 'taxi_test_chapter_4.csv'))

In [18]:
# Stack train and test so the date features are derived the same way for both,
# parse the pickup timestamp, then pull day-of-week and hour out of it
taxi = (
    pd.concat([train, test])
    .assign(pickup_datetime=lambda df: pd.to_datetime(df['pickup_datetime']))
    .assign(dayofweek=lambda df: df['pickup_datetime'].dt.dayofweek)
    .assign(hour=lambda df: df['pickup_datetime'].dt.hour)
)

# Split back into train and test by matching on the 'id' column
belongs_to_train = taxi['id'].isin(train['id'])
belongs_to_test = taxi['id'].isin(test['id'])
new_train = taxi[belongs_to_train]
new_test = taxi[belongs_to_test]

# Categorical features

### Label encoding

In [20]:
# Reload the house-prices splits so this section starts from unmodified frames
house_price_files = ['house_prices_train.csv', 'house_prices_test.csv']
train, test = [pd.read_csv(csv_path) for csv_path in house_price_files]

In [21]:
# Label encoder
from sklearn.preprocessing import LabelEncoder

# Encode on the stacked train + test frame so both splits share one mapping
houses = pd.concat([train, test])

# A single encoder is re-fitted for each column in turn; after the loop it
# holds the mapping of the last column processed ('CentralAir')
le = LabelEncoder()
label_encoded_columns = ['RoofStyle', 'CentralAir']
for source_column in label_encoded_columns:
    houses[source_column + '_enc'] = le.fit_transform(houses[source_column])

# Show each original column next to its encoded counterpart
side_by_side = []
for source_column in label_encoded_columns:
    side_by_side.extend([source_column, source_column + '_enc'])
print(houses[side_by_side].head())

  RoofStyle  RoofStyle_enc CentralAir  CentralAir_enc
0     Gable              1          Y               1
1     Gable              1          Y               1
2     Gable              1          Y               1
3     Gable              1          Y               1
4     Gable              1          Y               1


### One-Hot encoding

In [23]:
# Part 1
# Stack train and test, then inspect how often each category occurs
houses = pd.concat([train, test])

roof_style_counts = houses['RoofStyle'].value_counts()
central_air_counts = houses['CentralAir'].value_counts()
print(roof_style_counts, '\n')
print(central_air_counts)

# Part 2
# Which of the features is binary?
# Answer: "CentralAir".

# Part 3
# The binary feature only needs a label encoding
le = LabelEncoder()
central_air_codes = le.fit_transform(houses['CentralAir'])
houses['CentralAir_enc'] = central_air_codes

# Part 4
# The multi-category feature gets one indicator column per category
ohe = pd.get_dummies(houses['RoofStyle'], prefix = 'RoofStyle')

# Append the indicator columns to the right of the existing ones
houses = pd.concat([houses, ohe], axis=1)

# Show every column whose name contains 'RoofStyle'
print(houses.filter(like='RoofStyle').head(3))

Gable      2310
Hip         551
Gambrel      22
Flat         20
Mansard      11
Shed          5
Name: RoofStyle, dtype: int64 

Y    2723
N     196
Name: CentralAir, dtype: int64
  RoofStyle  RoofStyle_Flat  RoofStyle_Gable  RoofStyle_Gambrel  \
0     Gable               0                1                  0   
1     Gable               0                1                  0   
2     Gable               0                1                  0   

   RoofStyle_Hip  RoofStyle_Mansard  RoofStyle_Shed  
0              0                  0               0  
1              0                  0               0  
2              0                  0               0  


# Target Encoding

### Mean Target Encoding

In [24]:
# Part 1
def test_mean_target_encoding(train, test, target, categorical, alpha = 5):
    # Calculate global mean on the train data
    global_mean = train[target].mean()
    
    # Group by the categorical feature and calculate its properties
    train_groups = train.groupby(categorical)
    category_sum = train_groups[target].sum()
    category_size = train_groups.size()
    
    # Calculate smoothed mean target statistics
    train_statistics = (category_sum + global_mean * alpha) / (category_size + alpha)
    
    # Apply statistics to the test data and fill new categories
    test_feature = test[categorical].map(train_statistics).fillna(global_mean)
    return test_feature.values


# Part 2
def train_mean_target_encoding(train, target, categorical, alpha = 5):
    """Build an out-of-fold mean target encoding for the rows of ``train``.

    The frame is split into 5 shuffled folds (seeded with 123). Each fold is
    encoded with statistics computed on the other four folds only, so no row
    is encoded using its own target value.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding both ``target`` and ``categorical`` columns.
    target : str
        Name of the target column.
    categorical : str
        Name of the categorical column to encode.
    alpha : number, default 5
        Smoothing strength forwarded to ``test_mean_target_encoding``.

    Returns
    -------
    numpy.ndarray
        Float array with one encoded value per row of ``train``, in row order.
    """
    # Create 5-fold cross-validation
    kf = KFold(n_splits = 5, random_state = 123, shuffle = True)

    # Explicit float buffer. An index-only pd.Series with no dtype relies on
    # a deprecated, version-dependent default dtype; a NaN-filled float array
    # keeps the result numeric everywhere and is filled by position anyway.
    train_feature = np.full(len(train), np.nan, dtype=float)

    # For each fold split
    for train_index, test_index in kf.split(train):
        cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]

        # Calculate out-of-fold statistics and apply them to the held-out rows
        cv_test_feature = test_mean_target_encoding(cv_train, cv_test, target, categorical, alpha)

        # Save the new feature at the held-out rows' positions
        train_feature[test_index] = cv_test_feature
    return train_feature


# Part 3
def mean_target_encoding(train, test, target, categorical, alpha = 5):
    """Return the mean-target-encoded feature for both train and test.

    The train feature comes from ``train_mean_target_encoding`` and the test
    feature from ``test_mean_target_encoding`` applied with ``train`` as the
    source of statistics.

    Returns
    -------
    tuple
        ``(train_feature, test_feature)``, ready to be assigned as new columns.
    """
    encoding_options = dict(target=target, categorical=categorical, alpha=alpha)
    return (
        train_mean_target_encoding(train, **encoding_options),
        test_mean_target_encoding(train, test, **encoding_options),
    )


### K-fold cross-validation

In [28]:
# Load the shots table from 'bryant_shots.csv' in the working directory;
# it is the input to the K-fold target-encoding loop in the next cell
bryant_shots = pd.read_csv('bryant_shots.csv')

In [35]:
# Seeded, shuffled 5-fold splitter
kf = KFold(n_splits=5, random_state=123, shuffle=True)

# Encode 'game_id' separately inside every fold
for train_index, test_index in kf.split(bryant_shots):
    # Copies, so adding a column does not touch bryant_shots itself
    cv_train = bryant_shots.iloc[train_index].copy()
    cv_test = bryant_shots.iloc[test_index].copy()

    # Mean target encoded feature for the fold's train and test parts
    train_encoded, test_encoded = mean_target_encoding(
        train=cv_train,
        test=cv_test,
        target='shot_made_flag',
        categorical='game_id',
        alpha=5,
    )
    cv_train['game_id_enc'] = train_encoded
    cv_test['game_id_enc'] = test_encoded

    # Print one random training row to eyeball the encoding
    preview_columns = ['game_id', 'shot_made_flag', 'game_id_enc']
    print(cv_train[preview_columns].sample(n = 1))

       game_id  shot_made_flag  game_id_enc
9707  20700101             0.0     0.461172
       game_id  shot_made_flag  game_id_enc
3703  20200806             0.0     0.370094
      game_id  shot_made_flag  game_id_enc
829  20000651             1.0     0.281348
       game_id  shot_made_flag  game_id_enc
5097  20301110             0.0      0.36457
       game_id  shot_made_flag  game_id_enc
5595  20400328             1.0     0.590904


### Beyond binary classification

In [36]:
# Reload the house-prices splits from CSV files in the working directory
train, test = map(pd.read_csv, ['house_prices_train.csv', 'house_prices_test.csv'])

In [38]:
# Mean target encode 'RoofStyle' against the continuous 'SalePrice' target
roof_style_train_enc, roof_style_test_enc = mean_target_encoding(
    train = train,
    test = test,
    target = 'SalePrice',
    categorical = 'RoofStyle',
    alpha = 10,
)
train['RoofStyle_enc'] = roof_style_train_enc
test['RoofStyle_enc'] = roof_style_test_enc

# One row per distinct (category, encoded value) pair in the test split
encoding_preview = test[['RoofStyle', 'RoofStyle_enc']]
print(encoding_preview.drop_duplicates())

     RoofStyle  RoofStyle_enc
0        Gable  171565.947836
1          Hip  217594.645131
98     Gambrel  164152.950424
133       Flat  188703.563431
362    Mansard  180775.938759
1053      Shed  188267.663242


# Missing data

### Find missing data

In [41]:
# Part 1
# Load the rental listings from CSV
rental_listings = pd.read_csv('twosigma_train.csv')

# Count the missing values in every column
missing_per_column = rental_listings.isna().sum()
print(missing_per_column)

# Part 2
# Preview the two columns that contain missing values
columns_with_gaps = ['building_id', 'price']
print(rental_listings[columns_with_gaps].head())

id                 0
bathrooms          0
bedrooms           0
building_id       13
latitude           0
longitude          0
manager_id         0
price             32
interest_level     0
dtype: int64
                        building_id   price
0  53a5b119ba8f7b61d4e010512e0dfc85  3000.0
1  c5c8a357cba207596b04d1afd1e4f130  5465.0
2  c3ba40552e2120b0acfc3cb5730bb2aa  2850.0
3  28d9ad350afeaab8027513a3e52ac8d5  3275.0
4                               NaN  3350.0


### Impute missing data

In [43]:
# SimpleImputer is used by both parts below, so it is imported once
from sklearn.impute import SimpleImputer

# Part 1
# Numeric column: fill missing prices with the mean of the observed prices
price_columns = ['price']
mean_imputer = SimpleImputer(strategy = 'mean')
rental_listings[price_columns] = mean_imputer.fit_transform(rental_listings[price_columns])


# Part 2
# Categorical column: fill missing building ids with the literal 'MISSING'
building_columns = ['building_id']
constant_imputer = SimpleImputer(strategy = 'constant', fill_value = 'MISSING')
rental_listings[building_columns] = constant_imputer.fit_transform(rental_listings[building_columns])

In [44]:
# Re-count missing values per column after imputation (displayed as the cell output)
rental_listings.isnull().sum()

id                0
bathrooms         0
bedrooms          0
building_id       0
latitude          0
longitude         0
manager_id        0
price             0
interest_level    0
dtype: int64