In [1]:
import pandas as pd
# create a series of datetimes spaced 10 hours apart
# (lowercase "h": the uppercase "H" alias is deprecated since pandas 2.2)
s = pd.date_range('2020-01-06', '2020-01-10', freq='10h').to_series()
# create some features based on datetime; every entry is a numpy array
# aligned with the timestamps in `s`
features = {
    "dayofweek": s.dt.dayofweek.values,
    "dayofyear": s.dt.dayofyear.values,
    "hour": s.dt.hour.values,
    "is_leap_year": s.dt.is_leap_year.values,
    "quarter": s.dt.quarter.values,
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is
    # its replacement. It returns a nullable UInt32 column, so cast to int64
    # to get a plain numpy array like the other entries.
    "weekofyear": s.dt.isocalendar().week.astype("int64").values,
}


In [2]:
def generate_features(df):
    """Add calendar features to ``df`` and aggregate them per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a datetime-like ``date`` column (it is read through the
        ``.dt`` accessor) together with ``customer_id`` and ``num1`` columns.
        The frame is modified in place: the columns ``year``, ``weekofyear``,
        ``month``, ``dayofweek`` and ``weekend`` are written onto it.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id`` (restored as a column by ``reset_index``)
        with two-level columns ``(source column, aggregate name)``.
    """
    # create a bunch of features using the date column
    df.loc[:, 'year'] = df['date'].dt.year
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is
    # the supported replacement (ISO 8601 week number, nullable UInt32)
    df.loc[:, 'weekofyear'] = df['date'].dt.isocalendar().week
    df.loc[:, 'month'] = df['date'].dt.month
    df.loc[:, 'dayofweek'] = df['date'].dt.dayofweek
    # weekday 5 and 6 are Saturday and Sunday
    df.loc[:, 'weekend'] = (df['date'].dt.weekday >= 5).astype(int)
    # create an aggregate dictionary
    aggs = {
        # number of unique month / week values and their mean
        'month': ['nunique', 'mean'],
        'weekofyear': ['nunique', 'mean'],
        # sum, max, min and mean of num1
        'num1': ['sum', 'max', 'min', 'mean'],
        # row count and unique count for customer_id. Both go into ONE list:
        # assigning the same dict key twice silently discards the first
        # assignment, which previously dropped the 'size' aggregate.
        'customer_id': ['size', 'nunique'],
    }
    # we group by customer_id and calculate the aggregates
    agg_df = df.groupby('customer_id').agg(aggs)
    agg_df = agg_df.reset_index()
    return agg_df

In [3]:
import numpy as np
# build a 100-row, 2-column dataframe of uniform random values in [0, 1);
# the columns are named f_1 and f_2
df = pd.DataFrame(
    data=np.random.rand(100, 2),
    columns=["f_" + str(col_no) for col_no in (1, 2)],
)

In [4]:
# display the randomly generated feature dataframe
df

Unnamed: 0,f_1,f_2
0,0.799735,0.086239
1,0.603219,0.104448
2,0.784079,0.752337
3,0.578152,0.985857
4,0.317394,0.661319
...,...,...
95,0.107624,0.117019
96,0.890627,0.990325
97,0.469581,0.317788
98,0.706427,0.067967


In [6]:
from sklearn import preprocessing
# set up a degree-2 polynomial expander: the full expansion is requested
# (interaction_only=False) and no constant bias column is added
pf = preprocessing.PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False,
)
# fit on the features and create the polynomial features in one step
poly_feats = pf.fit_transform(df)
# wrap the expanded matrix in a dataframe with generic f_<k> column names
num_feats = poly_feats.shape[1]
df_transformed = pd.DataFrame(
    data=poly_feats,
    columns=["f_" + str(col_no) for col_no in range(1, num_feats + 1)],
)

In [7]:
# display the dataframe holding the polynomial features
df_transformed

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.799735,0.086239,0.639576,0.068968,0.007437
1,0.603219,0.104448,0.363873,0.063005,0.010909
2,0.784079,0.752337,0.614780,0.589892,0.566011
3,0.578152,0.985857,0.334260,0.569975,0.971914
4,0.317394,0.661319,0.100739,0.209898,0.437342
...,...,...,...,...,...
95,0.107624,0.117019,0.011583,0.012594,0.013693
96,0.890627,0.990325,0.793216,0.882010,0.980744
97,0.469581,0.317788,0.220506,0.149227,0.100989
98,0.706427,0.067967,0.499039,0.048014,0.004619


In [8]:
# discretise the numerical column f_1 into 10 and then 100 bins;
# labels=False stores the integer bin index rather than an interval label
for n_bins in (10, 100):
    df[f"f_bin_{n_bins}"] = pd.cut(df["f_1"], bins=n_bins, labels=False)

In [9]:
# display the dataframe, now including the f_bin_10 and f_bin_100 columns
df

Unnamed: 0,f_1,f_2,f_bin_10,f_bin_100
0,0.799735,0.086239,7,79
1,0.603219,0.104448,5,59
2,0.784079,0.752337,7,78
3,0.578152,0.985857,5,57
4,0.317394,0.661319,3,30
...,...,...,...,...
95,0.107624,0.117019,0,8
96,0.890627,0.990325,8,89
97,0.469581,0.317788,4,45
98,0.706427,0.067967,7,70


In [10]:
import numpy as np
from sklearn import impute
# random integer matrix: 10 samples x 6 features, values from 1 to 14
# (the upper bound passed to randint, 15, is exclusive)
X = np.random.randint(low=1, high=15, size=(10, 6))

In [11]:
# switch to a float dtype so the array is able to hold NaN
X = X.astype(float)
# mark 10 distinct cells as missing: flat positions are drawn without
# replacement and written through the array's flat view
X.flat[np.random.choice(X.size, 10, replace=False)] = np.nan
# fill the missing values using the 2 nearest neighbours
knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit_transform(X)

array([[10.5,  8. , 14. , 14. ,  2. ,  2. ],
       [ 9. ,  5. ,  3. , 14. ,  4. ,  1. ],
       [13. ,  8. ,  8.5,  1.5, 10. , 11. ],
       [ 2. ,  5. ,  4. ,  8. ,  9. ,  6.5],
       [ 9. ,  4. ,  7. ,  3. ,  3. ,  5. ],
       [10. ,  2. ,  6. ,  2. , 14. , 12. ],
       [ 5.5,  5. ,  1. , 12. , 12. ,  1. ],
       [ 9. ,  1. ,  1. ,  7. ,  6. ,  3.5],
       [ 7. ,  2. , 11. ,  1. ,  8.5, 12. ],
       [12. ,  2. ,  4. ,  8. ,  2. ,  2. ]])