# Data Cleaning & Data Pipeline

<div class="alert alert-block alert-warning">
  
<b>Notebook objectives:</b>
    
* Define a data cleaning function and use it to transform the raw data set
    
    
* Define a data preprocessing pipeline and use it to transform the cleaned data set
    
    
* Export the preprocessed X and y data as pickle files ready for ML modeling

</div>

# 1. Notebook set up

In [9]:
# Try downgrading the pandas version if pickle throws an error while loading
# !pip install pandas==1.4.1

In [1]:
###### Import packages

# Data handling
import numpy as np
import pandas as pd
import datetime as dt
#from IPython.display import HTML, Image #display formatted texts
import warnings
warnings.filterwarnings('ignore')

# Plotting packages
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn') # pretty graphs
import matplotlib.ticker as mticker
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter, FuncFormatter
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go


# Files to pickle
import pickle
import bz2
import _pickle as cPickle

# sampling
from sklearn.model_selection import train_test_split

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score, average_precision_score
from sklearn.inspection import partial_dependence

# Load bars
from tqdm.notebook import tqdm_notebook
from tqdm import tqdm
import time

# Data locations: root data folder, the w1..w5 sub-folders, the feature
# importance folder and the JSON key file
path = "/project/data/"
path_w1 = "/project/data/w1/"
path_w2 = "/project/data/w2/"
path_w3 = "/project/data/w3/"
path_w4 = "/project/data/w4/"
path_w5 = "/project/data/w5/"
path_feature = "/project/data/feature_importance/"
json_path = "/project/notebooks/map/KEYFILE.json"

# Reproducibility: single seed shared by every random operation in the notebook
seed = 2323

# Plot styling constants
bar_width = 0.3
bin_num = 25
colors = {
    'c1': ['blue', 'red'],
    'c2': ['red', 'blue', 'grey', 'purple'],
}
size = {
    'small_tick': 9,
    'tick': 10,
    'label': 14,
    'sub_title': 16,
    'main_title': 20,
}
fig_size = {
    'large': (30, 25),
    'small': (10, 5),
}

# Show every column when a data frame is displayed
pd.set_option('display.max_columns', None)

### 1.1 Load samples data sets

In [2]:
### Load the pickled sample that will be cleaned and preprocessed for ML modeling

# load pickled data for w2
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the compressed file even if unpickling fails.
with bz2.BZ2File(path_w2 + 'strata_target_w2_1pct', 'rb') as pickled_data:
    sample_df = cPickle.load(pickled_data)

# remove ID columns, which do not hold explanatory value
sample_df.drop(list(sample_df.filter(regex = 'ID')), axis = 1, inplace = True)
# n_df = pd.concat([w1_df,w2_df], ignore_index = True)

In [3]:
# reset_index returns a new frame, so sample_df itself is left as loaded
data = sample_df.reset_index()

# report the size of the sample
n_rows, n_cols = data.shape
print(F'Sample data set size:\n{n_rows} rows\n{n_cols} columns')

Sample data set size:
397979 rows
42 columns


In [4]:
### Load the pickle file holding the dropped prep df

# load pickled data
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the compressed file even if unpickling fails.
with bz2.BZ2File(path_feature + 'df_prep_dropped', 'rb') as pickled_data:
    df_prep_dropped = cPickle.load(pickled_data)
#df_prep_dropped.drop(list(sample_df.filter(regex = 'ID')), axis = 1, inplace = True) # removes ID columns that do not hold explanatory value

# 2. Data cleaning and pre-processing for ML modeling

Defining a data cleaning function that later will be used as input into a transformer.

### 2.1 Data Cleaning function

In [6]:
def clean_data_step(df, ordinal_apply = True):
    """Clean a raw report data frame and derive the columns used for modeling.

    The input frame is never modified; all work happens on a copy.

    Steps, in order:

    1. Rename the raw report columns to short snake_case names.
    2. Drop every row that contains a missing value.
    3. Drop every column that holds a single unique value.
    4. Parse ``date`` as a timestamp.
    5. Cast the int64/float64 columns (except the two cost columns and
       ``index``) to ``int64``.
    6. Replace the values of ``insertion_order``, ``line_item`` and
       ``creative_name`` by generated labels (anonymization).
    7. Add ``day_of_week``, store ``date`` as a ``dd/mm/YYYY`` string and
       replace the ``tod`` values by generated ``tod_<n>`` labels.
    8. Add the binary target ``user_response`` (1 when ``clicks`` is not 0).
    9. Replace ``media_cost`` and ``total_media_cost`` by ``*_log`` columns.
    10. Optionally ordinal-encode ``device_model``, ``app_url`` and ``city``
        by descending frequency (most frequent category gets code 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame using either the raw report column names (the keys of the
        rename mapping below) or the already renamed ones. The columns used
        by steps 4-10 must still be present after step 3, otherwise a
        ``KeyError`` is raised.
    ordinal_apply : bool, default True
        Whether to run step 10.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame with a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame is not renamed/truncated as a side effect.
    df = df.copy()

    # 1. mapper to rename columns to ease future referencing
    dict_features = {'Advertiser Currency': 'ad_currency', 'Date': 'date', 'Time of Day': 'tod', 'Advertiser': 'advertiser_name', 'Campaign': 'campaign_name', 
                    'Insertion Order': 'insertion_order', 'Line Item': 'line_item', 'Line Item Type': 'line_item_type', 'Platform': 'platform',
                    'Device Type': 'device_type', 'Device Make': 'device_make', 'Device Model': 'device_model', 'Operating System': 'os', 'Browser': 'browser',
                    'ISP or Carrier': 'isp_carrier', 'Environment': 'environment', 'Creative Type': 'creative_type', 'Creative': 'creative_name', 'Creative Size': 'creative_size',
                    'App/URL': 'app_url', 'Channel Type': 'channel_type', 'Channel': 'channel_name', 'Exchange': 'exchange', 'Inventory Source': 'inventory_source',
                    'Ad Position': 'ad_position', 'Ad Type': 'ad_type', 'Inventory Source Type': 'inventory_source_type', 'Position in Content': 'position_in_content',
                    'Public Inventory': 'public_inventory', 'Country': 'country', 'City': 'city', 'Impressions': 'impressions', 'Billable Impressions': 'billable_impressions',
                    'Active View: Viewable Impressions': 'viewable_impressions', 'Clicks': 'clicks', 'Total Conversions': 'total_conversions', 
                    'Post-Click Conversions': 'post_click_conversions', 'Post-View Conversions': 'post_view_conversions', 
                    'Revenue (Adv Currency)': 'media_cost', 'Media Cost (Advertiser Currency)': 'total_media_cost'}
    df.rename(columns = dict_features, inplace = True)

    # 2. Truncated instances are dropped (reported by the author as ~0.00008% of the data set)
    df.dropna(inplace = True)

    # 3. Drop features holding a single unique value
    unique_counts = df.nunique()
    constant_cols = unique_counts[unique_counts == 1].index.tolist()
    df.drop(columns = constant_cols, inplace = True)

    # 4. Parse the date; it stays a timestamp until the string conversion in step 7
    df['date'] = pd.to_datetime(df['date'])

    # 5. Cast numeric columns to int64, leaving the cost columns and the index column alone
    exclude_num_cols = ['media_cost','total_media_cost','index']
    numeric_cols = [var for var in df.select_dtypes(include=['int64', 'float64']).columns if var not in exclude_num_cols]

    try:
        for col in numeric_cols:
            df[col] = df[col].astype('int64')
    except ValueError as excp:
        # best effort: report the problem and carry on with the columns converted so far
        print('Make sure to pass columns with float dtype only') # friendly error message
        print(excp) # technical message as output
        print(type(excp)) # type error as output

    # 6. Anonymize: each distinct value becomes "<column><n>", numbered by order of first appearance
    for col in ('insertion_order', 'line_item', 'creative_name'):
        labels = {value: "{}{}".format(col, i + 1) for i, value in enumerate(df[col].unique())}
        df[col] = df[col].map(labels)

    # 7. String versions of the time features
    df['day_of_week'] = df['date'].dt.day_name()
    df['date'] = df['date'].dt.strftime('%d/%m/%Y')

    mapped_times = {value: "tod_{}".format(i + 1) for i, value in enumerate(df['tod'].unique())}
    df['tod'] = df['tod'].map(mapped_times)

    # 8. Target feature: 1 when the row has at least one click recorded, else 0
    df['user_response'] = np.where(df['clicks'] == 0, 0, 1)

    # 9. Log transform of the cost columns; the small offset keeps zero costs finite
    for col in ['media_cost', 'total_media_cost']:
        df[col + '_log'] = np.log(df[col] + 0.000001)
        df.drop([col], axis = 1, inplace = True)

    # A fresh 0..n-1 index is required so the encoded columns line up in the concat below
    df.reset_index(drop = True, inplace = True)

    # 10. Apply OrdinalEncoding if requested
    if ordinal_apply:
        catg_cols_1 = ['device_model', 'app_url', 'city']
        ordered_categories = []

        for var in catg_cols_1:
            # categories of the feature sorted from most to least frequent
            freq_df = df.groupby(var, as_index = False).size().rename(columns={'size': 'count'})
            freq_df['distribution'] = freq_df['count'] / freq_df['count'].sum()
            freq_df.sort_values(by = 'distribution', inplace = True, ascending = False)
            ordered_categories.append(freq_df[var].values.tolist())

        # one encoder per feature, each with its own frequency-ordered category list
        ordinal_encoding = make_column_transformer(
            *[(OrdinalEncoder(categories=[categories]), [var])
              for var, categories in zip(catg_cols_1, ordered_categories)]
        )

        encoding_df = pd.DataFrame(ordinal_encoding.fit_transform(df), columns = catg_cols_1)

        # swap the original categorical columns for their encoded versions
        df.drop(columns = catg_cols_1, inplace = True)
        df = pd.concat([df, encoding_df], axis = 1)

    # return cleaned df
    print(f'cleaned data set with {df.shape[0]} rows and {df.shape[1]} columns.')
    return df

### 2.3 Data cleaning using a transformer

In [7]:
# Defining custom cleaning step transformer: clean_data_step is wrapped in a
# FunctionTransformer so it exposes the scikit-learn fit/transform interface.
# No extra arguments are forwarded, so ordinal_apply keeps its default value.
datacleaning = FunctionTransformer(clean_data_step)

In [8]:
# fitting cleaning step transformer: runs clean_data_step on the sample data,
# which prints the size of the cleaned data set
df_cleaned = datacleaning.fit_transform(data)

cleaned data set with 397979 rows and 31 columns.


In [None]:
# Run when prepping Pearson Regularized df
# df_cleaned = df_prep_dropped

### 2.4 Defining X and y for preprocessing pipeline

In [9]:
# Target vector: the binary user_response column
y = df_cleaned['user_response']

# Feature matrix: everything except the target, the clicks column and the index column
X = df_cleaned.drop(columns = ['user_response', 'clicks', 'index'])

In [10]:
# checkpoint: number of unique values in the media_cost_log variable
len(X['media_cost_log'].unique())

5931

In [11]:
# checkpoint: X and y must hold the same number of rows
len(X) == len(y)

True

### 2.5 Defining Pipeline steps for categorical and numerical features

In [12]:
#### Defining data pipeline

exclude_list = ['index']

# Categoric and numeric features
numeric_cols = [var for var in X.select_dtypes(include=['int64', 'float64']).columns if var not in exclude_list]
catg_cols = X.select_dtypes(include=['object']).columns

# transform steps numeric: mean imputation followed by standardization
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler()),
])

# One-hot encoder returning a dense array (switch to True when a sparse matrix is required).
# Newer scikit-learn releases renamed the `sparse` argument to `sparse_output`
# and later removed the old name, so try the new name first and fall back.
try:
    onehot_encoder = OneHotEncoder(drop = 'first', sparse_output = False)
except TypeError:
    onehot_encoder = OneHotEncoder(drop = 'first', sparse = False)

# transform steps categoric: constant imputation followed by one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', onehot_encoder),
])

# complete pipeline: each sub-pipeline is applied to its own set of columns
pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_transformer, numeric_cols),
    ('categoric_pipeline', categorical_transformer, catg_cols)
])

### 2.6 Applying data preprocessing pipeline to X

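**Note**: When the one-hot encoder is configured to return a sparse matrix (`sparse = True`), the X produced by the preprocessing pipeline is a sparse matrix, because one-hot encoding categorical variables with many unique values generates a very large number of columns (e.g., the app_url feature has ~4K unique values).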

In [13]:
# Fit data pipeline
X = pipeline.fit_transform(X)

In [14]:
##### ONLY RUN WHEN USING A DENSE MATRIX FORM TO MAP BACK FEATURE NAMES #####

n = pipeline.transformers_[0][2] # mapping numeric
c = pipeline.transformers_[1][1]\
['onehot'].get_feature_names(catg_cols).tolist() # mapping categorical
full = n + c # mapping complete columns list

X = pd.DataFrame(X, columns = full) # getting df form

In [15]:
# checkpoint: preprocessing must not change the number of rows
len(X) == len(y)

True

### 2.7 Define X and y data set split


In [18]:
# transform train and test files to pickle form
export_files = {1:(X, 'X_complete'), 2:(y,'y_complete')}

for i in tqdm(export_files):
    sfile = bz2.BZ2File(path_feature + export_files[i][1] +'_w2_1pct', 'w') #'_w1_1pct'
    pickle.dump(export_files[i][0],sfile)
    sfile.close()

100%|██████████| 2/2 [00:16<00:00,  8.28s/it]


In [None]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state = seed, stratify = y)
print(F"X_train matrix: {X_train.shape}\ny_train vector:  {y_train.shape[0]}\nX_test matrix: {X_test.shape}\ny_test vector:  {y_test.shape[0]} ")

# 3. Export preprocessed train and test ready for ML modeling

In [None]:
# transform train and test files to pickle form
export_files = {1:(X_train, 'X_train'), 2:(y_train,'y_train'), 3: (X_test,'X_test'), 4: (y_test,'y_test')}

for i in tqdm(export_files):
    sfile = bz2.BZ2File(path_w2 + export_files[i][1] +'_dense_w2_1pct', 'w') #'_w1_1pct'
    pickle.dump(export_files[i][0],sfile)
    sfile.close()

# 4. Train a simple classifier to test X_train and y_train

**Note**: If the sparse X matrix is very large, the server might run out of RAM; training even a simple classifier would then require a more powerful machine.

In [None]:
# load the pickled X_cleaned_w1 data
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the compressed file even if unpickling fails.
with bz2.BZ2File(path + 'X_cleaned_w1', 'rb') as pickled_data:
    w1_df = cPickle.load(pickled_data)

# remove ID columns, which do not hold explanatory value
w1_df.drop(list(w1_df.filter(regex = 'ID')), axis = 1, inplace = True)

In [None]:
# ROC AUC scores of the baseline model, one array of fold scores per run
first_training = []

for i in tqdm(range(1), desc = 'Progress cross-validation'):
    # Naive baseline: logistic regression evaluated with 5 fold cross-validation
    baseline_model = LogisticRegression(random_state = seed)
    fold_scores = cross_val_score(
        baseline_model,
        X = X_train,
        y = y_train,
        cv = 5,
        scoring = 'roc_auc',
    )
    first_training.append(fold_scores)
    time.sleep(0.5)
    print('Logistic regression trained')

first_training

# Appendix

In [None]:
# transform train and test files to pickle form
export_files = {1:(X, 'X'), 2:(y,'y')}

for i in tqdm(export_files):
    sfile = bz2.BZ2File(path_w2 + export_files[i][1] +'_dropped_w1_5pct', 'w') #'_w1_1pct'
    pickle.dump(export_files[i][0],sfile)
    sfile.close()

### A. Get unique values in list

In [None]:
# one array of distinct values per column of X
unique_values = [X[col].unique() for col in X.columns]

# all of those values flattened into a single list, column after column
single_values = [value for column_values in unique_values for value in column_values]

In [5]:
# Defining X and y matrices
X = df_prep_dropped.drop(['user_response','clicks', 'index'],axis=1)
y = df_prep_dropped['user_response']

In [5]:
# save the target vector y as a CSV file in the root data folder
y.to_csv(path + 'ycomp_pearson_w2_1pct.csv')

In [9]:
# transform train and test files to pickle form
export_files = {1:(X, 'Xcomp'), 2:(y,'ycomp')}

for i in tqdm(export_files):
    sfile = bz2.BZ2File(path + export_files[i][1] +'_pearson_w2_1pct', 'w') #'_w1_1pct'
    pickle.dump(export_files[i][0],sfile)
    sfile.close()

100%|██████████| 2/2 [00:14<00:00,  7.06s/it]
