In [1]:
# Importing necessary libraries
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

In [None]:
# Code to transform the json format columns in table
def load_df(csv_path='./Data/train.csv', n_rows=None):
    """Read a sessions CSV and expand its JSON-encoded columns into flat columns.

    The columns ``device``, ``geoNetwork``, ``totals`` and ``trafficSource``
    are parsed with ``json.loads`` while reading, flattened, and replaced by
    one column per JSON key named ``<column>_<key>`` (nested keys keep the
    dotted path produced by ``json_normalize``).

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    n_rows : int or None
        Number of rows to read from the top of the file (not a random
        sample); ``None`` reads every row.

    Returns
    -------
    pandas.DataFrame
        The non-JSON columns in their original order, followed by the
        flattened columns of each JSON column in the order listed above.

    Side effect: prints the file's base name and the resulting shape.
    """
    json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']

    # pandas >= 1.0 exposes json_normalize at the top level; the
    # pandas.io.json alias is deprecated, so use it only on older pandas.
    try:
        flatten = pd.json_normalize
    except AttributeError:
        from pandas.io.json import json_normalize as flatten

    df = pd.read_csv(
        csv_path,
        converters={column: json.loads for column in json_cols},  # parse JSON text into dicts
        dtype={'fullVisitorId': 'str'},  # read as text so the identifier is not parsed as a number
        nrows=n_rows,
    )

    flattened = []
    for column in json_cols:
        # A plain list of dicts is accepted by every json_normalize version.
        column_as_df = flatten(df[column].tolist())
        # Prefix each sub-column with the name of the JSON column it came from.
        column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]
        # Row i of the flattened frame belongs to row i of df.
        column_as_df.index = df.index
        flattened.append(column_as_df)

    # Drop the raw JSON columns and attach all flattened frames in one pass,
    # rather than copying the whole frame once per JSON column.
    df = pd.concat([df.drop(columns=json_cols)] + flattened, axis=1)

    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df


In [None]:
%%time

# Load the training data with load_df's defaults (./Data/train.csv, all rows);
# the JSON columns are flattened into regular columns while loading
df_train = load_df() 

In [None]:
# Remove the limit on displayed columns (None = show all). This is a global
# pandas option, so it stays in effect for the rest of the kernel session.
pd.set_option('display.max_columns', None)
df_train.head() # First five rows

In [None]:
# Function to find columns with a single value (non-informative columns) and drop them
def uniq_col_dropper(df):
    """Remove, in place, every column that holds one single distinct value.

    NaN counts as a value here (``dropna=False``): an all-NaN column is
    removed, while a column mixing one value with NaN is kept.

    The passed DataFrame itself is modified and then returned.
    """
    constant_columns = []
    for name in df.columns:
        if df[name].nunique(dropna=False) == 1:
            constant_columns.append(name)
    df.drop(columns=constant_columns, inplace=True)
    return df

In [None]:
# Drop columns that hold one single value across all rows (NaN counted as a value)
df_train = uniq_col_dropper(df_train)

# display.max_columns was already set to None in an earlier cell, so it is not set again here
df_train.head()  # first five rows

In [None]:
# Determining the number of missing values
def missing_values(df):
    """Print the number and percentage of missing values per column.

    Only columns that contain at least one missing value are listed, sorted
    by missing count in descending order. The percentage is relative to the
    number of rows in ``df``.

    Returns None; the report is written to standard output.
    """
    count_null = df.isnull().sum().sort_values(ascending=False)  # missing entries per column
    percent_null = count_null / len(df) * 100  # share of rows with a missing entry
    null = pd.concat([count_null, percent_null], axis=1, keys=['count_null', 'percent_null'])

    print("Columns with at least one missing value: ")
    print(null[null['count_null'] != 0])

In [None]:
missing_values(df_train) # Print count and percentage of missing values for each affected column

In [None]:
# Dealing with missing data and changing column types
def fill_na(df):
    """Fill known gaps in the flattened sessions frame and cast numeric columns.

    The passed DataFrame is modified in place and also returned.

    Fill values:
      * ``totals_pageviews`` -> 1
      * ``totals_newVisits`` -> 0
      * ``totals_bounces`` -> 0
      * ``totals_transactionRevenue`` -> 0.0
      * ``trafficSource_isTrueDirect`` -> False
      * ``trafficSource_adwordsClickInfo.isVideoAd`` -> True

    The three filled ``totals_*`` counters and ``totals_hits`` are cast to
    int; ``totals_transactionRevenue`` is cast to float. ``totals_hits`` is
    cast only, not filled.

    Raises
    ------
    KeyError
        If one of the columns named above is missing from ``df``.
    """
    int_fills = {
        'totals_pageviews': 1,
        'totals_newVisits': 0,
        'totals_bounces': 0,
    }
    bool_fills = {
        'trafficSource_isTrueDirect': False,
        'trafficSource_adwordsClickInfo.isVideoAd': True,
    }

    # Assign each filled column back to df. Calling
    # df[col].fillna(..., inplace=True) acts on a temporary selection and
    # does not update df under pandas' Copy-on-Write.
    for column, value in int_fills.items():
        # Convert to numbers first so the fill value and the data share a type.
        df[column] = pd.to_numeric(df[column]).fillna(value).astype(int)
    for column, value in bool_fills.items():
        df[column] = df[column].fillna(value)

    df['totals_transactionRevenue'] = (
        pd.to_numeric(df['totals_transactionRevenue']).fillna(0.).astype(float)
    )
    df['totals_hits'] = pd.to_numeric(df['totals_hits']).astype(int)
    return df

In [None]:
# Fill the known gaps and cast the totals_* counters and the revenue to numeric types
df_train = fill_na(df_train)

In [None]:
# Inspect the column types after filling and casting
df_train.dtypes

In [None]:
# Function to find columns whose non-null entries all share a single value (NaN is ignored here, unlike uniq_col_dropper) and drop them
def uniq_col_drop_final(df):
    """Remove, in place, every column whose non-null entries share one value.

    NaN is ignored when counting distinct values (the ``nunique`` default),
    so a column mixing one value with NaN is removed, while an all-NaN
    column (zero distinct values) is kept.

    The passed DataFrame itself is modified and then returned.
    """
    single_valued = []
    for name in df.columns:
        if df[name].nunique() == 1:
            single_valued.append(name)
    df.drop(columns=single_valued, inplace=True)
    return df

In [None]:
# Drop columns whose non-null entries all share one value (NaN is ignored in this pass)
df_train = uniq_col_drop_final(df_train)

# display.max_columns was already set to None in an earlier cell, so it is not set again here
df_train.head()  # first five rows

In [None]:
# Remove two columns that the author considers directly derivable from other columns
df_train = df_train.drop(columns=['totals_bounces', 'totals_newVisits'])

In [None]:
# Replacing the duplicated revenue and hits by their sum and other features with the first entry when sessionId is the same
def dup_remover(df):
    """Collapse rows that share a ``sessionId`` into one row per session.

    Within each session:
      * ``totals_hits``, ``totals_pageviews`` and
        ``totals_transactionRevenue`` are summed,
      * ``visitStartTime`` takes its minimum,
      * every other column takes pandas' ``'first'`` aggregate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``sessionId`` and the four columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``sessionId`` first, then the 'first'-aggregated
        columns, then the summed columns, then ``visitStartTime``.
    """
    g_cols = ['sessionId']
    sum_cols = ['totals_hits', 'totals_pageviews', 'totals_transactionRevenue']
    min_cols = ['visitStartTime']

    # Derive the remaining columns from the frame that was passed in,
    # not from the notebook-level df_train variable.
    special = set(g_cols + sum_cols + min_cols)
    agg_spec = {col: 'first' for col in df.columns if col not in special}
    agg_spec.update({col: 'sum' for col in sum_cols})
    agg_spec.update({col: 'min' for col in min_cols})

    return df.groupby(g_cols).agg(agg_spec).reset_index()

In [None]:
# Collapse rows that share a sessionId into a single row per session
df_train = dup_remover(df_train)

# display.max_columns was already set to None in an earlier cell, so it is not set again here
df_train.head()

In [None]:
# Sanity check: collect rows whose sessionId still occurs more than once,
# ordered by sessionId and date
dup = df_train[df_train.duplicated(['sessionId'], keep=False)].sort_values(by=['sessionId','date'])

# Of those, show the sessions with at least one non-null revenue entry.
# dup_remover returns one row per sessionId, so an empty result is expected here.
dup[dup.sessionId.isin(dup[dup['totals_transactionRevenue'].notnull()].sessionId)]

In [None]:
# Dealing with the time data
def date_process(df):
    """Replace ``date`` and ``visitStartTime`` with derived calendar columns.

    ``date`` is parsed with the ``%Y%m%d`` format and ``visitStartTime`` as
    seconds since the epoch. The columns ``_year``, ``_month``, ``_day`` and
    ``_dayofWeek`` come from ``date``; ``_hour`` comes from
    ``visitStartTime``. Both source columns are then dropped.

    The passed DataFrame itself is modified and then returned.
    """
    session_day = pd.to_datetime(df['date'], format='%Y%m%d')
    session_start = pd.to_datetime(df['visitStartTime'], unit='s')

    derived = {
        '_year': session_day.dt.year,
        '_month': session_day.dt.month,
        '_day': session_day.dt.day,
        '_dayofWeek': session_day.dt.dayofweek,
        '_hour': session_start.dt.hour,
    }
    for name, values in derived.items():
        df[name] = values

    df.drop(columns=['date', 'visitStartTime'], inplace=True)
    return df

In [None]:
# Replace date / visitStartTime with the derived year, month, day, weekday and hour columns
df_train = date_process(df_train)

# display.max_columns was already set to None in an earlier cell, so it is not set again here
df_train.head()

In [None]:
# Log-transform (log1p) the transaction revenue
def normalize(df):
    """Apply ``log1p`` to the ``totals_transactionRevenue`` column.

    The transform reads from and writes to the DataFrame that was passed in
    (not the notebook-level ``df_train``). No other column is changed.

    The passed DataFrame itself is modified and then returned.
    """
    # log1p(x) = log(1 + x), so a revenue of 0 stays 0.
    df['totals_transactionRevenue'] = np.log1p(df['totals_transactionRevenue'])
    return df

In [None]:
df_train = normalize(df_train) # Log-transform (log1p) the transaction revenue; no other column is changed

In [None]:
%%time

# Save the flattened, cleaned data set; index=False leaves the row index out of the file
df_train.to_csv("./Data/train-flattened.csv", index=False)

In [None]:
#%%time  -- disabled cell magic; uncomment it as the first line of the cell to time the read

# Read the saved file back as a check: fullVisitorId is kept as text and sessionId becomes the index
train_flattened = pd.read_csv('./Data/train-flattened.csv', dtype={'fullVisitorId': 'str'}, index_col='sessionId')

In [None]:
# display.max_columns was already set to None in an earlier cell, so it is not set again here
train_flattened.head()