<a href="https://colab.research.google.com/github/EAsencios/DEEP-LEARING/blob/master/data_wrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Data Wrangling**

In [34]:
import random
import numpy as np
import pandas as pd

import datetime
from random import randrange

In [35]:
from time import strftime
startDate = datetime.datetime(2020, 1, 1,8)


def _random_date(start,date_count):
    """This function generates a random date based on params
    Args:
        start (date object): the base date
        date_count (int): number of dates to be generated
    Returns:
        list of random dates

    """
    current = start
    while date_count > 0:
        curr = current + datetime.timedelta(days=randrange(42))
        yield curr
        date_count-=1


def generate_sample_data(row_count=100):
    """This function generates a random transaction dataset
    Args:
        row_count (int): number of rows for the dataframe
    Returns:
        a pandas dataframe

    """

    # sentinels
    startDate = datetime.datetime(2020, 1, 1, 8)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000

    data_dict = {
    'Serial No': np.arange(row_count)+serial_number_sentinel,
    'Date': np.random.permutation(pd.to_datetime([x.strftime('%d-%m-%Y') for x in _random_date(startDate,row_count)]).date),
    'User ID': np.random.permutation(np.random.randint(0, row_count, size=int(row_count/10)) + user_id_sentinel).tolist()*10,
    'Product ID': np.random.permutation(np.random.randint(0, row_count, size=int(row_count/10))+ product_id_sentinel).tolist()*10 ,
    'Quantity Purchased': np.random.permutation(np.random.randint(1, 42, size=row_count)),
    'Price': np.round(np.abs(np.random.randn(row_count)+1)*price_sentinel, decimals=2),
    'User Type':np.random.permutation([chr(random.randrange(97, 97 + 3 + 1)) for i in range(row_count)])
    }

    
    # introduce missing values
    for index in range(int(np.sqrt(row_count))): 
        data_dict['Price'][np.argmax(data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan
        data_dict['User Type'][np.argmax(data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan
        data_dict['Date'][np.argmax(data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan
        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0
        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1
        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101
        

    df = pd.DataFrame(data_dict)

    return(df)


def describe_dataframe(df=pd.DataFrame()):
    """This function generates descriptive stats of a dataframe
    Args:
        df (dataframe): the dataframe to be analyzed
    Returns:
        None

    """
    print("\n\n")
    print("*"*30)
    print("About the Data")
    print("*"*30)

    print("Number of rows::",df.shape[0])
    print("Number of columns::",df.shape[1])
    print("\n")
    
    print("Column Names::",df.columns.values.tolist())
    print("\n")
    
    print("Column Data Types::\n",df.dtypes)
    print("\n")
        
    print("Columns with Missing Values::",df.columns[df.isnull().any()].tolist())
    print("\n")
       
    print("Number of rows with Missing Values::",len(pd.isnull(df).any(1).to_numpy().nonzero()[0].tolist()))
    print("\n")

    
    print("Sample Indices with missing data::",pd.isnull(df).any(1).to_numpy().nonzero()[0].tolist()[0:5])
    print("\n")
    
    print("General Stats::")
    print(df.info())
    print("\n")

    
    print("Summary Stats::")
    print(df.describe())
    print("\n")
    
    print("Dataframe Sample Rows::")
    display(df.head(5))
    
    return


def cleanup_column_names(df,rename_dict={},do_inplace=True):
    """This function renames columns of a pandas dataframe
       It converts column names to snake case if rename_dict is not passed. 
    Args:
        rename_dict (dict): keys represent old column names and values point to 
                            newer ones
        do_inplace (bool): flag to update existing dataframe or return a new one
    Returns:
        pandas dataframe if do_inplace is set to False, None otherwise

    """
    if not rename_dict:
        return df.rename(columns={col: col.lower().replace(' ','_') 
                    for col in df.columns.values.tolist()}, 
                  inplace=do_inplace)
    else:
        return df.rename(columns=rename_dict,inplace=do_inplace)

def expand_user_type(u_type):
    """This function maps user types to user classes
    Args:
        u_type (str): user type value
    Returns:
        (str) user_class value

    """
    if u_type in ['a','b']:
        return 'new'
    elif u_type == 'c':
        return 'existing'
    elif u_type == 'd':
        return 'loyal_existing'
    else:
        return 'error'


In [36]:
df = generate_sample_data(1000)

In [37]:
describe_dataframe(df)




******************************
About the Data
******************************
Number of rows:: 1000
Number of columns:: 7


Column Names:: ['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


Column Data Types::
 Serial No               int64
Date                   object
User ID                 int64
Product ID              int64
Quantity Purchased      int64
Price                 float64
User Type              object
dtype: object


Columns with Missing Values:: ['Date', 'Price']


Number of rows with Missing Values:: 61


Sample Indices with missing data:: [0, 1, 4, 5, 8]


General Stats::
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Serial No           1000 non-null   int64  
 1   Date                970 non-null    object 
 2   User ID             1000 non-null   int64  
 3   Pro

Unnamed: 0,Serial No,Date,User ID,Product ID,Quantity Purchased,Price,User Type
0,1000,,-101,0,6,4010.77,n
1,1001,,5630,1061,25,5387.04,n
2,1002,2020-01-16,5750,978,40,3523.46,n
3,1003,2020-09-02,5235,772,9,3434.69,n
4,1004,,5477,323,38,764.0,n


In [38]:
print('Dataframe columns:\n{}'.format(df.columns.tolist()))

Dataframe columns:
['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


In [39]:
cleanup_column_names(df)

In [40]:
print('Dataframe columns:\n{}'.format(df.columns.tolist()))

Dataframe columns:
['serial_no', 'date', 'user_id', 'product_id', 'quantity_purchased', 'price', 'user_type']


In [41]:
print("Using Column Name::")
df['quantity_purchased'].values[:10]

Using Column Name::


array([ 6, 25, 40,  9, 38,  1, 40, 40, 14,  3])

In [42]:
print("Using Column Name::" )
df.quantity_purchased.values[:10]

Using Column Name::


array([ 6, 25, 40,  9, 38,  1, 40, 40, 14,  3])

In [43]:
print("Using Column Data Type::" )
df.select_dtypes(include=['float64']).values[:10]

Using Column Data Type::


array([[4010.77],
       [5387.04],
       [3523.46],
       [3434.69],
       [ 764.  ],
       [ 900.14],
       [4085.32],
       [4659.8 ],
       [3260.03],
       [2269.51]])

In [44]:
print('Select Specific row indices::')
df.iloc[[20, 30, 50, 60]]

Select Specific row indices::


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
20,1020,2020-02-01,5377,128,27,2454.31,n
30,1030,2020-06-01,5747,644,4,5781.25,n
50,1050,2020-08-01,5881,1062,36,2937.03,b
60,1060,2020-01-16,5005,439,25,1294.13,a


In [45]:
print('Excluding Specific Row indices::')
df.drop([1, 14, 21], axis=0).head(20)

Excluding Specific Row indices::


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
0,1000,,-101,0,6,4010.77,n
2,1002,2020-01-16,5750,978,40,3523.46,n
3,1003,2020-09-02,5235,772,9,3434.69,n
4,1004,,5477,323,38,764.0,n
5,1005,,5807,695,1,900.14,n
6,1006,2020-03-02,5390,988,40,4085.32,n
7,1007,2020-09-02,5933,884,40,4659.8,n
8,1008,,5801,788,14,3260.03,n
9,1009,,5878,1067,3,2269.51,n
10,1010,,5870,471,38,2041.59,n


In [46]:
print('Subsetting based on logical condition(s)::')
df[df.quantity_purchased>25].head()

Subsetting based on logical condition(s)::


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
2,1002,2020-01-16,5750,978,40,3523.46,n
4,1004,,5477,323,38,764.0,n
6,1006,2020-03-02,5390,988,40,4085.32,n
7,1007,2020-09-02,5933,884,40,4659.8,n
10,1010,,5870,471,38,2041.59,n


In [47]:
print('Subsetting based on offset from top (bottom)::')
df[100:].head()  # df.tail(-100)

Subsetting based on offset from top (bottom)::


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
100,1100,2020-10-02,5343,282,1,3130.55,a
101,1101,2020-01-02,5630,1061,27,3711.49,b
102,1102,,5750,978,27,2365.15,b
103,1103,2020-11-01,5235,772,11,4121.53,b
104,1104,2020-01-22,5477,323,11,2619.14,a


In [48]:
df.tail(-100)

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
100,1100,2020-10-02,5343,282,1,3130.55,a
101,1101,2020-01-02,5630,1061,27,3711.49,b
102,1102,,5750,978,27,2365.15,b
103,1103,2020-11-01,5235,772,11,4121.53,b
104,1104,2020-01-22,5477,323,11,2619.14,a
...,...,...,...,...,...,...,...
995,1995,2020-02-02,5722,142,23,2.53,d
996,1996,2020-06-01,5184,947,20,1504.71,d
997,1997,2020-02-01,5979,171,19,307.55,a
998,1998,2020-01-13,5419,135,16,13.51,d


In [49]:
df['date'] = pd.to_datetime(df.date)
df.dtypes

serial_no                      int64
date                  datetime64[ns]
user_id                        int64
product_id                     int64
quantity_purchased             int64
price                        float64
user_type                     object
dtype: object

In [50]:
df['user_class'] = df['user_type'].map(expand_user_type)
df

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class
0,1000,NaT,-101,0,6,4010.77,n,error
1,1001,NaT,5630,1061,25,5387.04,n,error
2,1002,2020-01-16,5750,978,40,3523.46,n,error
3,1003,2020-09-02,5235,772,9,3434.69,n,error
4,1004,NaT,5477,323,38,764.00,n,error
...,...,...,...,...,...,...,...,...
995,1995,2020-02-02,5722,142,23,2.53,d,loyal_existing
996,1996,2020-06-01,5184,947,20,1504.71,d,loyal_existing
997,1997,2020-02-01,5979,171,19,307.55,a,new
998,1998,2020-01-13,5419,135,16,13.51,d,loyal_existing


In [51]:
df['purchase_week'] = df[['date']].applymap(lambda dt:dt.week if not pd.isnull(dt.week) else 0)
df

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
0,1000,NaT,-101,0,6,4010.77,n,error,0
1,1001,NaT,5630,1061,25,5387.04,n,error,0
2,1002,2020-01-16,5750,978,40,3523.46,n,error,3
3,1003,2020-09-02,5235,772,9,3434.69,n,error,36
4,1004,NaT,5477,323,38,764.00,n,error,0
...,...,...,...,...,...,...,...,...,...
995,1995,2020-02-02,5722,142,23,2.53,d,loyal_existing,5
996,1996,2020-06-01,5184,947,20,1504.71,d,loyal_existing,23
997,1997,2020-02-01,5979,171,19,307.55,a,new,5
998,1998,2020-01-13,5419,135,16,13.51,d,loyal_existing,3


In [52]:
df.select_dtypes(include=[np.number]).apply(lambda x: x.max() - x.min())

serial_no             2000.00
user_id               6082.00
product_id            1084.00
quantity_purchased      40.00
price                 8143.66
purchase_week           49.00
dtype: float64

In [53]:
print('Drop Row with missing dates::')
df_dropped = df.dropna(subset=['date'])
print("Shape::", df_dropped.shape)
df_dropped

Drop Row with missing dates::
Shape:: (970, 9)


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
2,1002,2020-01-16,5750,978,40,3523.46,n,error,3
3,1003,2020-09-02,5235,772,9,3434.69,n,error,36
6,1006,2020-03-02,5390,988,40,4085.32,n,error,10
7,1007,2020-09-02,5933,884,40,4659.80,n,error,36
13,1013,2020-01-16,5242,157,23,586.19,n,error,3
...,...,...,...,...,...,...,...,...,...
995,1995,2020-02-02,5722,142,23,2.53,d,loyal_existing,5
996,1996,2020-06-01,5184,947,20,1504.71,d,loyal_existing,23
997,1997,2020-02-01,5979,171,19,307.55,a,new,5
998,1998,2020-01-13,5419,135,16,13.51,d,loyal_existing,3


In [54]:
print('Fill Missing Price values with mean price:::')
df_dropped['price'].fillna(value=np.round(df.price.mean(), decimals=2), inplace=False)

Fill Missing Price values with mean price:::


2      3523.46
3      3434.69
6      4085.32
7      4659.80
13      586.19
        ...   
995       2.53
996    1504.71
997     307.55
998      13.51
999    5311.36
Name: price, Length: 970, dtype: float64

In [55]:
print('Fill Missing user_type values with value from \ previous row (forward fill)::')
df_dropped['user_type'].fillna(method='ffill', inplace=False)

Fill Missing user_type values with value from \ previous row (forward fill)::


2      n
3      n
6      n
7      n
13     n
      ..
995    d
996    d
997    a
998    d
999    a
Name: user_type, Length: 970, dtype: object

In [56]:
print('Fill Missing user_type values with valve from \ next row (backward fill)::')
df_dropped['user_type'].fillna(method='bfill', inplace=False)

Fill Missing user_type values with valve from \ next row (backward fill)::


2      n
3      n
6      n
7      n
13     n
      ..
995    d
996    d
997    a
998    d
999    a
Name: user_type, Length: 970, dtype: object

In [57]:
df_dropped[df_dropped.duplicated(subset=['serial_no'])]

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
85,-1,2020-07-01,5273,991,19,956.17,b,new,27
91,-1,2020-01-13,5018,199,8,1616.73,b,new,3
110,-1,2020-04-02,5870,471,41,1824.35,b,new,14
135,-1,2020-06-02,5965,131,22,4670.6,c,existing,23
142,-1,2020-06-02,5486,270,27,1266.48,b,new,23
165,-1,2020-01-01,5693,703,22,6623.95,b,new,1
169,-1,2020-02-01,5494,377,16,1577.82,a,new,5
183,-1,2020-01-26,5065,950,26,5977.39,b,new,4
205,-1,2020-02-02,5807,695,36,1518.26,c,existing,5
240,-1,2020-02-01,5549,491,36,2347.97,d,loyal_existing,5


In [58]:
df_dropped.drop_duplicates(subset=['serial_no'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [59]:
df_dropped[df_dropped.duplicated(subset=['serial_no'])]

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week


In [61]:
# using map to dummy encode
type_map={'a':0, 'b':1, 'c':2, 'd':3, np.NAN:-1}
df['encoded_user_type'] = df.user_type.map(type_map)
df.tail()


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week,encoded_user_type
995,1995,2020-02-02,5722,142,23,2.53,d,loyal_existing,5,3.0
996,1996,2020-06-01,5184,947,20,1504.71,d,loyal_existing,23,3.0
997,1997,2020-02-01,5979,171,19,307.55,a,new,5,0.0
998,1998,2020-01-13,5419,135,16,13.51,d,loyal_existing,3,3.0
999,1999,2020-03-01,5491,779,11,5311.36,a,new,9,0.0


In [62]:
# using get_dummies to one hot encode
pd.get_dummies(df, columns=['user_type'])

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_class,purchase_week,encoded_user_type,user_type_a,user_type_b,user_type_c,user_type_d,user_type_n
0,1000,NaT,-101,0,6,4010.77,error,0,,0,0,0,0,1
1,1001,NaT,5630,1061,25,5387.04,error,0,,0,0,0,0,1
2,1002,2020-01-16,5750,978,40,3523.46,error,3,,0,0,0,0,1
3,1003,2020-09-02,5235,772,9,3434.69,error,36,,0,0,0,0,1
4,1004,NaT,5477,323,38,764.00,error,0,,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1995,2020-02-02,5722,142,23,2.53,loyal_existing,5,3.0,0,0,0,1,0
996,1996,2020-06-01,5184,947,20,1504.71,loyal_existing,23,3.0,0,0,0,1,0
997,1997,2020-02-01,5979,171,19,307.55,new,5,0.0,1,0,0,0,0
998,1998,2020-01-13,5419,135,16,13.51,loyal_existing,3,3.0,0,0,0,1,0
