In [1]:
import numpy as np
import pandas as pd

In [2]:
data_path = 'data.csv' # retail dataset
df = pd.read_csv(data_path, encoding= 'unicode_escape', 
                 parse_dates=["InvoiceDate"],
                 infer_datetime_format = True,
                )

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


#### Changing datatype of InvoiceNo and CustomerID to str

In [7]:
df['CustomerID'] = df['CustomerID'].astype(str).apply(lambda x: x.split(".")[0])
df['InvoiceNo'] = df['InvoiceNo'].astype(str).apply(lambda x: x.split(".")[0])

In [19]:
df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID             object
Country                object
dtype: object

<h2><br> Understanding Data <br></h2>

In [54]:
from IPython.display import display, HTML

In [101]:
def dataframe_details(dataframe):
    print("\n\n")
    print(" NUMBER OF ROWS ".center(120,'-'), end = "\n\n")
    print("There are {} total records/rows.\n".format(dataframe.shape[0]))
    
    print(" NUMBER OF Columns ".center(120,'-'), end = "\n\n")
    print("There are {} features/attributes.\n".format(dataframe.shape[1]))
    
    print(" ATTRIBUTES TYPES ".center(120,'-'), end = "\n\n")
    print(dataframe.dtypes)
    
    print(" HEAD ".center(120,'-'), end = "\n\n")
    display(HTML(dataframe.head().to_html()))
    print("\n")
    
    print(" TAIL ".center(120,'-'), end = "\n\n")
    display(HTML(dataframe.tail().to_html()))
    print("\n")
    
    print(" NULL VALUES ".center(120,'-'), end = "\n\n")
    print(dataframe.isna().sum())
    

In [102]:
dataframe_details(df)




---------------------------------------------------- NUMBER OF ROWS ----------------------------------------------------

There are 541909 total records/rows.

-------------------------------------------------- NUMBER OF Columns ---------------------------------------------------

There are 8 features/attributes.

--------------------------------------------------- ATTRIBUTES TYPES ---------------------------------------------------

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID             object
Country                object
dtype: object
--------------------------------------------------------- HEAD ---------------------------------------------------------



Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom




--------------------------------------------------------- TAIL ---------------------------------------------------------



Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680,France




----------------------------------------------------- NULL VALUES ------------------------------------------------------

InvoiceNo         0
StockCode         0
Description    1454
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID        0
Country           0
dtype: int64
