In [1]:
# Yaks Hotel Data Cleaning
# Import the required libraries and configure the pandas display settings.
import pandas as pd
import psycopg2
from getpass import getpass
# Show at most 40 columns and 100 rows whenever a DataFrame is displayed.
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 100)

In [12]:
# Read csv from given path, return the DataFrame
def get_data(path):
    """Load a header-less CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file. The file is read with ``header=None``, so
        the resulting columns are labelled 0, 1, 2, ...

    Returns
    -------
    pandas.DataFrame or int
        The loaded DataFrame, or the integer ``0`` when the file is missing
        or cannot be read/parsed (a message is printed in that case).
        The ``0`` sentinel is kept for backward compatibility with callers.
    """
    try:
        return pd.read_csv(path, header=None)
    except FileNotFoundError:
        print("csv not found")
        return 0
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # The file exists but is unreadable, empty or malformed: report the
        # real reason instead of claiming that the file is missing.
        print("csv could not be read:", exc)
        return 0

# Given a dirty DataFrame from get_data(), return the cleaned DataFrame
def cleaning(raw_data):
    """Turn the raw report DataFrame into one tidy row per item line.

    The raw frame (integer column labels, as produced by ``get_data``) mixes
    three kinds of rows that this function relies on:

    * item rows   - column 0 contains a code (two capital letters followed by
      digits); the item values are read from fixed column positions;
    * bill rows   - column 0 equals ``"Bill No :"``; the bill number is read
      from column 2 and applied to the item rows directly below;
    * total rows  - column 3 equals ``"Total"``; the payment values and the
      remark are read from this row and applied to the preceding run of
      consecutive item rows.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Raw report with a consecutive integer index (rows are matched through
        ``index + 1`` arithmetic).

    Returns
    -------
    pandas.DataFrame or None
        The cleaned frame, indexed by the original position of each item row,
        with the revenue and payment columns converted to ``int64``.
        ``None`` is returned (after printing a message) when ``raw_data``
        cannot be processed.
    """
    columns_name = ["Code", "Description", "Qty", "Food Rev", "Bev Rev", "Oth FB Rev", "Amount +/+", "SvChg", "Tax",
                    "Amount Nett", "Cash", "Credit Card", "City Ledger", "Bill FO", "Other Pymt", "Remarks", "Bill No"]

    # Values copied straight from the item rows, and their raw column positions.
    col_ins_1 = ["Code", "Description", "Qty", "Food Rev", "Bev Rev", "Oth FB Rev", "Amount +/+", "SvChg", "Tax", "Amount Nett"]
    item_positions = [0, 2, 8, 9, 11, 12, 14, 15, 17, 19]

    # Values copied from the "Total" row of each bill, and their raw column positions.
    col_ins_2 = ["Cash", "Credit Card", "City Ledger", "Bill FO", "Other Pymt", "Remarks"]
    payment_positions = [21, 23, 25, 27, 29, 31]

    col_to_clean = ["Food Rev", "Bev Rev", "Oth FB Rev", "Amount +/+", "SvChg", "Tax", "Amount Nett",
                    "Cash", "Credit Card", "City Ledger", "Bill FO", "Other Pymt"]

    try:
        # The code values have a pattern: 2 capital letters followed by digits.
        # A raw string keeps "\d" a regex escape instead of a string escape.
        data_by_code = raw_data[raw_data[0].str.contains(r"[A-Z]{2}\d+", na=False)]
        data_by_bill = raw_data.loc[raw_data[0] == "Bill No :", [0, 1, 2]]
        data_by_total = raw_data.loc[raw_data[3] == "Total", 3]

        # One output row per item row, keeping the raw index so that rows can
        # still be related to the neighbouring bill/total rows.
        cleaned = pd.DataFrame(index=data_by_code.index, columns=columns_name, dtype=object)
        for name, position in zip(col_ins_1, item_positions):
            cleaned[name] = data_by_code[position]

        # Filling "Bill No" column: the bill number applies to every item row
        # that directly follows the bill row.
        for index in data_by_bill.index:
            index_bill = index + 1
            while index_bill in data_by_code.index:
                cleaned.loc[index_bill, "Bill No"] = data_by_bill.loc[index, 2]
                index_bill += 1

        # Filling the values associated with a bill (Cash - Remarks).
        # Several consecutive item rows share one bill, so each run of
        # consecutive rows is filled in one go from the next "Total" row.
        total_labels = data_by_total.index
        index_start = None
        for index in data_by_code.index:
            if index_start is None:
                index_start = index
            if index + 1 in data_by_code.index:
                continue  # still inside the same run of item rows

            # `index` is the last row of the run: use the first total row at
            # or after it. Fail loudly instead of scanning forever when the
            # report has no such row.
            following_totals = total_labels[total_labels >= index]
            if len(following_totals) == 0:
                raise ValueError("no 'Total' row found after item row %s" % index)
            index_total = following_totals.min()

            payments = raw_data.loc[index_total, payment_positions]
            for name, value in zip(col_ins_2, payments):
                cleaned.loc[index_start:index, name] = value
            index_start = None

        # Fill missing data.
        # NOTE(review): "210-00935" is the hard-coded default remark used by
        # the original code - confirm that this is the intended value.
        cleaned = cleaned.fillna({
            "Remarks": "210-00935",
            "Cash": "0",
            "Credit Card": "0",
            "City Ledger": "0",
            "Bill FO": "0",
            "Other Pymt": "0",
        })

        # Convert to int: text columns use spaces/periods as separators, so
        # strip them first. regex=False makes "." a literal period.
        for col_name in col_to_clean:
            column = cleaned[col_name]
            if column.dtype == "object" or pd.api.types.is_string_dtype(column):
                cleaned[col_name] = (
                    column.str.replace(" ", "", regex=False)
                          .str.replace(".", "", regex=False)
                )
        cleaned = cleaned.astype({col_name: "int64" for col_name in col_to_clean})
        return cleaned
    except Exception as exc:
        # Best-effort contract kept from the original: report and return None.
        print("Invalid raw DataFrame:", exc)
        return None

def export_xlsx(path,df):
    """Write ``df`` to an Excel workbook at ``path`` without the index column.

    Failures are reported on stdout instead of being raised:

    * ``PermissionError`` - typically the target workbook is open in another
      program; the user is asked to close it and run again;
    * any other error - the actual reason is printed, so problems such as a
      missing Excel engine or an invalid ``df`` are not mistaken for a
      locked file.
    """
    try:
        df.to_excel(path, index=False)
    except PermissionError:
        print(path,"is opened\nPlease close and run again")
    except Exception as exc:
        print("Could not export to", path, "-", exc)
    
def export_sql(df):
    """Create the ``report`` table in PostgreSQL and load ``df`` into it.

    The password is requested interactively with ``getpass``; the connection
    targets database ``yaks`` on ``localhost`` as user ``postgres``.

    Behaviour:

    * If the connection cannot be opened, a message is printed and the
      function returns without doing anything else.
    * If ``CREATE TABLE`` fails (for example because ``report`` already
      exists), a message is printed and no rows are inserted - the same
      "insert only into a freshly created table" behaviour as before.
    * Table creation and all inserts are committed together, so a failure
      while inserting leaves the database unchanged.
    * The connection is always closed, even when a statement raises.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data whose 17 columns are in the same order as the columns
        of the ``report`` table.
    """
    print('Connecting to the PostgreSQL database...')
    password = getpass("Input password for PostgreSQL: ")

    try:
        conn = psycopg2.connect(host="localhost",database="yaks", user="postgres", password=password)
    except psycopg2.Error as exc:
        print("Wrong Username/Password or database did not exist")
        # Also show the driver's own message (e.g. server not running).
        print("Details:", exc)
        return

    create_query = """
    CREATE TABLE report(
        Code TEXT,
        Description TEXT,
        Qty INTEGER,
        "Food Rev" INTEGER,
        "Bev Rev" INTEGER,
        "Oth FB Rev" INTEGER,
        "Amount +/+" INTEGER,
        "SvChg" INTEGER,
        "Tax" INTEGER,
        "Amount Nett" INTEGER,
        "Cash" INTEGER,
        "Credit Card" INTEGER,
        "City Ledger" INTEGER,
        "Bill FO" INTEGER,
        "Other Pymt" INTEGER,
        "Remarks" TEXT,
        "Bill No" TEXT);
    """
    insert_query = "INSERT INTO report VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"

    try:
        cur = conn.cursor()
        print('Connected\nPostgreSQL database version:')
        cur.execute('SELECT version()')
        print(cur.fetchone())

        try:
            cur.execute(create_query)
        except psycopg2.Error as exc:
            # A failed statement aborts the transaction; reset it explicitly.
            conn.rollback()
            if exc.pgcode == "42P07":  # SQLSTATE duplicate_table
                print("table report already created")
            else:
                print("Table creation failed:", exc)
        else:
            print("Table report created")
            # Plain tuples of Python scalars, one per DataFrame row.
            records = list(df.itertuples(index=False, name=None))
            cur.executemany(insert_query, records)
            # Single commit: the table and its rows appear together or not at all.
            conn.commit()
            print("Data inserted into report table")
    finally:
        # Closing discards any uncommitted work and releases the connection.
        conn.close()
        print("Connection closed")

In [13]:
path_input = "POS-Report-Detail Outlet-201701.csv"
path_output = "POS-Report-Detail Outlet-201701 CLEANED.xlsx"

# Each stage only runs when the previous one produced usable data:
# get_data() returns 0 and cleaning() returns None when they fail.
clean_data = None

print("(1/4) Importing dataset from",path_input)
raw_data = get_data(path_input)

if not isinstance(raw_data, pd.DataFrame):
    print("Stopped: the dataset could not be loaded")
else:
    print("(2/4) Cleaning DataFrame")
    clean_data = cleaning(raw_data)

    if clean_data is None:
        print("Stopped: the dataset could not be cleaned")
    else:
        print("(3/4) Exporting to xlsx: ",path_output)
        export_xlsx(path_output,clean_data)

        # The PostgreSQL password is requested interactively by export_sql();
        # never write credentials into the notebook.
        print("(4/4) Exporting to SQL")
        export_sql(clean_data)

(1/4) Importing dataset from POS-Report-Detail Outlet-201701.csv
(2/4) Cleaning DataFrame
(3/4) Exporting to xslx:  POS-Report-Detail Outlet-201701 CLEANED.xlsx
(4/4) Exporting to SQL
Connecting to the PostgreSQL database...
Input password for PostgreSQL: ········
Wrong Username/Password or database did not exist
