In [1811]:
def dict_factory(cursor, row):
    fields = [column[0] for column in cursor.description]
    return {key: value for key, value in zip(fields, row)}

# DB Connect

In [1812]:
import sqlite3
import pandas as pd
import datetime

con = sqlite3.connect("superstore3.db")
con.row_factory = dict_factory
cur = con.cursor()
con.execute('PRAGMA foreign_keys = ON;')
cur.execute('PRAGMA foreign_keys = ON;')


<sqlite3.Cursor at 0x2c0989ccc00>

In [1813]:
# to show all rows and columns 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [1814]:
# read the csv into a pandas dataframe
orders = pd.read_csv('Data/Global-Superstore-Orders.csv')
returns = pd.read_csv('Data/Global-Superstore-Returns.csv')
people = pd.read_csv('Data/Global-Superstore-People.csv')

# Convert column names to lowercase and remove spaces and hyphens
orders.rename(columns=lambda x: x.replace(' ', '_').replace('-', ''), inplace=True)
returns.rename(columns=lambda x: x.replace(' ', '_').replace('-', ''), inplace=True)
people.rename(columns=lambda x: x.replace(' ', '_').replace('-', ''), inplace=True)

orders['Order_Date'] = pd.to_datetime(orders['Order_Date'])
orders['Ship_Date'] = pd.to_datetime(orders['Ship_Date'])

orders['Sales'] = orders['Sales'].str.replace(',', '.')
orders['Discount'] = orders['Discount'].str.replace(',', '.')
orders['Profit'] = orders['Profit'].str.replace(',', '.')
orders['Shipping_Cost'] = orders['Shipping_Cost'].str.replace(',', '.')
orders = orders.astype({
    'Sales': 'float64',
    'Discount': 'float64',
    'Profit': 'float64',
    'Shipping_Cost': 'float64'
})


In [1815]:
for x, data in orders[orders["State"] == "California"].groupby(["State"]):
    print(x, sum(data["Sales"]))

California 457687.6315000017


## Merge Returns

In [1816]:
if "Region" in returns: 
    returns = returns.drop(["Region"], axis=1)
    returns = returns.rename(columns={'f': 'Returned'})
    
#display(returns.head())
orders = pd.merge(orders, returns, on="Order_ID", how="outer").drop_duplicates()
orders['Returned'] = orders['Returned'].fillna(False)
orders['Returned'] = orders['Returned'].replace("Yes", True)
orders = orders.astype({'Returned': 'bool'})
orders.head(1)

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Product_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned
0,24599,IN-2017-CA120551-42816,2017-03-22,2017-03-29,Standard Class,CA-120551,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,FUR-BO-4861,"Ikea Library with Doors, Mobile",Bookcases,Furniture,731.82,2,0.0,102.42,39.66,Medium,True


In [1817]:
for x, data in orders[orders["State"] == "California"].groupby(["State"]):
    print(x, sum(data["Sales"]))

California 457687.6315000017


## Merge People

In [1818]:
# Define the division of Western and Eastern Canada
western_provinces = ['Alberta', 'British Columbia', 'Manitoba', 'Saskatchewan']
eastern_provinces = ['Newfoundland', 'Nova Scotia', 'Ontario', 'Quebec']

# Create a new column "Region" and assign the corresponding region
orders.loc[orders['State'].isin(western_provinces), 'Region'] = 'Western Canada'
orders.loc[orders['State'].isin(eastern_provinces), 'Region'] = 'Eastern Canada'

In [1819]:
orders = pd.merge(orders, people, on="Region", how="outer")
print(orders.shape)
orders.dropna(subset=["Order_ID"], inplace=True)
#orders["People_ID"] = orders.groupby("Person").ngroup() + 1
print(orders.shape)
orders.head(1)  

(51290, 26)
(51290, 26)


Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Product_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person
0,24599,IN-2017-CA120551-42816,2017-03-22,2017-03-29,Standard Class,CA-120551,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,FUR-BO-4861,"Ikea Library with Doors, Mobile",Bookcases,Furniture,731.82,2,0.0,102.42,39.66,Medium,True,Chandrakant Chaudhri


In [1820]:
orders[orders.duplicated(['Row_ID'], keep=False)].sort_values('Row_ID')

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Product_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person


In [1821]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51290 entries, 0 to 51289
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Row_ID          51290 non-null  int64         
 1   Order_ID        51290 non-null  object        
 2   Order_Date      51290 non-null  datetime64[ns]
 3   Ship_Date       51290 non-null  datetime64[ns]
 4   Ship_Mode       51290 non-null  object        
 5   Customer_ID     51290 non-null  object        
 6   Customer_Name   51290 non-null  object        
 7   Segment         51290 non-null  object        
 8   Postal_Code     9994 non-null   float64       
 9   City            51290 non-null  object        
 10  State           51290 non-null  object        
 11  Country         51290 non-null  object        
 12  Region          51290 non-null  object        
 13  Market          51290 non-null  object        
 14  Product_ID      51290 non-null  object        
 15  Pr

In [1822]:
for key in orders.columns:
    #print(key + ":",  orders[key].unique())
    if len(orders[key].unique()) <= 10:
        print(key, orders[key].sort_values().unique())
    else:
        print(key, orders[key].sort_values().unique()[:5], "...", orders[key].sort_values().unique()[-5:])
    #break

Row_ID [1 2 3 4 5] ... [51286 51287 51288 51289 51290]
Order_ID ['AE-2014-PO8865138-41914' 'AE-2016-EB4110138-42657'
 'AE-2016-MY7380138-42735' 'AE-2017-GH4665138-43082'
 'AE-2017-JD5790138-42801'] ... ['ZA-2017-RC9960146-42988' 'ZA-2017-RP9390146-42830'
 'ZA-2017-SM10005146-42972' 'ZA-2017-SW10350146-42792'
 'ZA-2017-TS11205146-42793']
Order_Date ['2014-01-01T00:00:00.000000000' '2014-01-02T00:00:00.000000000'
 '2014-01-03T00:00:00.000000000' '2014-01-04T00:00:00.000000000'
 '2014-01-05T00:00:00.000000000'] ... ['2017-12-27T00:00:00.000000000' '2017-12-28T00:00:00.000000000'
 '2017-12-29T00:00:00.000000000' '2017-12-30T00:00:00.000000000'
 '2017-12-31T00:00:00.000000000']
Ship_Date ['2014-01-03T00:00:00.000000000' '2014-01-05T00:00:00.000000000'
 '2014-01-06T00:00:00.000000000' '2014-01-07T00:00:00.000000000'
 '2014-01-08T00:00:00.000000000'] ... ['2018-01-03T00:00:00.000000000' '2018-01-04T00:00:00.000000000'
 '2018-01-05T00:00:00.000000000' '2018-01-06T00:00:00.000000000'
 '2018-01-

# CustomerID (1)

## Extract RegionID From CustomerID

In [1823]:
import warnings

# Let's remove the last digit if the Country is "United States"
orders['Region_Number'] = orders.apply(lambda row: int(row['Customer_ID'][-1]) if row['Country'] == 'United States' else 0, axis=1)
orders['Customer_ID'] = orders.apply(lambda row: row['Customer_ID'][:-1] if row['Country'] == 'United States' else row['Customer_ID'], axis=1)
display(orders[orders["Country"] == "United States"].head(2))
display(orders[orders["Country"] != "United States"].head(2))
print(orders[orders["Country"] == "United States"]["Region_Number"].value_counts())

if orders[orders["Country"] == "United States"]["Region_Number"].sum() != 47506:
    warnings.warn("The sum() of all RegionIDs should be 47506")


Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Product_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person,Region_Number
41296,33312,CA-2016-AM10705140-42632,2016-09-19,2016-09-24,Standard Class,AM-10705140,Anne McFarland,Consumer,36830.0,Auburn,Alabama,United States,Southern US,USCA,OFF-ST-6289,Tennsco Double-Tier Lockers,Storage,Office Supplies,900.08,4,0.0,117.0104,53.84,Medium,False,Flannery Newton,8
41297,33310,CA-2016-AM10705140-42632,2016-09-19,2016-09-24,Standard Class,AM-10705140,Anne McFarland,Consumer,36830.0,Auburn,Alabama,United States,Southern US,USCA,FUR-CH-5431,Office Star - Professional Matrix Back Chair w...,Chairs,Furniture,350.98,1,0.0,84.2352,17.04,Medium,False,Flannery Newton,8


Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Product_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person,Region_Number
0,24599,IN-2017-CA120551-42816,2017-03-22,2017-03-29,Standard Class,CA-120551,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,FUR-BO-4861,"Ikea Library with Doors, Mobile",Bookcases,Furniture,731.82,2,0.0,102.42,39.66,Medium,True,Chandrakant Chaudhri,0
1,24598,IN-2017-CA120551-42816,2017-03-22,2017-03-29,Standard Class,CA-120551,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,TEC-MA-4211,"Epson Receipt Printer, White",Machines,Technology,346.32,3,0.0,13.77,14.1,Medium,True,Chandrakant Chaudhri,0


4    3203
6    2848
2    2323
8    1620
Name: Region_Number, dtype: int64


# OrderID

In [1824]:
check = True
for date, data in orders.groupby("Order_Date"):
    dates = (data['Order_ID'].str[-5:].drop_duplicates().astype("int").values)
    
    if (len(dates) > 1):
        print(date, dates)
        break

    new_date = datetime.datetime(1899, 12, 30) + datetime.timedelta(days=int(dates[0]))
    if str(date) != str(new_date):
        print(date, "!=", date.strftime('%Y-%m-%m 00:00:00'))
        check = False
        break

if check == True:
    orders['Order_ID'] = orders['Order_ID'].str.replace(r'-(\d{5})$', '', regex=True)


### Delete CustomerID from OrderID

In [1825]:
orders.tail()

Unnamed: 0,Row_ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Product_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person,Region_Number
51285,40095,US-2016-VM21685140,2016-04-07,2016-04-11,Standard Class,VM-21685140,Valerie Mitchum,Home Office,5408.0,Burlington,Vermont,United States,Eastern US,USCA,TEC-PH-5364,Nortel Meridian M5316 Digital phone,Phones,Technology,1294.75,5,0.0,336.635,214.54,High,False,Dolores Davis,6
51286,39193,CA-2017-EH14125140,2017-06-22,2017-06-26,Standard Class,EH-14125140,Eugene Hildebrand,Home Office,26003.0,Wheeling,West Virginia,United States,Eastern US,USCA,OFF-PA-6459,Xerox 1908,Paper,Office Supplies,447.84,8,0.0,219.4416,32.88,Medium,False,Dolores Davis,6
51287,39191,CA-2017-EH14125140,2017-06-22,2017-06-26,Standard Class,EH-14125140,Eugene Hildebrand,Home Office,26003.0,Wheeling,West Virginia,United States,Eastern US,USCA,OFF-BI-4837,Ibico Standard Transparent Covers,Binders,Office Supplies,82.4,5,0.0,40.376,7.22,Medium,False,Dolores Davis,6
51288,39192,CA-2017-EH14125140,2017-06-22,2017-06-26,Standard Class,EH-14125140,Eugene Hildebrand,Home Office,26003.0,Wheeling,West Virginia,United States,Eastern US,USCA,OFF-BI-6634,Zipper Ring Binder Pockets,Binders,Office Supplies,6.24,2,0.0,3.0576,1.49,Medium,False,Dolores Davis,6
51289,40783,CA-2017-NF18385140,2017-10-13,2017-10-13,Same Day,NF-18385140,Natalie Fritzler,Consumer,26003.0,Wheeling,West Virginia,United States,Eastern US,USCA,FUR-TA-3751,"Chromcraft 48"" x 96"" Racetrack Double Pedestal...",Tables,Furniture,673.344,3,0.3,-76.9536,56.9,Medium,True,Dolores Davis,6


In [1826]:
customer_ids = orders['Customer_ID'].str.replace('-', '')
if (orders['Order_ID'].str.endswith(customer_ids)).all():
    for index, row in orders.iterrows():
        customer_id = "-" + row['Customer_ID'].replace('-', '')
        if row['Order_ID'].endswith(customer_id):
            orders.loc[index, 'Order_ID'] = row['Order_ID'][:-len(customer_id)]



In [1827]:
orders['Order_ID'] = orders.apply(lambda row: row['Order_ID'][:-5] if row['Order_ID'].endswith(str(row['Order_Date'].year)) else row['Order_ID'], axis=1)

In [1828]:
orders = orders.rename(columns={'Order_ID': 'Order_TwoLC'})

# Customer ID (2)

## Extract Initials from CustomerID

In [1829]:
# Let's generate the Initials on our own
# We need to take extra care for Names with more than words or just one word  
orders['Initials'] = orders['Customer_Name'].apply(lambda name: str((''.join(word[:1] for word in name.split()) if len(name.split()) > 1 else name)[0:2]))

In [1830]:
# Check if Customer ID contains our generated Initials
filtered_df = orders[orders.apply(lambda row: not row['Customer_ID'].startswith(row['Initials']), axis=1)]
display(filtered_df)

Unnamed: 0,Row_ID,Order_TwoLC,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Product_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person,Region_Number,Initials
26318,33,US,2016-05-24,2016-05-31,Standard Class,SC-2005055,Kai Rey,Home Office,,Tegucigalpa,Francisco Morazán,Honduras,Central America,LATAM,TEC-CO-5998,"Sharp Fax Machine, Digital",Copiers,Technology,941.77824,8,0.402,-601.74176,80.281,Medium,False,Nicodemo Bautista,0,KR
26319,32,US,2016-05-24,2016-05-31,Standard Class,SC-2005055,Kai Rey,Home Office,,Tegucigalpa,Francisco Morazán,Honduras,Central America,LATAM,OFF-AP-3582,"Breville Toaster, Black",Appliances,Office Supplies,152.28,5,0.4,-99.02,16.245,Medium,False,Nicodemo Bautista,0,KR
26320,28,US,2016-05-24,2016-05-31,Standard Class,SC-2005055,Kai Rey,Home Office,,Tegucigalpa,Francisco Morazán,Honduras,Central America,LATAM,OFF-BI-6383,"Wilson Jones Binding Machine, Durable",Binders,Office Supplies,141.288,7,0.4,-73.052,9.465,Medium,False,Nicodemo Bautista,0,KR
26321,30,US,2016-05-24,2016-05-31,Standard Class,SC-2005055,Kai Rey,Home Office,,Tegucigalpa,Francisco Morazán,Honduras,Central America,LATAM,TEC-PH-5262,"Motorola Office Telephone, with Caller ID",Phones,Technology,114.816,4,0.4,-49.824,5.923,Medium,False,Nicodemo Bautista,0,KR
26322,34,US,2016-05-24,2016-05-31,Standard Class,SC-2005055,Kai Rey,Home Office,,Tegucigalpa,Francisco Morazán,Honduras,Central America,LATAM,OFF-AR-6118,"Stanley Pencil Sharpener, Easy-Erase",Art,Office Supplies,10.416,1,0.4,-0.704,1.931,Medium,False,Nicodemo Bautista,0,KR
26323,31,US,2016-05-24,2016-05-31,Standard Class,SC-2005055,Kai Rey,Home Office,,Tegucigalpa,Francisco Morazán,Honduras,Central America,LATAM,OFF-AR-3488,"Binney & Smith Markers, Blue",Art,Office Supplies,19.272,2,0.4,2.872,1.843,Medium,False,Nicodemo Bautista,0,KR
26324,29,US,2016-05-24,2016-05-31,Standard Class,SC-2005055,Kai Rey,Home Office,,Tegucigalpa,Francisco Morazán,Honduras,Central America,LATAM,OFF-FA-6208,"Stockwell Thumb Tacks, Metal",Fasteners,Office Supplies,21.84,4,0.4,-3.28,1.091,Medium,False,Nicodemo Bautista,0,KR


### There are multiple things suspicious with Kai Rey
* The RowIDs are very low
* The Customer_ID contains a SC instead of his initials KR
* There is another Customer with the same Customer ID named "Sample Company A" - That is where the initials SC comes from

In [1831]:
orders.loc[orders['Customer_ID'].isin(filtered_df['Customer_ID']), 'Customer_ID'] = orders['Customer_ID'].str.replace(r'^SC-', 'KR-')

  orders.loc[orders['Customer_ID'].isin(filtered_df['Customer_ID']), 'Customer_ID'] = orders['Customer_ID'].str.replace(r'^SC-', 'KR-')


In [1832]:
orders['Customer_ID'] = orders.apply(lambda row: row['Customer_ID'][3:] if row['Customer_ID'].startswith(row['Initials']) else row['Customer_ID'], axis=1)

## Extract CountyCode from CustomerID

In [1833]:
import re

prev_id = 0
out = ""

matching = []
manual_aliases = {
    "Belize": 82, # Matches Mexico (!)
    "Bhutan": 58, # Matches India (!)
    "Botswana": 117, # Matches South Africa (!)
    "Burkina Faso": 111, # Matches Senegal (!)
    "Costa Rica": 28, # Matches Mexico (!)
    "Cyprus": 64, # Matches Italy (!!!)
    "French Guiana": 18, # Matches Brazil (!)
    "Guyana": 54, # Matches Brazil (!)
    "Kuwait": 110, # Matches Saudi Arabia (!)
    "Laos": 144, # Matches Laos (!)
    "Luxembourg": 48, # Matches Germany (!)
    "Malawi": 87, # Matches Mozambique (!)
    "Oman": 110, # Matches Saudi Arabia (!)
    "Serbia": 19, # Matches Bulgaria (!)
    "Suriname": 18, # Matches Brazil (!)
    "The Gambia": 111, # Matches Senegal (!)

}
manual_matching = {
    "Bahrain": 10,
    "Burundi": 20,
    "Chad": 25,
    "Equatorial Guinea": 40,
    "Eritrea": 41,
    "Greece": 148, # No zf found(!!!!!) / Must be added very late since it has the highest CountryID
    "Guadeloupe": 50,
    "Montenegro": 85,
    "Republic of the Congo": 106,
    "South Sudan": 119,
    "Tajikistan": 128, # [128]
    "Western Sahara": 29, # No clue why western sahara has such a low CountryID
}

for country, data in orders.groupby("Country"):
    zf_found = False
    for i in reversed(range(1,5)):

        ids = (data['Customer_ID'].str[-i:].drop_duplicates().astype("int").values)
        #print(i, prev_id, ids)

        # lets see if we can find the "0" and "5" before each countryID
        if (len(ids) == 2) and (abs(ids[0]-ids[-1]) in [50, 500, 5000]):
            zf_found = True

        if (zf_found == True) & (len(ids) == 1) & (int(ids[0]) not in matching):
            prev_id = int(ids[0])
            #print("Found CountyID for", country, "=", prev_id)
            # TODO matching[country] = prev_id
            matching.append({'Name': country, 'ID': prev_id, 'Alias': 0})
            #display(data[["Row ID", "Customer ID", "Country"]].head())
            orders.loc[orders['Country'] == country, 'Country_ID'] = prev_id
            orders.loc[orders['Country'] == country, 'CountryAlias_ID'] = 0
            orders.loc[orders['Country'] == country, 'Customer_ID'] = orders.loc[orders['Country'] == country, 'Customer_ID'].str[:-i]
            break

    if country in manual_matching:
        #print("Manual CountyID for", country, "=", manual_matching[country])
        # TODO matching[country] = manual_matching[country]
        matching.append({'Name': country, 'ID': prev_id, 'Alias': 0})
        orders.loc[orders['Country'] == country, 'Country_ID'] = prev_id
        orders.loc[orders['Country'] == country, 'CountryAlias_ID'] = 0
        orders.loc[orders['Country'] == country, 'Customer_ID'] = orders.loc[orders['Country'] == country, 'Customer_ID'].str[:-len(str(manual_matching[country]))]
    elif country in manual_aliases:
        #print("Manual AliasID for", country, "=", manual_aliases[country])
        # TODO matching[country] = manual_matching[country]
        matching.append({'Name': country, 'ID': prev_id + 1000, 'Alias': manual_aliases[country]})
        orders.loc[orders['Country'] == country, 'Country_ID'] = prev_id+1000
        orders.loc[orders['Country'] == country, 'CountryAlias_ID'] = manual_aliases[country]
        orders.loc[orders['Country'] == country, 'Customer_ID'] = orders.loc[orders['Country'] == country, 'Customer_ID'].str[:-len(str(manual_aliases[country]))]
    elif zf_found == False:
        print("Coudn't find a CountryID for", country)
        ids = (data['Customer_ID'].str[-4:].drop_duplicates().astype("int").values)
        print(ids, len(ids))
        out = out + "\n" + '"' + country + '": ' + str(prev_id + 1) + ', # ' + ids[0]
print(out)
#display(matching)




In [1834]:
countries = pd.DataFrame(matching)
display(countries.head())

Unnamed: 0,Name,ID,Alias
0,Afghanistan,1,0
1,Albania,2,0
2,Algeria,3,0
3,Angola,4,0
4,Argentina,5,0


In [1835]:
display(orders["Customer_ID"].head())

0    12055
1    12055
2    12055
3    11605
4    11605
Name: Customer_ID, dtype: object

In [1836]:
last_id = 0

# TODO The reason why there a sometimes multiple ids (i.e.:AG-270 and AG-10270) has something to do with the row ID? 
# Somewhere over the ID 40.000 the ID in Customer ID changes by 10.000 

for name, data in orders.groupby("Customer_Name"):
    
    temp_id = last_id + 15

    ids = (data['Customer_ID'].drop_duplicates().astype("int").values)
    ids = ids[~((ids == 88887) | (ids == 88888))]  # Just needed for "Aleksandra Gannaway" and "Denny Joy" in Greece
    
    if (len(ids) == 1) & (temp_id == ids[0]):
        # print("Treffer:", name, len(str(temp_id)), temp_id, ids)
        last_id = temp_id
    elif (len(ids) == 1) & (temp_id + 10000 == ids[0]):
        # print("Treffer:", name, len(str(temp_id)), temp_id, ids)
        last_id = temp_id
    elif name == "Kai Rey":
        # TODO No idea what happened here, but Kai Rey shares the same ID than the stupid Customer Name "Sample Company A"
        # Since Kai Rey also has the Initials "SC" (same as Sample Company A) I guess Kai Rey shoud be renamed "Sample Company A"
        print("WTF:", name, len(str(temp_id)), temp_id, ids)
        continue
    elif (len(ids) == 2) & (abs(ids[0] - ids[-1]) == 10000) & (ids.min() == temp_id):
        # print("Treffer:", name, len(str(temp_id)), temp_id, ids)
        last_id = temp_id
        #break
    else:
        print("Unknown:", name, len(str(temp_id)), temp_id, ids)
    #break
    #for country, data2 in data.group("Country"):


WTF: Kai Rey 4 6225 [20050]


In [1837]:
orders = orders.drop(["Customer_ID"], axis=1)

In [1838]:
orders.head()

Unnamed: 0,Row_ID,Order_TwoLC,Order_Date,Ship_Date,Ship_Mode,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Product_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person,Region_Number,Initials,Country_ID,CountryAlias_ID
0,24599,IN,2017-03-22,2017-03-29,Standard Class,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,FUR-BO-4861,"Ikea Library with Doors, Mobile",Bookcases,Furniture,731.82,2,0.0,102.42,39.66,Medium,True,Chandrakant Chaudhri,0,CA,1.0,0.0
1,24598,IN,2017-03-22,2017-03-29,Standard Class,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,TEC-MA-4211,"Epson Receipt Printer, White",Machines,Technology,346.32,3,0.0,13.77,14.1,Medium,True,Chandrakant Chaudhri,0,CA,1.0,0.0
2,24597,IN,2017-03-22,2017-03-29,Standard Class,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,FUR-FU-5726,"Rubbermaid Door Stop, Erganomic",Furnishings,Furniture,169.68,4,0.0,79.68,11.01,Medium,True,Chandrakant Chaudhri,0,CA,1.0,0.0
3,29465,ID,2015-09-01,2015-09-04,Second Class,Brian Dahlen,Consumer,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,OFF-SU-2988,"Acme Scissors, Easy Grip",Supplies,Office Supplies,243.54,9,0.0,104.49,18.72,Medium,False,Chandrakant Chaudhri,0,BD,1.0,0.0
4,29464,ID,2015-09-01,2015-09-04,Second Class,Brian Dahlen,Consumer,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,OFF-EN-3664,"Cameo Interoffice Envelope, with clear poly wi...",Envelopes,Office Supplies,203.88,4,0.0,24.36,5.72,Medium,False,Chandrakant Chaudhri,0,BD,1.0,0.0


# Split the dataframe

In [1839]:
display(orders.head(5))

Unnamed: 0,Row_ID,Order_TwoLC,Order_Date,Ship_Date,Ship_Mode,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Product_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person,Region_Number,Initials,Country_ID,CountryAlias_ID
0,24599,IN,2017-03-22,2017-03-29,Standard Class,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,FUR-BO-4861,"Ikea Library with Doors, Mobile",Bookcases,Furniture,731.82,2,0.0,102.42,39.66,Medium,True,Chandrakant Chaudhri,0,CA,1.0,0.0
1,24598,IN,2017-03-22,2017-03-29,Standard Class,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,TEC-MA-4211,"Epson Receipt Printer, White",Machines,Technology,346.32,3,0.0,13.77,14.1,Medium,True,Chandrakant Chaudhri,0,CA,1.0,0.0
2,24597,IN,2017-03-22,2017-03-29,Standard Class,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,FUR-FU-5726,"Rubbermaid Door Stop, Erganomic",Furnishings,Furniture,169.68,4,0.0,79.68,11.01,Medium,True,Chandrakant Chaudhri,0,CA,1.0,0.0
3,29465,ID,2015-09-01,2015-09-04,Second Class,Brian Dahlen,Consumer,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,OFF-SU-2988,"Acme Scissors, Easy Grip",Supplies,Office Supplies,243.54,9,0.0,104.49,18.72,Medium,False,Chandrakant Chaudhri,0,BD,1.0,0.0
4,29464,ID,2015-09-01,2015-09-04,Second Class,Brian Dahlen,Consumer,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,OFF-EN-3664,"Cameo Interoffice Envelope, with clear poly wi...",Envelopes,Office Supplies,203.88,4,0.0,24.36,5.72,Medium,False,Chandrakant Chaudhri,0,BD,1.0,0.0


In [1840]:
# Let's try to create a card_ID for every bought product per Customer and per Day
# There are Orders with different destinations - these needs to be separated!
# 10124,US-2014-BT1130518-41643,1/4/2014,1/11/2014,Standard Class,BT-1130518,Beth Thompson,Home Office,,Pilar,Alagoas,Brazil,South America,LATAM,OFF-EN-4912,"Jiffy Interoffice Envelope, Set of 50",Envelopes,Office Supplies,"74,304",6,"0,6","-107,856","7,042",Medium
# 130,MX-2014-BT1130531-41643,1/4/2014,1/11/2014,Standard Class,BT-1130531,Beth Thompson,Home Office,,Manzanillo,Granma,Cuba,Caribbean,LATAM,OFF-EN-4912,"Jiffy Interoffice Envelope, Set of 50",Envelopes,Office Supplies,"185,76",6,0,"3,6","16,394",Medium

orders['Cart_ID'] = orders.groupby(['Order_Date', 'Customer_Name', 'City', 'Order_TwoLC', 'Order_Priority']).ngroup() + 1



In [1841]:
#display(orders[(orders['Customer_ID'] == 595) & (orders['Order_Date'] == "2017-09-24")].head())

In [1842]:
orders['Region_ID'] = orders.groupby("Region").ngroup() + 1

In [1843]:
orders["Customer_ID"] = orders.groupby("Customer_Name").ngroup() + 1

In [1844]:
# Check if the ProductID can be reduced to the ID:
for name, data in orders.groupby("Product_ID"):
    e = name.split(sep="-")
    if ( len(data.Category.unique()) == 1 and data.Category.unique()[0].upper()[0:3] != e[0]) & ( len(data.SubCategory.unique()) == 1 and data.SubCategory.unique()[0].upper()[0:2] != e[1]):
        print(name, e)
        print(data.Category.unique()[0].upper()[0:3])
        print(data.SubCategory.unique()[0].upper()[0:2])
        break
orders['Product_ID'] = orders['Product_ID'].str.rsplit('-', n=1).str[-1]
orders["Product_ID"].head()

0    4861
1    4211
2    5726
3    2988
4    3664
Name: Product_ID, dtype: object

In [1845]:
most_used_product = None
most_used_count = 0

for pid, data in orders.groupby("Product_ID"):
    group_size = len(data)
    if group_size > most_used_count:
        most_used_product = pid
        most_used_count = group_size

print("Most frequently used Product_ID:", most_used_product)
print("Count:", most_used_count)

Most frequently used Product_ID: 6129
Count: 227


In [1846]:
#  * (1-orders["Discount"])
orders["Unit_Price"] = round(orders["Sales"] / orders["Quantity"] / (1-orders["Discount"]), 4)

In [1847]:
# Unfortunately the Column "ProductID" is more a "CategoryID" therefore we create our own ProductID per Unit_Price and "Product Category"
orders = orders.rename(columns={'Product_ID': 'Category_ID'})
orders["Product_ID"] = orders.groupby(["Category_ID", "Unit_Price"]).ngroup() + 1

In [1848]:
orders["City_ID"] = orders.groupby(["Country", "City", "State"]).ngroup() + 1

In [1849]:
display(orders[orders["City"] == "Fairfield"])

Unnamed: 0,Row_ID,Order_TwoLC,Order_Date,Ship_Date,Ship_Mode,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Category_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person,Region_Number,Initials,Country_ID,CountryAlias_ID,Cart_ID,Region_ID,Customer_ID,Unit_Price,Product_ID,City_ID
43269,37041,US,2017-03-29,2017-04-01,First Class,Dave Hallsten,Corporate,94533.0,Fairfield,California,United States,Western US,USCA,5754,SAFCO Boltless Steel Shelving,Storage,Office Supplies,795.48,7,0.0,7.9548,139.58,High,False,Derrick Snyders,4,DH,140.0,0.0,18224,24,205,113.64,4477,3320
43270,37038,US,2017-03-29,2017-04-01,First Class,Dave Hallsten,Corporate,94533.0,Fairfield,California,United States,Western US,USCA,6155,StarTech.com 10/100 VDSL2 Ethernet Extender Kit,Machines,Technology,532.72,2,0.2,53.272,83.24,High,False,Derrick Snyders,4,DH,140.0,0.0,18224,24,205,332.95,5199,3320
43271,37042,US,2017-03-29,2017-04-01,First Class,Dave Hallsten,Corporate,94533.0,Fairfield,California,United States,Western US,USCA,6099,Stackable Trays,Furnishings,Furniture,21.56,7,0.0,6.8992,5.58,High,False,Derrick Snyders,4,DH,140.0,0.0,18224,24,205,3.08,5061,3320
43272,37040,US,2017-03-29,2017-04-01,First Class,Dave Hallsten,Corporate,94533.0,Fairfield,California,United States,Western US,USCA,6534,Xerox 1977,Paper,Office Supplies,20.04,3,0.0,9.6192,5.39,High,False,Derrick Snyders,4,DH,140.0,0.0,18224,24,205,6.68,5748,3320
43273,37037,US,2017-03-29,2017-04-01,First Class,Dave Hallsten,Corporate,94533.0,Fairfield,California,United States,Western US,USCA,3909,DAX Black Cherry Wood-Tone Poster Frame,Furnishings,Furniture,26.48,1,0.0,10.0624,2.2,High,False,Derrick Snyders,4,DH,140.0,0.0,18224,24,205,26.48,1660,3320
43274,37036,US,2017-03-29,2017-04-01,First Class,Dave Hallsten,Corporate,94533.0,Fairfield,California,United States,Western US,USCA,6532,Xerox 1975,Paper,Office Supplies,12.96,2,0.0,6.3504,1.84,High,False,Derrick Snyders,4,DH,140.0,0.0,18224,24,205,6.48,5746,3320
43275,37039,US,2017-03-29,2017-04-01,First Class,Dave Hallsten,Corporate,94533.0,Fairfield,California,United States,Western US,USCA,6524,Xerox 1968,Paper,Office Supplies,26.72,4,0.0,12.8256,1.67,High,False,Derrick Snyders,4,DH,140.0,0.0,18224,24,205,6.68,5738,3320
43276,32594,CA,2017-12-31,2018-01-04,Standard Class,Erica Bern,Corporate,94533.0,Fairfield,California,United States,Western US,USCA,4335,GBC Binding covers,Binders,Office Supplies,20.72,2,0.2,6.475,2.06,Medium,False,Derrick Snyders,4,EB,140.0,0.0,25724,24,265,12.95,2325,3320
43277,32593,CA,2017-12-31,2018-01-04,Standard Class,Erica Bern,Corporate,94533.0,Fairfield,California,United States,Western US,USCA,3741,"Cardinal Slant-D Ring Binder, Heavy Gauge Vinyl",Binders,Office Supplies,13.904,2,0.2,4.5188,1.89,Medium,False,Derrick Snyders,4,EB,140.0,0.0,25724,24,265,8.69,1400,3320
48454,36225,CA,2017-12-02,2017-12-07,Standard Class,Alan Dominguez,Home Office,6824.0,Fairfield,Connecticut,United States,Eastern US,USCA,4397,Global Comet Stacking Armless Chair,Chairs,Furniture,897.15,3,0.0,251.202,58.41,Medium,False,Dolores Davis,6,AD,140.0,0.0,24696,10,12,299.05,2387,3321


In [1850]:
orders.head(5)

Unnamed: 0,Row_ID,Order_TwoLC,Order_Date,Ship_Date,Ship_Mode,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Category_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person,Region_Number,Initials,Country_ID,CountryAlias_ID,Cart_ID,Region_ID,Customer_ID,Unit_Price,Product_ID,City_ID
0,24599,IN,2017-03-22,2017-03-29,Standard Class,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,4861,"Ikea Library with Doors, Mobile",Bookcases,Furniture,731.82,2,0.0,102.42,39.66,Medium,True,Chandrakant Chaudhri,0,CA,1.0,0.0,18106,17,137,365.91,3155,1
1,24598,IN,2017-03-22,2017-03-29,Standard Class,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,4211,"Epson Receipt Printer, White",Machines,Technology,346.32,3,0.0,13.77,14.1,Medium,True,Chandrakant Chaudhri,0,CA,1.0,0.0,18106,17,137,115.44,2151,1
2,24597,IN,2017-03-22,2017-03-29,Standard Class,Cathy Armstrong,Home Office,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,5726,"Rubbermaid Door Stop, Erganomic",Furnishings,Furniture,169.68,4,0.0,79.68,11.01,Medium,True,Chandrakant Chaudhri,0,CA,1.0,0.0,18106,17,137,42.42,4424,1
3,29465,ID,2015-09-01,2015-09-04,Second Class,Brian Dahlen,Consumer,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,2988,"Acme Scissors, Easy Grip",Supplies,Office Supplies,243.54,9,0.0,104.49,18.72,Medium,False,Chandrakant Chaudhri,0,BD,1.0,0.0,7505,17,107,27.06,205,1
4,29464,ID,2015-09-01,2015-09-04,Second Class,Brian Dahlen,Consumer,,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,3664,"Cameo Interoffice Envelope, with clear poly wi...",Envelopes,Office Supplies,203.88,4,0.0,24.36,5.72,Medium,False,Chandrakant Chaudhri,0,BD,1.0,0.0,7505,17,107,50.97,1263,1


In [1851]:
orders['Postal_Code'].fillna(0, inplace=True)
orders["Address_ID"] = orders.groupby(["Postal_Code", "City", "State", "Country", "Region", "Market"]).ngroup() + 1

In [1852]:
orders[["Postal_Code", "City", "State", "Country", "Region", "Market", "Address_ID"]].drop_duplicates().head()

Unnamed: 0,Postal_Code,City,State,Country,Region,Market,Address_ID
0,0.0,Herat,Hirat,Afghanistan,Southern Asia,Asia Pacific,1205
5,0.0,Kabul,Kabul,Afghanistan,Southern Asia,Asia Pacific,1396
46,0.0,Kandahar,Kandahar,Afghanistan,Southern Asia,Asia Pacific,1419
50,0.0,Jalalabad,Nangarhar,Afghanistan,Southern Asia,Asia Pacific,1334
55,0.0,Chittagong,Chittagong,Bangladesh,Southern Asia,Asia Pacific,657


# Splitting

In [1853]:
list(orders.Market.drop_duplicates().values)
#orders.dtypes

['Asia Pacific', 'Europe', 'Africa', 'LATAM', 'USCA']

In [1854]:
# TODO We could just use city ID and then outsource State, Country, Region and Market into another tables
db = {
    "Purchase": {
        "groupby": {
            'Customer_ID': { "NOT NULL": True }, 
            'Order_Date': { "NOT NULL": True, },
            'Cart_ID': { "NOT NULL": True, 'PRIMARY KEY': True },
        },
        "fields": {
            'Address_ID': { "NOT NULL": True },
            'Order_TwoLC': { "NOT NULL": True },
            'Ship_Date': { 'CHECK': "(Ship_Date >= Order_Date)"},
            'Ship_Mode': { 'CHECK': "(Ship_Mode IN ('Standard Class', 'Second Class', 'First Class', 'Same Day'))" } , 
            'Order_Priority': { 'CHECK': "(Order_Priority IN ('Medium', 'High', 'Critical', 'Low'))" },
            'Returned': { 'DEFAULT': 'False' }
        }
    },
    "Address": {
        "groupby": {
            'Address_ID': { "NOT NULL": True, 'PRIMARY KEY': True }
        },
        # Postal_Code	City	State	Country	Region	Market  Country_ID CountryAlias_ID Region_ID
        "fields": {
            'Postal_Code': { "NOT NULL": False },
            'City_ID': { "NOT NULL": True },
        }
    },
    "City": {
        "groupby": {
            'City_ID': { "NOT NULL": True, 'PRIMARY KEY': True }
        },
        "fields": {
            'City': { "NOT NULL": True },
            'State': { "NOT NULL": True },
            'Country_ID': { "NOT NULL": True},
            'Region_ID': { "DEFAULT": "0", "NOT NULL": True }
        }
    },
    "Country": {
        "groupby": {
            'Country_ID': { "NOT NULL": True, 'PRIMARY KEY': True }
        },
        "fields": {
            'Country': { "NOT NULL": True },
            'Market': { "CHECK": "(Market IN ('Asia Pacific', 'Europe', 'Africa', 'LATAM', 'USCA'))" },
            'CountryAlias_ID': { "DEFAULT": "0", "NOT NULL": True },
        }
    },
    "Region": {
        "groupby": {
            'Region_ID': { "NOT NULL": True, 'PRIMARY KEY': True }
        },
        "fields": {
            'Region': { "RENAME": "Name", "NOT NULL": True },
            'Region_Number': { "DEFAULT": "0", "NOT NULL": True},
            'Person': { "NOT NULL": True }
        }
    },
    "Customer": {
        "groupby": {
            'Customer_ID': { "NOT NULL": True, 'PRIMARY KEY': True }
        },
        "fields": {
            'Customer_Name': { "NOT NULL": True },
            'Segment': { 'CHECK': "(Segment IN ('Consumer', 'Corporate', 'Home Office'))" },
            'Initials': {}
        }
    },
    "Cart": {
        "groupby": {
            'Cart_ID': { "NOT NULL": True },
            'Row_ID': { "NOT NULL": True, 'PRIMARY KEY': True }
        },
        "fields": {
            'Product_ID': { "NOT NULL": True },
            'Quantity': { "NOT NULL": True },
            'Sales': { "NOT NULL": True },
            'Discount': { "DEFAULT": "0" , "NOT NULL": True },
            'Profit': { "NOT NULL": True },
            'Shipping_Cost': { "NOT NULL": True },
        }
    },
    "Product": {
        "groupby": {
            'Product_ID': { "NOT NULL": True, 'PRIMARY KEY': True }
        },
        "fields": {
            'Product_Name': { "NOT NULL": True },
            'Category_ID': { "NOT NULL": True },
            'Unit_Price': { "NOT NULL": True },
        }
    },
    "Category": {
        "groupby": {
            'Category_ID': { "NOT NULL": True, 'PRIMARY KEY': True }
        },
        "fields": {
            'Category': { "NOT NULL": True, 'CHECK': "(Category IN ('Office Supplies', 'Technology', 'Furniture'))" },
            'SubCategory': { "NOT NULL": True, 'CHECK': "(SubCategory in ('Bookcases', 'Supplies', 'Machines', 'Furnishings', 'Envelopes', 'Tables', 'Chairs', 'Phones', 'Appliances', 'Copiers', 'Storage', 'Paper', 'Accessories', 'Art', 'Binders', 'Labels', 'Fasteners'))" }
        }
    }
}

In [1855]:
check = {column: 0 for column in orders.columns.to_series().to_dict()}
references = {}
for table, data in db.items():
    for t, data in db[table].items():
        if (t == "REF"):
            continue
        elif (t != "groupby" and t != "fields"):
            print(f"Unknown type {t}")
            break
        for column, data in db[table][t].items():

            if "_ID" in column:
                if column in references:
                    print(f"Found Ref for {column} in table {table} to {references[column]}")
                    db[table][t][column]["RENAME"] = "ID"
                    if "REF" not in db[references[column]]:
                        db[references[column]]['REF'] = []
                    db[references[column]]['REF'].append(f"FOREIGN KEY ({column}) REFERENCES {table}(ID)")
                else:
                    references[column] = table
                    check[column] += 1
            else:
                check[column] += 1

for key in check.keys():
    if check[key] == 0:
        print(f"Column {key} wasn't used")
    elif check[key] > 1:
        print(f"Column {key} was used {check[key]} times")

Found Ref for Address_ID in table Address to Purchase
Found Ref for City_ID in table City to Address
Found Ref for Country_ID in table Country to City
Found Ref for Region_ID in table Region to City
Found Ref for Customer_ID in table Customer to Purchase
Found Ref for Cart_ID in table Cart to Purchase
Found Ref for Product_ID in table Product to Cart
Found Ref for Category_ID in table Category to Product


In [1856]:
product = orders.groupby(['Product_ID']).agg({
    'Product_Name': 'first',
    'Category_ID': 'first',
    'Unit_Price': 'first'
}).reset_index()

In [1857]:
# TODO We might want to get the max length of TEXT fields and limit them to that size with "VARCHAR(X)"

datatypes = {
    "datetime64[ns]": "DATE",
    "int64": "INT",
    "float64": "INT",
    "object": "TEXT"
}

def createTable(name):
    if name not in db:
        return ""
    
    fields = []
    for t in db[name].keys():
        if t != "REF":

            for key in db[name][t]:
                datatype = str(orders[key].dtype)

                if "_ID" in key:
                    datatype = "INT"
                elif key == "Returned":
                    datatype = "BOOL"
                elif datatype in datatypes.keys():
                    datatype = datatypes[datatype]
                else:
                    print(f"Unknown datatype {datatype} in {datatypes.keys()}")
                    datatype = "UNKNOWN"

                key2 = key
                if "RENAME" in db[name][t][key]:
                    key2 = db[name][t][key]["RENAME"]

                f = "\n\t" + key2 + " " + datatype
                
                for constraint in db[name][t][key]:
                    if constraint == "RENAME":
                        continue
                    elif constraint == "CHECK":    
                        f = f + " CHECK " + db[name][t][key][constraint]
                    elif constraint == "DEFAULT":
                        f = f + " DEFAULT " + db[name][t][key][constraint]
                    elif db[name][t][key][constraint] == True:
                        f = f + " " + constraint
                fields.append(f)
    
    sql = "CREATE TABLE " + name + " (" + ",".join(fields)
    if "REF" in db[name].keys():
        sql = sql + ",\n\n\t" + ",\n\t".join(db[name]["REF"])
    sql = sql + "\n)"
    return sql


def createData(name):
    if name not in db:
        return ""
    
    d = {}
    for k in db[name]["fields"]:
        d[k] = 'first'
    data = orders.groupby(list(db[name]["groupby"].keys())).agg(d).reset_index()

    for k in db[name]["fields"]:
        if "RENAME" in db[name]["fields"][k]:
            data = data.rename(columns={k: db[name]["fields"][k]["RENAME"]})

    for k in db[name]["groupby"]:
        if "RENAME" in db[name]["groupby"][k]:
            data = data.rename(columns={k: db[name]["groupby"][k]["RENAME"]})            

    return data
    

In [1858]:
db.keys()

dict_keys(['Purchase', 'Address', 'City', 'Country', 'Region', 'Customer', 'Cart', 'Product', 'Category'])

In [1859]:
display(orders[orders.Cart_ID == 22557])

Unnamed: 0,Row_ID,Order_TwoLC,Order_Date,Ship_Date,Ship_Mode,Customer_Name,Segment,Postal_Code,City,State,Country,Region,Market,Category_ID,Product_Name,SubCategory,Category,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Returned,Person,Region_Number,Initials,Country_ID,CountryAlias_ID,Cart_ID,Region_ID,Customer_ID,Unit_Price,Product_ID,City_ID,Address_ID
29731,14,MX,2017-09-24,2017-10-01,Standard Class,Paul Knutson,Home Office,0.0,Managua,Managua,Nicaragua,Central America,LATAM,5034,"Kraft Mailers, Security-Tint",Envelopes,Office Supplies,80.1,3,0.0,37.62,8.863,Low,False,Nicodemo Bautista,0,PK,93.0,0.0,22557,3,595,26.7,3440,2378,1777


In [1860]:
for x, data in orders[orders["State"] == "California"].groupby(["State"]):
    print(x, data.shape)

California (2001, 36)


# Import

In [1861]:
for name in reversed(db.keys()):
    print(name)

    sql = createTable(name)
    print(sql)
    #continue
    cur.execute("DROP TABLE IF EXISTS " + name)
    cur.execute(sql)

    data = createData(name)
    display(data.tail(1))
    data.to_sql(name, con, if_exists='replace', index=False)

Category
CREATE TABLE Category (
	ID INT NOT NULL PRIMARY KEY,
	Category TEXT NOT NULL CHECK (Category IN ('Office Supplies', 'Technology', 'Furniture')),
	SubCategory TEXT NOT NULL CHECK (SubCategory in ('Bookcases', 'Supplies', 'Machines', 'Furnishings', 'Envelopes', 'Tables', 'Chairs', 'Phones', 'Appliances', 'Copiers', 'Storage', 'Paper', 'Accessories', 'Art', 'Binders', 'Labels', 'Fasteners'))
)


Unnamed: 0,ID,Category,SubCategory
3787,6634,Office Supplies,Binders


Product
CREATE TABLE Product (
	ID INT NOT NULL PRIMARY KEY,
	Product_Name TEXT NOT NULL,
	Category_ID INT NOT NULL,
	Unit_Price INT NOT NULL,

	FOREIGN KEY (Category_ID) REFERENCES Category(ID)
)


Unnamed: 0,ID,Product_Name,Category_ID,Unit_Price
5871,5872,Zipper Ring Binder Pockets,6634,3.12


Cart
CREATE TABLE Cart (
	ID INT NOT NULL,
	Row_ID INT NOT NULL PRIMARY KEY,
	Product_ID INT NOT NULL,
	Quantity INT NOT NULL,
	Sales INT NOT NULL,
	Discount INT DEFAULT 0 NOT NULL,
	Profit INT NOT NULL,
	Shipping_Cost INT NOT NULL,

	FOREIGN KEY (Product_ID) REFERENCES Product(ID)
)


Unnamed: 0,ID,Row_ID,Product_ID,Quantity,Sales,Discount,Profit,Shipping_Cost
51289,25754,41744,5421,4,173.76,0.6,-117.36,13.72


Customer
CREATE TABLE Customer (
	ID INT NOT NULL PRIMARY KEY,
	Customer_Name TEXT NOT NULL,
	Segment TEXT CHECK (Segment IN ('Consumer', 'Corporate', 'Home Office')),
	Initials TEXT
)


Unnamed: 0,ID,Customer_Name,Segment,Initials
795,796,Zuschuss Donatelli,Consumer,ZD


Region
CREATE TABLE Region (
	ID INT NOT NULL PRIMARY KEY,
	Name TEXT NOT NULL,
	Region_Number INT DEFAULT 0 NOT NULL,
	Person TEXT NOT NULL
)


Unnamed: 0,ID,Name,Region_Number,Person
23,24,Western US,4,Derrick Snyders


Country
CREATE TABLE Country (
	ID INT NOT NULL PRIMARY KEY,
	Country TEXT NOT NULL,
	Market TEXT CHECK (Market IN ('Asia Pacific', 'Europe', 'Africa', 'LATAM', 'USCA')),
	CountryAlias_ID INT DEFAULT 0 NOT NULL
)


Unnamed: 0,ID,Country,Market,CountryAlias_ID
152,1130.0,The Gambia,Africa,111.0


City
CREATE TABLE City (
	ID INT NOT NULL PRIMARY KEY,
	City TEXT NOT NULL,
	State TEXT NOT NULL,
	Country_ID INT NOT NULL,
	Region_ID INT DEFAULT 0 NOT NULL,

	FOREIGN KEY (Country_ID) REFERENCES Country(ID),
	FOREIGN KEY (Region_ID) REFERENCES Region(ID)
)


Unnamed: 0,ID,City,State,Country_ID,Region_ID
3827,3828,Victoria Falls,Matabeleland North,147.0,6


Address
CREATE TABLE Address (
	ID INT NOT NULL PRIMARY KEY,
	Postal_Code INT,
	City_ID INT NOT NULL,

	FOREIGN KEY (City_ID) REFERENCES City(ID)
)


Unnamed: 0,ID,Postal_Code,City_ID
3855,3856,99301.0,3570


Purchase
CREATE TABLE Purchase (
	Customer_ID INT NOT NULL,
	Order_Date DATE NOT NULL,
	Cart_ID INT NOT NULL PRIMARY KEY,
	Address_ID INT NOT NULL,
	Order_TwoLC TEXT NOT NULL,
	Ship_Date DATE CHECK (Ship_Date >= Order_Date),
	Ship_Mode TEXT CHECK (Ship_Mode IN ('Standard Class', 'Second Class', 'First Class', 'Same Day')),
	Order_Priority TEXT CHECK (Order_Priority IN ('Medium', 'High', 'Critical', 'Low')),
	Returned BOOL DEFAULT False,

	FOREIGN KEY (Address_ID) REFERENCES Address(ID),
	FOREIGN KEY (Customer_ID) REFERENCES Customer(ID),
	FOREIGN KEY (Cart_ID) REFERENCES Cart(ID)
)


Unnamed: 0,Customer_ID,Order_Date,Cart_ID,Address_ID,Order_TwoLC,Ship_Date,Ship_Mode,Order_Priority,Returned
25753,796,2017-12-30,25718,2422,ES,2018-01-04,Standard Class,Medium,False


# DB Disconnect

In [1862]:
cur.close()
con.close()