In [None]:
# Install the third-party packages this notebook relies on: pandas is imported
# below, and matplotlib is needed by the pandas `.plot(...)` calls further down.
%pip install pandas
%pip install matplotlib

In [None]:
import pandas as pd

In [None]:
# Load the order export.
# Prefer the original export when it is available locally; otherwise fall back
# to the anonymized export. `isOriginalDataUsed` records which file was loaded
# so that later cells can write their output next to the matching input.
try:
    df = pd.read_csv('../data/original/witrina_all_orders_original.csv')
    isOriginalDataUsed = True
except FileNotFoundError:
    # Only a missing original file triggers the fallback. Any other failure
    # (malformed CSV, permission problem, interrupted kernel, ...) is allowed to
    # surface instead of silently switching data sets.
    df = pd.read_csv('../data/anonymous/witrina_all_orders_anonymized.csv')
    isOriginalDataUsed = False

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Remove leading/trailing whitespace from every column label.
df.columns = df.columns.map(str.strip)

In [None]:
# Remove the columns that are not needed for the rest of the notebook.
unused_columns = [
    'Password',
    'Billing Address 2',
    'Order note',
    'Order notes',
    'Order number',
    'Weglot Language',
    'Modified Date',
    'Stripe Charge Captured',
]
df = df.drop(columns=unused_columns)

In [None]:
# Give the payment column a shorter name.
df = df.rename(columns={'Payment Method Title': 'Payment Method'})

# This cell expects two columns that are both labelled "Shipping Method Title".
# Number them in left-to-right order: each pass renames the first column that
# still carries the old label.
for numbered_name in ('Shipping Method 1', 'Shipping Method 2'):
    column_names = list(df.columns)
    column_names[column_names.index('Shipping Method Title')] = numbered_name
    df.columns = column_names


In [None]:
# Inspect the last 40 rows whose Billing Country is "HR".
billed_in_croatia = df['Billing Country'] == "HR"
df.loc[billed_in_croatia].tail(40)

In [None]:
# Setup for the HRK -> EUR conversion.
# Rate expressed as HRK per 1 EUR (amounts are divided by it further down).
exchange_rate = 7.5345
# Rows treated as HRK-denominated: Billing Country is "HR" and the Date value
# compares less than or equal to "2022-08-12".
is_croatian_order = df['Billing Country'] == "HR"
is_up_to_cutoff = df['Date'] <= "2022-08-12"
condition = is_croatian_order & is_up_to_cutoff

In [None]:
# For HR orders with Date <= "2022-08-12", write the amount divided by the
# exchange rate (rounded to 2 decimals) into a new "<column> (EUR)" column.
# Rows outside the mask are not assigned here.
# NOTE(review): Date is compared against a string literal; if the column holds
# timestamps with a time part, rows dated 2022-08-12 itself may not match —
# confirm the column format.
hrk_rows = (df['Billing Country'] == "HR") & (df['Date'] <= "2022-08-12")
for amount_column in ('Total', 'Order Total', 'Subtotal'):
    converted = (df[amount_column] / exchange_rate).round(2)
    df.loc[hrk_rows, amount_column + ' (EUR)'] = converted

In [None]:
# Rows that were not converted above still hold NaN in the "(EUR)" columns:
# carry over the value from the corresponding original column unchanged.
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`. The in-place call operates on a column
# selection (chained assignment), which emits a FutureWarning in pandas 2.x and
# does not update `df` when Copy-on-Write is enabled.
for amount_column in ('Total', 'Order Total', 'Subtotal'):
    eur_column = f'{amount_column} (EUR)'
    df[eur_column] = df[eur_column].fillna(df[amount_column])

In [None]:
# The "(EUR)" columns replace the original amount columns, so remove all three originals.
original_amount_columns = ['Total', 'Order Total', 'Subtotal']
df = df.drop(columns=original_amount_columns)

In [None]:
# Inspect the last 20 "HR" rows after the conversion.
billed_in_croatia = df['Billing Country'] == 'HR'
df.loc[billed_in_croatia].tail(20)

In [None]:
# TODO: Decide whether the other monetary columns also need to be converted from HRK to EUR.

In [None]:
# Replace missing Order Shipping values with 0.
shipping_cost = df['Order Shipping']
df['Order Shipping'] = shipping_cost.mask(shipping_cost.isna(), 0)

In [None]:
# Flag the orders that reach a free-shipping threshold:
#   - Billing Country "HR":        Total (EUR) of at least 52
#   - any other Billing Country:   Total (EUR) of at least 290
total_eur = df['Total (EUR)']
billed_in_croatia = df['Billing Country'] == "HR"
qualifies_domestic = billed_in_croatia & (total_eur >= 52)
qualifies_abroad = ~billed_in_croatia & (total_eur >= 290)
df['Free Shipping'] = qualifies_domestic | qualifies_abroad
# Show the first row that is flagged for free shipping but still has a
# positive Order Shipping amount.
df.loc[df['Free Shipping'] & (df['Order Shipping'] > 0)].head(1)

In [None]:
# Normalize Payment Method: any value containing one of the search terms
# (case-insensitive) is replaced by a single canonical label. The replacements
# run in the listed order; missing values never match.
payment_label_rules = [
    ('PayPal', 'PayPal'),
    ('Stripe Safe Card', 'Stripe Safe Card'),
    ('bank transfer', 'Bank Transfer'),
]
for search_term, canonical_label in payment_label_rules:
    matches = df['Payment Method'].str.contains(search_term, case=False, na=False)
    df.loc[matches, 'Payment Method'] = canonical_label

In [None]:
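# NOTE: This cell is disabled. 'Stripe Charge Captured' is dropped earlier in this
# notebook, so the code below cannot run unless that drop is removed first.
# # Encode the Stripe Charge Captured column as 1 (Yes) / 0 (No)
# df['Stripe Charge Captured'] = df['Stripe Charge Captured'].replace(['Yes', 'No'], [1, 0])
# # # Convert that column to a numeric dtype
# # (NOTE: `.cast(pl.Int32)` is not a pandas call; `.astype(...)` would be the pandas way)
# # df['Stripe Charge Captured'] = df['Stripe Charge Captured'].cast(pl.Int32)
# df.head()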


In [None]:
# Combine the two numbered shipping-method columns into one: take
# "Shipping Method 1" and fall back to "Shipping Method 2" where it is missing.
primary_method = df['Shipping Method 1']
fallback_method = df['Shipping Method 2']
df['Shipping Method'] = primary_method.where(primary_method.notna(), fallback_method)
# The two source columns are no longer needed once merged.
df = df.drop(columns=['Shipping Method 1', 'Shipping Method 2'])


In [None]:
# Build "Billing Full Name" as "<first name> <last name>"; the result is
# missing wherever either part is missing.
first_names = df['Billing First Name']
last_names = df['Billing Last Name']
df['Billing Full Name'] = first_names.str.cat(last_names, sep=' ')

In [None]:
# The separate billing name columns are redundant now that the full name exists.
name_part_columns = ['Billing First Name', 'Billing Last Name']
df = df.drop(columns=name_part_columns)

In [None]:
# Pie chart: share of orders per shipping method.
shipping_method_counts = df['Shipping Method'].value_counts()
shipping_method_counts.plot.pie(
    figsize=(10, 10),
    autopct='%1.1f%%',
    startangle=90,
    title='Shipping Methods',
)


In [None]:
# Pie chart: share of orders per payment method.
payment_method_counts = df['Payment Method'].value_counts()
payment_method_counts.plot.pie(
    figsize=(10, 10),
    autopct='%1.1f%%',
    startangle=90,
    title='Payment Methods',
)

Save rows with Status "wc-completed"

In [None]:
# Keep only the rows whose Status is "wc-completed".
is_completed = df['Status'] == "wc-completed"
completed_orders = df[is_completed]

# Write the result next to the input that was loaded (original vs. anonymized),
# without the index column.
output_path = (
    '../data/original/completed_orders.csv'
    if isOriginalDataUsed
    else '../data/anonymous/completed_orders.csv'
)
completed_orders.to_csv(output_path, index=False)

In [None]:
# Last 20 completed orders shipped to "HR" that have a payment method recorded.
shipped_to_croatia = completed_orders['Shipping Country'] == "HR"
has_payment_method = completed_orders['Payment Method'].notna()
completed_orders[shipped_to_croatia & has_payment_method].tail(20)

In [None]:
# TODO: Compare the orders with the remaining statuses against the "wc-completed" orders.
