In [30]:
#Setup
import pandas as pd
import altair as alt
from pathlib import Path
import numpy as np
import statsmodels.api as sm
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [31]:
parent_path = str(Path().resolve().parent.parent) + "/"
data_path = "data/"
processed_path = "processed/"
processed_data_file = "processed_data.csv"

In [32]:
df = pd.read_csv(parent_path + data_path + processed_path + processed_data_file)

### Käufe vs Rückgaben 

In [33]:
purchases = df[~df['InvoiceNo'].str.contains('C')]
returns = df[df['InvoiceNo'].str.contains('C')]
purchases_count = purchases.shape[0]
returns_count = returns.shape[0]
# Create a dataframe with the counts
data = pd.DataFrame({'Category': ['Purchases', 'Returns'], 'Count': [purchases_count, returns_count]})
# Create the chart
chart = alt.Chart(data).mark_bar().encode(
    x='Category',
    y='Count'
).configure_view(
    strokeWidth=0
).configure_title(
    fontSize=22,
    font="Arial",
    color="black",
    anchor="start"
).configure_axis(
    labelFont="Arial",
    titleFont="Arial",
    labelFontSize=14,
    titleFontSize=18
)
chart

### Kunden mit den meisten Einkäufen und Rückgaben

In [34]:

df2 = df[df['InvoiceNo'].str.contains('C') == False].groupby('CustomerID')['InvoiceNo'].count().reset_index(name='count')
df2['canceled'] = df[df['InvoiceNo'].str.contains('C') == True].groupby('CustomerID')['InvoiceNo'].count().reset_index(name='count')['count']
df2
df2['CustomerID'] = df2['CustomerID'].astype(int).astype(str)


In [35]:

#Top 10 canceled orders
alt.Chart(df2.sort_values('canceled', ascending=False).head(10)).mark_bar().encode(
    x='canceled',
    y=alt.Y('CustomerID', sort='-x'),
    
    
).configure_view(
    strokeWidth=0
).configure_title(
    fontSize=22,
    font="Arial",
    color="black",
    anchor="start"
).configure_axis(
    labelFont="Arial",
    titleFont="Arial",
    labelFontSize=14,
    titleFontSize=18
)

In [36]:
#Top 10 orders
alt.Chart(df2.sort_values('count', ascending=False).head(10)).mark_bar().encode(
    x='count',
    y=alt.Y('CustomerID', sort='-x'),  
).configure_view(
    strokeWidth=0
).configure_title(
    fontSize=22,
    font="Arial",
    color="black",
    anchor="start"
).configure_axis(
    labelFont="Arial",
    titleFont="Arial",
    labelFontSize=14,
    titleFontSize=18
)

###  NUR Kunden die Einkäufe und Rückaben getätigt haben

In [37]:
#drop nan
df3 = df2.dropna()

In [38]:
alt.Chart(df3).mark_circle().encode(
    x='count',
    y='canceled',
    tooltip='CustomerID'
).configure_view(
    strokeWidth=0
).configure_title(
    fontSize=22,
    font="Arial",
    color="black",
    anchor="start"
).configure_axis(
    labelFont="Arial",
    titleFont="Arial",
    labelFontSize=14,
    titleFontSize=18
)


In [39]:
chart = alt.Chart(df3).mark_circle().encode(
    x='count',
    y='canceled',
    tooltip='CustomerID'
).interactive()

# Filter out data points with count values greater than 1000 or canceled values greater than 20
chart = chart.transform_filter(
    (alt.datum.count < 2000) & (alt.datum.canceled < 80)
).configure_view(
    strokeWidth=0
).configure_title(
    fontSize=22,
    font="Arial",
    color="black",
    anchor="start"
).configure_axis(
    labelFont="Arial",
    titleFont="Arial",
    labelFontSize=14,
    titleFontSize=18
)

chart

In [40]:
chart = alt.Chart(df3).mark_circle().encode(
    x='count',
    y='canceled',
    tooltip='CustomerID'
).interactive()

# Filter out data points with count values greater than 1000 or canceled values greater than 20
chart = chart.transform_filter(
    (alt.datum.count < 800) & (alt.datum.canceled < 50)
).configure_view(
    strokeWidth=0
).configure_title(
    fontSize=22,
    font="Arial",
    color="black",
    anchor="start"
).configure_axis(
    labelFont="Arial",
    titleFont="Arial",
    labelFontSize=14,
    titleFontSize=18
)

chart

## Alt nicht löschen

In [73]:
# group the dataframe by CustomerID and count the number of purchases and returns
data = df.groupby(['CustomerID', 'InvoiceNo']).size().reset_index(name='Count')

# Create a new column 'Category' with 'Purchases' or 'Returns' based on whether the InvoiceNo contains 'C'
data['Category'] = np.where(data['InvoiceNo'].str.contains('C'), 'Returns', 'Purchases')
# CustomerID as a string without decimals
data['CustomerID'] = data['CustomerID'].astype(int).astype(str)

data

Unnamed: 0,CustomerID,InvoiceNo,Count,Category
0,12346,541431,1,Purchases
1,12346,C541433,1,Returns
2,12347,537626,31,Purchases
3,12347,542237,29,Purchases
4,12347,549222,24,Purchases
...,...,...,...,...
21786,18283,579673,52,Purchases
21787,18283,580872,50,Purchases
21788,18287,554065,29,Purchases
21789,18287,570715,38,Purchases


In [88]:

# Create the chart sorted by top 10 returns
alt.Chart(data[data['Category'] == 'Returns'].groupby('CustomerID')['Count'].sum().reset_index(name='Count').sort_values('Count', ascending=False).head(10)).mark_bar().encode(
    x='Count',
    y=alt.Y('CustomerID', sort='-x'),

)


In [47]:
base = alt.Chart(df2.sort_values('count', ascending=False).head(10)).encode(
    x=alt.X("CustomerID:N", title = "Customer ID"),
)
bar1 = base.mark_bar().encode(
    y=alt.Y("count", title = "count"),
    color=alt.Color("count", legend=None)
)
bar2 = base.mark_bar().encode(
    y=alt.Y("canceled", title = "canceled"),
    color=alt.Color("canceled", legend=None)
)

alt.layer(
    alt.repeat(column=["count", "canceled"]),
    data=df
).resolve_scale(
    y="independent"
).resolve_legend(
    color="independent"
)

TypeError: repeat() got an unexpected keyword argument 'column'