In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw training table; the file is semicolon-delimited.
train_df = pd.read_csv(filepath_or_buffer="../datasets/train-data.csv", sep=";")

In [None]:
# Keep only the first occurrence of every fully identical row,
# then preview the first five rows of the de-duplicated table.
train_df = train_df.drop_duplicates(keep="first")
train_df.head()

In [None]:
# 'Site' is where a product is produced; 'Country' is where it is sold.
# Count the distinct (non-missing) site identifiers.
unique_sites_count = len(train_df['Site'].dropna().unique())
print("Number of unique rows for the 'Site' column:", unique_sites_count)


## Graph
Nodes represent countries, and each edge shows the number of products sold from one country to another.

In [None]:
# Load the country lookup table (read with pandas' default comma delimiter).
# Later cells rely on its 'Code' and 'Name' columns.
country_code_df = pd.read_csv(filepath_or_buffer="../datasets/country_codes.txt")

# Function to extract country code from Site column
def extract_country_code(site):
    """Return the country code encoded as the prefix of a site identifier.

    The code is the text before the first underscore of ``site``. The
    prefix ``'OOS'`` is the one exceptional case and is reported as ``'US'``.

    Parameters
    ----------
    site : str or missing value
        Site identifier such as ``'FR_12'``.

    Returns
    -------
    str or the input itself
        The country code. Non-string inputs (``None``, ``NaN``) are returned
        unchanged so that missing sites stay missing instead of raising
        ``AttributeError`` inside ``Series.apply``.
    """
    if not isinstance(site, str):
        return site

    # Only the part before the first underscore matters, so split once.
    prefix = site.split('_', 1)[0]
    return 'US' if prefix == 'OOS' else prefix  # 'OOS' is the only exceptional case

# Add a new column 'Site Country' holding the country code parsed from each site id
train_df['Site Country'] = train_df['Site'].apply(extract_country_code)

# Left-join the lookup table so every training row is kept and gains the
# matching 'Code' / 'Name' columns (both missing when the code is unknown)
train_df = pd.merge(train_df, country_code_df, left_on='Site Country', right_on='Code', how='left')

# Fall back to the raw code when the lookup has no name for it, the same
# fallback the notebook applies to the destination 'Country' column.
# Without it, unmapped producers stay NaN and every one of them is later
# turned into the same literal 'nan' label by astype(str).
train_df['Name'] = train_df['Name'].fillna(train_df['Site Country'])

# Drop the helper columns and let the resolved name take over 'Site Country'
train_df = train_df.drop(['Site Country', 'Code'], axis=1)
train_df = train_df.rename(columns={'Name': 'Site Country'})

# Display the final DataFrame
train_df.head(5)

In [None]:
# Upper-case the destination codes before joining (presumably the lookup
# table stores its 'Code' values in upper case -- confirm against the file),
# then left-join to map country codes to country names
train_df['Country'] = train_df['Country'].str.upper()
train_df = pd.merge(train_df, country_code_df, left_on='Country', right_on='Code', how='left')

# Fill missing names (codes absent from the lookup) with the original code.
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is a chained in-place update, which pandas 2.x warns about
# and which does not modify the frame at all under Copy-on-Write.
train_df['Name'] = train_df['Name'].fillna(train_df['Country'])

# Drop the helper columns and let the resolved name take over 'Country'
train_df = train_df.drop(['Country', 'Code'], axis=1)
train_df = train_df.rename(columns={'Name': 'Country'})

train_df.head(5)


In [None]:
# Parse the monthly sales columns as nullable integers: values that cannot
# be parsed as numbers become <NA> instead of raising.
for month_col in ('Month 1', 'Month 2', 'Month 3'):
    train_df[month_col] = pd.to_numeric(train_df[month_col], errors='coerce').astype('Int64')

# Force both country columns to plain strings.
# Note: astype(str) renders a missing value as the literal text 'nan'.
for name_col in ('Site Country', 'Country'):
    train_df[name_col] = train_df[name_col].astype(str)

train_df.dtypes

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Build one row per (producing country, destination country) pair.
month_columns = ['Month 1', 'Month 2', 'Month 3']

aggregated_df = (
    train_df
    .groupby(['Site Country', 'Country'])
    .agg({month: 'sum' for month in month_columns})
    .reset_index()
    # Collapse the three monthly totals into a single 'Sales' figure ...
    .assign(Sales=lambda pairs: pairs[month_columns].sum(axis=1))
    # ... after which the individual month columns are no longer needed.
    .drop(columns=month_columns)
    # Keep only pairs whose combined sales are strictly greater than 100.
    .loc[lambda pairs: pairs['Sales'] > 100]
    # Discard pairs where producer and destination are the same country.
    .loc[lambda pairs: pairs['Site Country'] != pairs['Country']]
)

# The ten country pairs with the highest combined sales.
top_10_sales = aggregated_df.nlargest(10, 'Sales')
print(top_10_sales)

In [None]:
# Directed graph: one edge per (producing country -> destination country)
# pair, weighted by that pair's total sales.
G = nx.DiGraph()

for _, route in aggregated_df.iterrows():
    origin, destination = route['Site Country'], route['Country']

    # A node is tagged only the first time it is encountered:
    # bipartite=0 if first seen as a producer, bipartite=1 if first seen
    # as a destination. Later sightings leave the tag untouched.
    if not G.has_node(origin):
        G.add_node(origin, bipartite=0)
    if not G.has_node(destination):
        G.add_node(destination, bipartite=1)

    G.add_edge(origin, destination, weight=route['Sales'])

# Large canvas so node and edge labels stay legible.
plt.figure(figsize=(25, 15))

# Shell layout for the node positions.
pos = nx.shell_layout(G)

# Nodes and edges first, then the sales figure on top of every edge.
nx.draw(G, pos, with_labels=True, font_weight='bold')
edge_labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

plt.show()


In [None]:
# Extract year and month information from the text in 'Date'.
# expand=False makes str.extract return a Series for the single capture
# group instead of a one-column DataFrame.
train_df['Year'] = train_df['Date'].str.extract(r'(\d{4})', expand=False)
train_df['Month'] = train_df['Date'].str.extract(r'([a-zA-Z]+)', expand=False)

# Create a mapping of month names to numerical values
month_mapping = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

# Map month names to numerical values (names not in the mapping become NaN)
train_df['Month'] = train_df['Month'].str.lower().map(month_mapping)

# Sort by product, then chronologically, and group by 'id_product'.
# Note that 'sorted_df' is a GroupBy object, not a DataFrame.
sorted_df = train_df.sort_values(['id_product', 'Year', 'Month']).groupby('id_product')
# Only rendered when this is the last expression of a cell.
sorted_df.head(5)

# Create a new column to store the sequence of 'Site Country' values:
# every row of a product receives that product's full '; '-joined sequence
# (the result is aligned back to train_df by index on assignment).
train_df['Site_Country_Sequence'] = sorted_df['Site Country'].transform(lambda x: '; '.join(x))

# Drop temporary columns 'Year' and 'Month'
train_df = train_df.drop(['Year', 'Month'], axis=1)


In [None]:
# Print the first two rows with no column-width limit so long cell values
# are shown in full. option_context restores the previous
# 'display.max_colwidth' setting when the block exits, even if printing
# raises, so the wider setting never leaks into later cells.
with pd.option_context('display.max_colwidth', None):
    print(train_df.head(2))

In [None]:
def _distinct_country_count(sequence):
    """Return how many different names occur in a '; '-separated sequence string."""
    return len(set(sequence.split('; ')))


# Temporary helper column: number of distinct site countries per row's sequence.
train_df['Unique_Country_Count'] = train_df['Site_Country_Sequence'].apply(_distinct_country_count)

# Rows whose sequence mentions more than one distinct country.
rows_with_multiple_countries = train_df.loc[train_df['Unique_Country_Count'].gt(1)]
print(rows_with_multiple_countries)

# The helper column is no longer needed once the rows have been selected.
train_df = train_df.drop(columns='Unique_Country_Count')

There are no products whose Site Country changes throughout the dataset.