In [2]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


In [5]:
# Load the raw table from a CSV file addressed by a relative path.
df = pd.read_csv('Financials.csv')

In [None]:
# (rows, columns) of the freshly loaded table.
df.shape

In [None]:
# Peek at the first row to see the column headers and value formats as loaded.
df.head(1)

Приведём названия колонок к единому виду: уберём пробелы по краям, переведём в нижний регистр и заменим пробелы на подчёркивания.

In [None]:
# Normalise every header: trim outer whitespace, lower-case, spaces -> underscores.
df.columns = [name.strip().lower().replace(' ', '_') for name in df.columns]

Проверим данные на наличие пропусков и дубликатов.

In [None]:
# Per column: does it contain at least one missing value?
print(df.isna().any())
# Number of rows that are exact duplicates of an earlier row.
print(df.duplicated().sum())

In [None]:
# Count of distinct values in each column.
df.nunique()

In [None]:
# Show the first row again to inspect the values under the current column names.
df.head(1)

In [None]:
# Column names as a plain Python list (handy for copy-pasting into later cells).
list(df.columns)

Очищаем данные от ненужных символов и приводим данные к нужным форматам.

In [None]:
# Columns that hold currency-formatted text and must end up as numeric strings.
columns_to_clean = ['units_sold',
                    'manufacturing_price',
                    'sale_price',
                    'gross_sales',
                    'discounts',
                    'sales',
                    'cogs',
                    'profit']


def _currency_to_number_text(column):
    """Convert a Series of currency-formatted strings to float-parsable strings.

    Parameters
    ----------
    column : pandas.Series
        String values such as "$1,234.50", "(1,234.50)" or "-".

    Returns
    -------
    pandas.Series
        Strings without "$", "," or parentheses. A value wrapped in
        parentheses (accounting notation for a negative amount) gets a leading
        minus sign, and a value that is exactly "-" becomes "0". Missing
        values stay missing.
    """
    text = column.str.replace('[$,]', '', regex=True).str.strip()
    # "(1234.50)" -> "-1234.50": keep the sign instead of silently dropping it.
    text = text.str.replace(r'^\(\s*(.*?)\s*\)$', r'-\1', regex=True)
    # Remove any remaining unpaired parentheses.
    text = text.str.replace('[)(]', '', regex=True).str.strip()
    # Only a value that is exactly "-" is a zero placeholder; replacing every
    # dash would turn a leading minus sign into a digit.
    return text.mask(text == '-', '0')


df[columns_to_clean] = df[columns_to_clean].apply(_currency_to_number_text)

In [None]:
# The cleaned text columns become floats.
numeric_values = df[columns_to_clean].astype(float)
df[columns_to_clean] = numeric_values

# dayfirst=True: ambiguous day/month strings are read with the day first.
parsed_dates = pd.to_datetime(df['date'], dayfirst=True)
df['date'] = parsed_dates

# "YYYY-MM" label derived from the parsed date.
df['year_month'] = df['date'].dt.strftime('%Y-%m')
df.head(5)

Используем метод `applymap`, чтобы избавиться от лишних пробелов по всему датафрейму.

In [None]:
# Trim leading/trailing whitespace from every string cell; non-string values
# pass through unchanged.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use the new method when this pandas provides it and fall back otherwise.
_elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = _elementwise(lambda x: x.strip() if isinstance(x, str) else x)

In [None]:
# Preview the first rows after the whitespace cleanup.
df.head()

In [None]:
# Total units sold for every (country, product) pair, as a flat table.
product_sold_by_country = (
    df.groupby(['country', 'product'])['units_sold']
    .sum()
    .reset_index()
)

In [None]:
# Units sold per product, one bar per country; error bars are switched off.
sns.barplot(
    data=product_sold_by_country,
    x='product',
    y='units_sold',
    hue='country',
    errorbar=None,
)

In [None]:
# Share of rows belonging to each segment, drawn as a pie chart.
segment_representation = df['segment'].value_counts()
plt.pie(
    segment_representation.values,
    labels=segment_representation.index,
    autopct='%1.1f%%',
)
plt.show()

In [None]:
# Total sales for every (date, country) pair, as a flat table.
sales_df = df.groupby(['date', 'country'])['sales'].sum().reset_index()

# One line per country over time; tick labels are rotated by 45 degrees.
sns.lineplot(data=sales_df, x='date', y='sales', hue='country')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Average sales per segment.
df.groupby('segment')['sales'].mean()

In [None]:
# Average profit per segment.
df.groupby('segment')['profit'].mean()

In [None]:
# Mean sales and mean profit per segment, computed once and shared by both panels.
segment_means = df.groupby('segment', as_index=False)[['sales', 'profit']].mean()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# x and y are passed explicitly so every segment gets its own bar.
# NOTE(review): how a bare Series passed as `data` is drawn appears to depend
# on the seaborn version; the long-form call below avoids relying on that.
sns.barplot(data=segment_means, x='segment', y='sales',
            ax=axes[0], color='lightgreen', errorbar=None)
sns.barplot(data=segment_means, x='segment', y='profit',
            ax=axes[1], color='skyblue', errorbar=None)

axes[0].set_title('Mean sales by segment')
axes[1].set_title('Mean profit by segment')
plt.show()

In [None]:
# Save the current state of the table; index=False leaves the row index out of the file.
df.to_csv('financials_clean.csv', index=False)