In [None]:
import os

def scale_input_data(scale_factor):
  """Write a resized ``.scaled.csv`` copy of every input table.

  For each of ``./input/articles``, ``./input/customers`` and
  ``./input/transactions_train`` the matching ``.csv`` file is read and a
  ``.scaled.csv`` file is written next to it:

  * ``scale_factor == 1.0``: the file is copied as-is, without parsing it.
  * ``scale_factor < 1.0``: only the first ``int(scale_factor * n_rows)``
    rows are kept.
  * ``scale_factor > 1.0``: rows are repeated from the top of the table
    until it holds ``int(scale_factor * n_rows)`` rows.

  Args:
    scale_factor: Non-negative multiplier applied to the row count of each
      table.

  Raises:
    ValueError: If ``scale_factor`` is negative or NaN. A negative factor
      used to be accepted and silently kept a prefix of the table, because
      a negative ``iloc`` stop counts from the end.
  """
  # Function-scope imports (kept local so no module-level name is added), but
  # done once instead of on every loop iteration.
  import shutil
  import pandas as pd

  if not scale_factor >= 0:
    raise ValueError(f'scale_factor must be non-negative, got {scale_factor!r}')

  file_bases = ['./input/articles', './input/customers', './input/transactions_train']
  for file_base in file_bases:
    if scale_factor == 1.0:
      # Nothing to resize: a plain file copy avoids parsing the CSV.
      shutil.copyfile(file_base + '.csv', file_base + '.scaled.csv')
      continue
    df_to_scale = pd.read_csv(file_base + '.csv')
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      # Append a prefix of the current frame until the target size is reached;
      # the frame at most doubles per pass, so few passes are needed.
      while len(df_to_scale) < new_num_rows:
        rows_missing = new_num_rows - len(df_to_scale)
        df_to_scale = pd.concat([df_to_scale, df_to_scale[:min(rows_missing, len(df_to_scale))]])
    df_to_scale.to_csv(file_base + '.scaled.csv', index=False)

# Scaling is opt-in: the scaled copies are only (re)generated when the
# INPUT_SCALE_FACTOR environment variable is present.
input_scale_factor = os.environ.get('INPUT_SCALE_FACTOR')
if input_scale_factor is not None:
  scale_input_data(float(input_scale_factor))

## **<span style="color:#023e8a;font-size:200%"><center> 🔥🔥EDA H&M🔥🔥</center></span>**
## **<center><span style="color:#FEF1FE;background-color:#023e8a;border-radius: 5px;padding: 5px">If you find this notebook useful or interesting, please, support with an upvote :)</span></center>**

## **<span style="color:#023e8a;font-size:1000%"><center>EDA</center></span><span style="color:#023e8a;font-size:200%"><center>Exploratory Data Analysis. H&M</center></span>**

# **<a id="Content" style="color:#023e8a;">Table of Content</a>**
* [**<span style="color:#023e8a;">1. First steps</span>**](#First)  
* [**<span style="color:#023e8a;">2. Articles</span>**](#Articles)  
* [**<span style="color:#023e8a;">3. Customers</span>**](#Customers)  
* [**<span style="color:#023e8a;">4. Transactions</span>**](#Transactions)  
* [**<span style="color:#023e8a;">5. Images with description and price</span>**](#Images)  


## **<span style="color:#023e8a;">Intro</span>**

**<span style="color:#023e8a;">The competition is dedicated to product recommendations (H&M)  </span>**

**<span style="color:#023e8a;">Here we have different kinds of data that help us to get good recommendations: </span>**

📸 `images` - images of every article_id

🙋 `articles`  - detailed metadata of every article_id

👔 `customers`  - detailed metadata of every customer_id

🧾 `transactions_train`  - purchases with details

## **<span id="First" style="color:#023e8a;">1. First steps</span>**

[**<span style="color:#FEF1FE;background-color:#023e8a;border-radius: 5px;padding: 2px">Go to Table of Content</span>**](#Content)

**<span style="color:#023e8a;"> The first step as always: load the data :)</span>**

In [1]:
import numpy as np
# import pandas as pd
# The import statement(s) to run are read from the IREWR_IMPORTS environment
# variable and executed here; presumably this is what binds the `pd` name used
# by the following cells. A KeyError is raised if the variable is not set.
# NOTE(review): exec() runs arbitrary code taken from the environment -- only
# run this notebook where IREWR_IMPORTS is trusted.
exec(os.environ['IREWR_IMPORTS'])
# ALEX: remove plotting
# import seaborn as sns
# from matplotlib import pyplot as plt
# from tqdm.notebook import tqdm

In [2]:
# Load the three scaled input tables in one pass.
articles, customers, transactions = map(
    pd.read_csv,
    (
        "./input/articles.scaled.csv",
        "./input/customers.scaled.csv",
        "./input/transactions_train.scaled.csv",
    ),
)

**<span style="color:#023e8a;"> Let's look at the tables and try to get some outcomes about data inside.</span>**

## **<span id="Articles" style="color:#023e8a;">2. Articles</span>**

[**<span style="color:#FEF1FE;background-color:#023e8a;border-radius: 5px;padding: 2px">Go to Table of Content</span>**](#Content)

**<span style="color:#023e8a;"> This table contains all h&m articles with details such as a type of product, a color, a product group and other features.</span>**  
**<span style="color:#023e8a;"> Article data description: </span>**

> `article_id` **<span style="color:#023e8a;">: A unique identifier of every article.</span>**  
> `product_code`, `prod_name` **<span style="color:#023e8a;">: A unique identifier of every product and its name (not the same).</span>**  
> `product_type_no`, `product_type_name` **<span style="color:#023e8a;">: The group of product_code and its name</span>**  
> `graphical_appearance_no`, `graphical_appearance_name` **<span style="color:#023e8a;">: The group of graphics and its name</span>**  
> `colour_group_code`, `colour_group_name` **<span style="color:#023e8a;">: The group of color and its name</span>**  
> `perceived_colour_value_id`, `perceived_colour_value_name`, `perceived_colour_master_id`, `perceived_colour_master_name` **<span style="color:#023e8a;">: The added color info</span>**  
> `department_no`, `department_name` **<span style="color:#023e8a;">: A unique identifier of every department and its name</span>**  
> `index_code`, `index_name` **<span style="color:#023e8a;">: A unique identifier of every index and its name</span>**  
> `index_group_no`, `index_group_name` **<span style="color:#023e8a;">: A group of indices and its name</span>**  
> `section_no`, `section_name` **<span style="color:#023e8a;">: A unique identifier of every section and its name</span>**  
> `garment_group_no`, `garment_group_name` **<span style="color:#023e8a;">: A unique identifier of every garment group and its name</span>**  
> `detail_desc` **<span style="color:#023e8a;">: Details</span>**  

In [3]:
# Preview the first rows of the articles table.
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


**<span style="color:#023e8a;">Ladieswear accounts for a significant share of all articles. Sportswear has the smallest share.</span>**

In [4]:
# ALEX: remove plotting
# f, ax = plt.subplots(figsize=(15, 7))
# ax = sns.histplot(data=articles, y='index_name', color='orange')
# ax.set_xlabel('count by index name')
# ax.set_ylabel('index name')
# plt.show()

**<span style="color:#023e8a;"> The garments grouped by index: Jersey fancy is the most frequent garment, especially for women and children. The next by number is accessories, many various accessories with low price.</span>**

In [5]:
# ALEX: remove plotting
# f, ax = plt.subplots(figsize=(15, 7))
# ax = sns.histplot(data=articles, y='garment_group_name', color='orange', hue='index_group_name', multiple="stack")
# ax.set_xlabel('count by garment group')
# ax.set_ylabel('garment group')
# plt.show()

**<span style="color:#023e8a;">Now, pay attention to index group-index structure. Ladieswear and Children/Baby have subgroups.</span>**

In [6]:
# Number of articles in every (index group, index) pair.
articles.groupby(['index_group_name', 'index_name'])['article_id'].count()

index_group_name  index_name                    
Baby/Children     Baby Sizes 50-98                   8875
                  Children Accessories, Swimwear     4615
                  Children Sizes 134-170             9214
                  Children Sizes 92-140             12007
Divided           Divided                           15149
Ladieswear        Ladies Accessories                 6961
                  Ladieswear                        26001
                  Lingeries/Tights                   6775
Menswear          Menswear                          12553
Sport             Sport                              3392
Name: article_id, dtype: int64

**<span style="color:#023e8a;"> Now look at the product group / product type structure. Accessories are highly varied; the most numerous are bags, earrings and hats. However, trousers prevail overall.</span>**

In [7]:
# Lift the row limit so the complete product listing is displayed.
pd.set_option('display.max_rows', None)
# Number of articles in every (product group, product type) pair.
articles.groupby(['product_group_name', 'product_type_name'])['article_id'].count()

product_group_name     product_type_name       
Accessories            Accessories set                 7
                       Alice band                      6
                       Baby Bib                        3
                       Bag                          1280
                       Beanie                         56
                       Belt                          458
                       Bracelet                      180
                       Braces                          3
                       Bucket hat                      7
                       Cap                            13
                       Cap/peaked                    573
                       Dog Wear                       20
                       Earring                      1159
                       Earrings                       11
                       Eyeglasses                      2
                       Felt hat                       10
                       Giftbox          

**<span style="color:#023e8a;"> And the table with number of unique values in columns:</span>**

In [8]:
# Report the number of distinct values per column, skipping every column whose
# name contains one of the identifier-like markers below.
id_like_markers = ('no', 'code', 'id')
for col in articles.columns:
    if any(marker in col for marker in id_like_markers):
        continue
    print(f'n of unique {col}: {articles[col].nunique()}')

n of unique prod_name: 45875
n of unique product_type_name: 131
n of unique product_group_name: 19
n of unique graphical_appearance_name: 30
n of unique colour_group_name: 50
n of unique perceived_colour_value_name: 8
n of unique perceived_colour_master_name: 20
n of unique department_name: 250
n of unique index_name: 10
n of unique index_group_name: 5
n of unique section_name: 56
n of unique garment_group_name: 21
n of unique detail_desc: 43404


## **<span id="Customers" style="color:#023e8a;">3. Customers</span>**

[**<span style="color:#FEF1FE;background-color:#023e8a;border-radius: 5px;padding: 2px">Go to Table of Content</span>**](#Content)

**<span style="color:#023e8a;"> Customers data description: </span>**

> `customer_id` **<span style="color:#023e8a;">: A unique identifier of every customer</span>**  
> `FN` **<span style="color:#023e8a;">: 1 or missing </span>**  
> `Active` **<span style="color:#023e8a;">: 1 or missing</span>**  
> `club_member_status` **<span style="color:#023e8a;">: Status in club</span>**  
> `fashion_news_frequency` **<span style="color:#023e8a;">: How often H&M may send news to customer</span>**  
> `age` **<span style="color:#023e8a;">: The current age</span>**  
> `postal_code` **<span style="color:#023e8a;">: Postal code of customer</span>**  

In [9]:
# Put the display row limit back to 50 after the unlimited listing above.
pd.set_option('display.max_rows', 50)
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


**<span style="color:#023e8a;"> There are no duplicates in </span>** `customers`

In [10]:
# Rows in excess of the number of distinct customer ids (0 means no duplicates).
len(customers) - customers['customer_id'].nunique()

0

**<span style="color:#023e8a;"> Here we have an abnormal number of customers for one postal code: it has 120303 customers. It might be an encoded NaN address, or something like a huge distribution center or pickup point.</span>**

In [11]:
# Non-null counts of every column per postal code, largest customer count first.
data_postal = (
    customers
    .groupby('postal_code', as_index=False)
    .count()
    .sort_values(by='customer_id', ascending=False)
)
data_postal.head()

Unnamed: 0,postal_code,customer_id,FN,Active,club_member_status,fashion_news_frequency,age
61034,2c29ae653a9282cce4151bd87643c907644e09541abc28...,120303,42874,39886,118281,114377,118002
281937,cc4ed85e30f4977dae47662ddc468cd2eec11472de6fac...,261,109,104,261,261,260
156090,714976379549eb90aae4a71bca6c7402cc646ae7c40f6c...,159,90,88,159,159,158
171208,7c1fa3b0ec1d37ce2c3f34f63bd792f3b4494f324b6be5...,157,55,54,157,156,156
126228,5b7eb31eabebd3277de632b82267286d847fd5d44287ee...,156,42,41,156,156,155


**<span style="color:#023e8a;"> Ages, club_member_status are different, like customer_ids.</span>**

In [12]:
# Postal code under inspection (first row of the data_postal preview).
top_postal_code = '2c29ae653a9282cce4151bd87643c907644e09541abc28ae87dea0d1f6603b1c'
customers.loc[customers['postal_code'] == top_postal_code].head(5)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
5,000064249685c11552da43ef22a5030f35a147f723d5b0...,,,,,,2c29ae653a9282cce4151bd87643c907644e09541abc28...
8,00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77...,,,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...
9,00008469a21b50b3d147c97135e25b4201a8c58997f787...,,,ACTIVE,NONE,20.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...
11,000097d91384a0c14893c09ed047a963c4fc6a5c021044...,,,ACTIVE,NONE,31.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...
14,0000ae1bbb25e04bdc7e35f718e852adfb3fbb72ef38b3...,1.0,1.0,ACTIVE,Regularly,29.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...


**<span style="color:#023e8a;"> The most common age is about 21-23</span>**

In [13]:
# ALEX: remove plotting
# import seaborn as sns
# from matplotlib import pyplot as plt
# sns.set_style("darkgrid")
# f, ax = plt.subplots(figsize=(10,5))
# ax = sns.histplot(data=customers, x='age', bins=50, color='orange')
# ax.set_xlabel('Distribution of the customers age')
# plt.show()

**<span style="color:#023e8a;"> Status in H&M club. Almost every customer has an active club status, some of them begin to activate it (pre-create). A tiny part of customers abandoned the club.</span>**

In [14]:
# ALEX: remove plotting
# sns.set_style("darkgrid")
# f, ax = plt.subplots(figsize=(10,5))
# ax = sns.histplot(data=customers, x='club_member_status', color='orange')
# ax.set_xlabel('Distribution of club member status')
# plt.show()

**<span style="color:#023e8a;"> Here we have three types for NO DATA. Let's unite these values.</span>**

In [15]:
# List the distinct fashion_news_frequency values to spot inconsistent labels.
customers['fashion_news_frequency'].unique()

array(['NONE', 'Regularly', nan, 'Monthly', 'None'], dtype=object)

In [16]:
# Every value other than 'Regularly' / 'Monthly' is collapsed into the single
# label 'None' (the column is updated in place).
has_known_frequency = customers['fashion_news_frequency'].isin(['Regularly', 'Monthly'])
customers.loc[~has_known_frequency, 'fashion_news_frequency'] = 'None'
customers['fashion_news_frequency'].unique()

array(['None', 'Regularly', 'Monthly'], dtype=object)

In [17]:
# Number of customers per fashion_news_frequency value (input of the pie chart).
pie_data = customers.groupby('fashion_news_frequency')[['customer_id']].count()

**<span style="color:#023e8a;"> Customers prefer not to get any messages about the current news.</span>**

In [18]:
# ALEX: remove plotting
# sns.set_style("darkgrid")
# f, ax = plt.subplots(figsize=(10,5))
# # ax = sns.histplot(data=customers, x='fashion_news_frequency', color='orange')
# # ax = sns.pie(data=customers, x='fashion_news_frequency', color='orange')
# colors = sns.color_palette('pastel')
# ax.pie(pie_data.customer_id, labels=pie_data.index, colors = colors)
# ax.set_facecolor('lightgrey')
# ax.set_xlabel('Distribution of fashion news frequency')
# plt.show()

## **<span id="Transactions" style="color:#023e8a;">4. Transactions</span>**

[**<span style="color:#FEF1FE;background-color:#023e8a;border-radius: 5px;padding: 2px">Go to Table of Content</span>**](#Content)

**<span style="color:#023e8a;"> Transactions data description: </span>**

> `t_dat` **<span style="color:#023e8a;">: Date of the transaction</span>**  
> `customer_id` **<span style="color:#023e8a;">: A unique identifier of every customer </span>**  **<span style="color:#FF0000;">(in </span>** `customers` **<span style="color:#FF0000;"> table)</span>**  
> `article_id` **<span style="color:#023e8a;">: A unique identifier of every article</span>**  **<span style="color:#FF0000;">(in </span>** `articles` **<span style="color:#FF0000;"> table)</span>**  
> `price` **<span style="color:#023e8a;">: Price of purchase</span>**  
> `sales_channel_id` **<span style="color:#023e8a;">: 1 or 2</span>**  

In [19]:
# Preview the first rows of the transactions table.
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


**<span style="color:#023e8a;"> Here we see outliers for price. </span>**

In [20]:
# From here on, floats are displayed with four decimal places.
pd.options.display.float_format = '{:.4f}'.format
# Summary statistics of the price column.
transactions['price'].describe()

count   31788324.0000
mean           0.0278
std            0.0192
min            0.0000
25%            0.0158
50%            0.0254
75%            0.0339
max            0.5915
Name: price, dtype: float64

In [21]:
# Same preview as before, now rendered with the four-decimal float format.
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.0508,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.0305,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.0152,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.0169,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.0169,2


In [22]:
# ALEX: remove plotting
# sns.set_style("darkgrid")
# f, ax = plt.subplots(figsize=(10,5))
# ax = sns.boxplot(data=transactions, x='price', color='orange')
# ax.set_xlabel('Price outliers')
# plt.show()

**<span style="color:#023e8a;"> Top 10 customers by num of transactions. </span>**

In [23]:
# Non-null count of every column per customer; the next cell reads the 'price'
# count as the number of transactions.
transactions_byid = transactions.groupby('customer_id').count()

In [24]:
# Ten customers with the highest transaction counts.
transactions_byid['price'].sort_values(ascending=False).iloc[:10]

customer_id
be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee985513d9e8e53c6d91b    1895
b4db5e5259234574edfff958e170fe3a5e13b6f146752ca066abca3c156acc71    1441
49beaacac0c7801c2ce2d189efe525fe80b5d37e46ed05b50a4cd88e34d0748f    1364
a65f77281a528bf5c1e9f270141d601d116e1df33bf9df512f495ee06647a9cc    1361
cd04ec2726dd58a8c753e0d6423e57716fd9ebcf2f14ed6012e7e5bea016b4d6    1237
55d15396193dfd45836af3a6269a079efea339e875eff42cc0c228b002548a9d    1208
c140410d72a41ee5e2e3ba3d7f5a860f337f1b5e41c27cf9bda5517c8774f8fa    1170
8df45859ccd71ef1e48e2ee9d1c65d5728c31c46ae957d659fa4e5c3af6cc076    1169
03d0011487606c37c1b1ed147fc72f285a50c05f00b9712e0fc3da400c864296    1157
6cc121e5cc202d2bf344ffe795002bdbf87178054bcda2e57161f0ef810a4b55    1143
Name: price, dtype: int64

**<span style="color:#023e8a;"> However, comparing prices inside groups is more accurate, because accessories and trousers prices may vary widely. </span>**

**<span style="color:#023e8a;"> Get subset from articles and merge it to transactions. </span>**

In [25]:
# Article columns that get attached to the transactions in the next cell.
article_merge_columns = [
    'article_id',
    'prod_name',
    'product_type_name',
    'product_group_name',
    'index_name',
]
articles_for_merge = articles[article_merge_columns]

In [26]:
# Attach article metadata to every transaction (left join on article_id, so
# transactions without a matching article are kept with missing metadata).
# The metadata columns are taken from `articles` directly instead of from the
# current value of `articles_for_merge`: this cell overwrites that name, so
# reading it back would make a re-run merge the already-merged frame with
# `transactions` again.
articles_for_merge = transactions[['customer_id', 'article_id', 'price', 't_dat']].merge(
    articles[['article_id', 'prod_name', 'product_type_name', 'product_group_name', 'index_name']],
    on='article_id',
    how='left',
)

**<span style="color:#023e8a;"> Here we see outliers for group name prices. Lower/Upper/Full body have a huge price variance. I guess it could be like some unique collections, relative to casual ones. Some high price articles even belong to accessories group.</span>**

In [27]:
# ALEX: remove plotting
# sns.set_style("darkgrid")
# f, ax = plt.subplots(figsize=(25,18))
# ax = sns.boxplot(data=articles_for_merge, x='price', y='product_group_name')
# ax.set_xlabel('Price outliers', fontsize=22)
# ax.set_ylabel('Index names', fontsize=22)
# ax.xaxis.set_tick_params(labelsize=22)
# ax.yaxis.set_tick_params(labelsize=22)

# plt.show()

**<span style="color:#023e8a;"> Then look at boxplot prices according to accessories product group and find the reasons of high prices inside group.</span>**

**<span style="color:#023e8a;"> The largest outliers can be found among bags, which is logical enough. In addition, scarves and other accessories have articles with prices highly contrasting to the rest of garments.</span>**

In [28]:
# ALEX: remove plotting
# sns.set_style("darkgrid")
# f, ax = plt.subplots(figsize=(25,18))
# Transactions of the 'Accessories' product group (input of the disabled boxplot).
_ = articles_for_merge.loc[articles_for_merge['product_group_name'] == 'Accessories']
# ALEX: remove plotting
# ax = sns.boxplot(data=_, x='price', y='product_type_name')
# ax.set_xlabel('Price outliers', fontsize=22)
# ax.set_ylabel('Index names', fontsize=22)
# ax.xaxis.set_tick_params(labelsize=22)
# ax.yaxis.set_tick_params(labelsize=22)
# del _

# plt.show()

**<span style="color:#023e8a;"> The index with the highest mean price is Ladieswear. With the lowest - children. </span>**

In [29]:
# Mean transaction price per index name.
articles_index = articles_for_merge.groupby('index_name')[['price']].mean()
# ALEX: remove plotting
# sns.set_style("darkgrid")
# f, ax = plt.subplots(figsize=(10,5))
# ax = sns.barplot(x=articles_index.price, y=articles_index.index, color='orange', alpha=0.8)
# ax.set_xlabel('Price by index')
# ax.set_ylabel('Index')
# plt.show()
# The two values the disabled bar plot would have read are still accessed.
_ = articles_index.price
_ = articles_index.index

**<span style="color:#023e8a;"> Stationery has the lowest mean price, the highest - shoes. </span>**

In [30]:
# Mean transaction price per product group.
articles_index = articles_for_merge.groupby('product_group_name')[['price']].mean()
# ALEX: remove plotting
# sns.set_style("darkgrid")
# f, ax = plt.subplots(figsize=(10,5))
# ax = sns.barplot(x=articles_index.price, y=articles_index.index, color='orange', alpha=0.8)
# ax.set_xlabel('Price by product group')
# ax.set_ylabel('Product group')
# plt.show()
# The two values the disabled bar plot would have read are still accessed.
_ = articles_index.price
_ = articles_index.index

**<span style="color:#023e8a;"> Now check the mean price change in time for top 5 product groups by mean price: </span>**
>`Shoes`  
>`Garment Full body`  
>`Bags`  
>`Garment Lower body`  
>`Underwear/nightwear`  

In [31]:
# Convert t_dat to datetimes (in place) so the next cell can group the
# transactions by month with pd.Grouper.
articles_for_merge['t_dat'] = pd.to_datetime(articles_for_merge['t_dat'])

In [32]:
product_list = ['Shoes', 'Garment Full body', 'Bags', 'Garment Lower body', 'Underwear/nightwear']
# Only referenced by the disabled plotting code; kept so it can be re-enabled.
colors = ['cadetblue', 'orange', 'mediumspringgreen', 'tomato', 'lightseagreen']
# ALEX: remove plotting
# f, ax = plt.subplots(3, 2, figsize=(20, 15))
# Iterate over the product groups directly. The previous version walked a 3x2
# grid and relied on catching IndexError to stop after the last product, which
# would also have hidden any genuine IndexError raised inside the loop body.
for k, product in enumerate(product_list):
    # (i, j) is the panel of the disabled 3x2 subplot grid for this product.
    i, j = divmod(k, 2)
    articles_for_merge_product = articles_for_merge[articles_for_merge.product_group_name == product]
    # Build the monthly grouping once and reuse it for both statistics.
    # NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
    # 'ME' -- confirm the target pandas version before changing it.
    monthly_prices = articles_for_merge_product[['t_dat', 'price']].groupby(pd.Grouper(key="t_dat", freq='M'))
    # Months without transactions produce NaN; they are reported as 0.
    series_mean = monthly_prices.mean().fillna(0)
    series_std = monthly_prices.std().fillna(0)
# ALEX: remove plotting
#     ax[i, j].plot(series_mean, linewidth=4, color=colors[k])
#     ax[i, j].fill_between(series_mean.index, (series_mean.values-2*series_std.values).ravel(),
#                      (series_mean.values+2*series_std.values).ravel(), color=colors[k], alpha=.1)
#     ax[i, j].set_title(f'Mean {product} price in time')
#     ax[i, j].set_xlabel('month')
#     ax[i, j].set_xlabel(f'{product}')
    # The values the disabled plot would have consumed are still computed:
    # the time index and the mean +/- 2 std band.
    _ = series_mean.index
    _ = (series_mean.values-2*series_std.values).ravel()
    _ = (series_mean.values+2*series_std.values).ravel()
# ALEX: remove plotting
# ax[2, 1].set_visible(False)  # sixth panel of the 3x2 grid stays empty
# plt.show()

## **<span id="Images" style="color:#023e8a;">5. Images with description and price</span>**

[**<span style="color:#FEF1FE;background-color:#023e8a;border-radius: 5px;padding: 2px">Go to Table of Content</span>**](#Content)

**<span style="color:#023e8a;"> Let's check the last purchases by max price and by min price </span>**

In [33]:
# ALEX: remove plotting
# import matplotlib.pyplot as plt
# import matplotlib.image as mpimg

In [34]:
# Five most expensive purchases made on the latest transaction date.
latest_day = transactions.t_dat.max()
max_price_ids = (
    transactions.loc[transactions.t_dat == latest_day]
    .sort_values('price', ascending=False)
    .head(5)[['article_id', 'price']]
)
# Five cheapest purchases made on the earliest transaction date.
# NOTE(review): this uses t_dat.min(), while the surrounding text talks about
# the last purchases -- confirm which date is intended.
earliest_day = transactions.t_dat.min()
min_price_ids = (
    transactions.loc[transactions.t_dat == earliest_day]
    .sort_values('price', ascending=True)
    .head(5)[['article_id', 'price']]
)

**<span style="color:#023e8a;"> Photos with description and price (top 5 max) </span>**

In [35]:
# ALEX: remove plotting
# f, ax = plt.subplots(1, 5, figsize=(20,10))
# Build a wrapped caption for each of the top-priced purchases.
i = 0
for _, data in max_price_ids.iterrows():
    matching_desc = articles.loc[articles['article_id'] == data['article_id'], 'detail_desc']
    # Fall back to an empty caption when the article is not present in
    # `articles` (e.g. the table was truncated by input scaling) or when its
    # description is missing (NaN); previously either case raised an exception.
    desc = matching_desc.iloc[0] if len(matching_desc) > 0 else ''
    if not isinstance(desc, str):
        desc = ''
    # Append a line break to every 5th word so the caption wraps.
    desc_list = [
        word + '\n' if j > 0 and j % 5 == 0 else word
        for j, word in enumerate(desc.split(' '))
    ]
    desc = ' '.join(desc_list)
# ALEX: remove plotting
#     img = mpimg.imread(f'./input/images/0{str(data.article_id)[:2]}/0{int(data.article_id)}.jpg')
#     ax[i].imshow(img)
#     ax[i].set_title(f'price: {data.price:.2f}')
#     ax[i].set_xticks([], [])
#     ax[i].set_yticks([], [])
#     ax[i].grid(False)
#     ax[i].set_xlabel(desc, fontsize=10)
    # The values the disabled plotting code would have read are still accessed.
    _ = data.article_id
    _ = data.article_id
    _ = data.price
    i += 1
# ALEX: remove plotting
# plt.show()

**<span style="color:#023e8a;"> Photos with description and price (top 5 min) </span>**

In [36]:
# ALEX: remove plotting
# f, ax = plt.subplots(1, 5, figsize=(20,10))
# Build a wrapped caption for each of the lowest-priced purchases.
i = 0
for _, data in min_price_ids.iterrows():
    matching_desc = articles.loc[articles['article_id'] == data['article_id'], 'detail_desc']
    # Fall back to an empty caption when the article is not present in
    # `articles` (e.g. the table was truncated by input scaling) or when its
    # description is missing (NaN); previously either case raised an exception.
    desc = matching_desc.iloc[0] if len(matching_desc) > 0 else ''
    if not isinstance(desc, str):
        desc = ''
    # Append a line break to every 4th word so the caption wraps.
    desc_list = [
        word + '\n' if j > 0 and j % 4 == 0 else word
        for j, word in enumerate(desc.split(' '))
    ]
    desc = ' '.join(desc_list)
# ALEX: remove plotting
#     img = mpimg.imread(f'./input/images/0{str(data.article_id)[:2]}/0{int(data.article_id)}.jpg')
#     ax[i].imshow(img)
#     ax[i].set_title(f'price: {data.price:.4f}')
#     ax[i].set_xlabel(desc, fontsize=10)
#     ax[i].set_xticks([], [])
#     ax[i].set_yticks([], [])
#     ax[i].grid(False)
    # The values the disabled plotting code would have read are still accessed.
    _ = data.article_id
    _ = data.article_id
    _ = data.price
    i += 1
# ALEX: remove plotting
# plt.axis('off')
# plt.show()

## **<center><span style="color:#FEF1FE;background-color:#023e8a;border-radius: 5px;padding: 5px">Thanks for reading! If you find this notebook useful or interesting, please, support with an upvote :)</span></center>**