# eCommerce project

In [1]:
import pandas as pd
import numpy as np
from datetime import date, datetime, timedelta
import warnings
warnings.filterwarnings("ignore")

#### Reading user data

In [2]:
customers = pd.read_csv('olist_customers_dataset.csv')

Studying the dataset

In [3]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


Let's find out what is the difference between customer_id and customer_unique_id

In [3]:
customers.customer_id.nunique()

99441

In [4]:
 customers.customer_unique_id.nunique() 

96096

In [5]:
customers.groupby('customer_unique_id', as_index=False)\
        .agg({'customer_id':'count'}).query('customer_id >1')\
        .sort_values('customer_id', ascending=False)

Unnamed: 0,customer_unique_id,customer_id
52973,8d50f5eadf50201ccdcedfb9e2ac8455,17
23472,3e43e6105506432c953e165fb2acf44c,9
10354,1b6c7548a2a1f9037c1fd3ddfed95f33,7
37797,6469f99c1f9dfae7733b25662e7f1782,7
76082,ca77025e7201e3b30c44b472ff346268,7
...,...,...
33730,597e54e653cbc1ce5df1e6d97bbc448c,2
33755,598ef1f13aba15f2b88387184f6c25ec,2
33829,59bba92de9b8f6be0808e4d9d64b4b43,2
33867,59d66d72939bc9497e19d89c61a96d5f,2


The user's unique id is customer unique_id and customer_id is the order id. 

#### Reading and studying order data

In [6]:
order = pd.read_csv('olist_orders_dataset.csv', 
                    parse_dates=['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',
                               'order_delivered_customer_date', 'order_estimated_delivery_date'])

In [7]:
order.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB


In [8]:
order.isna().sum()

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

#### Reading and studying goods data

In [9]:
order_items = pd.read_csv('olist_order_items_dataset.csv', parse_dates=['shipping_limit_date'])

In [10]:
order_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


## Tasks

### 1. How many users do we have who made a purchase only once?

In [11]:
purh = order.query('order_status == ["delivered", "processing", "shipped"]')

In [12]:
purh.merge(customers, how='left', on='customer_id')\
    .groupby('customer_unique_id', as_index=False)\
    .agg({'order_id':'count'})\
    .query('order_id == 1')\
    .shape[0]

91814

### 2. How many orders per month are not delivered in average for various reasons (display details by reason)?

In [13]:
order['Year_month'] = order['order_purchase_timestamp'].dt.to_period('M')

In [14]:
order.query("order_status == ['unavailable', 'canceled']")\
    .groupby(['Year_month','order_status'], as_index=False)\
    .order_id.count()\
    .groupby('order_status', as_index=False)\
    .order_id.mean()

Unnamed: 0,order_status,order_id
0,canceled,26.041667
1,unavailable,29.0


### 3. For each product, determine on what day of the week the product is most often purchased.

In [15]:
order['day_name'] = order['order_purchase_timestamp'].dt.day_name()

**1 version**

In [16]:
%%time
order.merge(order_items, how='left', on='order_id')\
    .groupby(['product_id', 'day_name'], as_index=False)\
    .order_id.count()\
    .set_index('day_name')\
    .groupby('product_id', as_index=False)\
    .order_id.idxmax()

Wall time: 15.3 s


Unnamed: 0,product_id,order_id
0,00066f42aeeb9f3007548bb9d3f33c38,Sunday
1,00088930e925c41fd95ebfe695fd2655,Tuesday
2,0009406fd7479715e4bef61dd91f2462,Thursday
3,000b8f95fcb9e0096488278317764d19,Friday
4,000d9be29b5207b54e86aa1b1ac54872,Tuesday
...,...,...
32946,fff6177642830a9a94a0f2cba5e476d1,Saturday
32947,fff81cc3158d2725c0655ab9ba0f712c,Monday
32948,fff9553ac224cec9d15d49f5a263411f,Friday
32949,fffdb2d0ec8d6a61f0a0a0db3f25b441,Tuesday


**2 version**

In [17]:
%%time
order.merge(order_items, how='left', on='order_id')\
    .groupby(['product_id', 'day_name'], as_index=False)\
    .order_id.count()\
    .sort_values(['product_id', 'order_id'], ascending=False)\
    .groupby(['product_id'])\
    .head(1).reset_index(drop=True)

Wall time: 230 ms


Unnamed: 0,product_id,day_name,order_id
0,fffe9eeff12fcbd74a2f2b007dde0c58,Wednesday,1
1,fffdb2d0ec8d6a61f0a0a0db3f25b441,Tuesday,2
2,fff9553ac224cec9d15d49f5a263411f,Friday,1
3,fff81cc3158d2725c0655ab9ba0f712c,Monday,1
4,fff6177642830a9a94a0f2cba5e476d1,Saturday,1
...,...,...,...
32946,000d9be29b5207b54e86aa1b1ac54872,Tuesday,1
32947,000b8f95fcb9e0096488278317764d19,Friday,1
32948,0009406fd7479715e4bef61dd91f2462,Thursday,1
32949,00088930e925c41fd95ebfe695fd2655,Tuesday,1


### 4. How many purchases does each user have per week (by months) on average?  
There may not be an integer number of weeks within a month. For example, November 2021 has 4.28 weeks.

In [18]:
cust_purh = purh.merge(customers, how='left', on='customer_id')

In [19]:
cust_purh['num_of_weeks'] = cust_purh.order_purchase_timestamp.dt.daysinmonth/7

In [20]:
cust_purh['Year_month'] = cust_purh.order_purchase_timestamp.dt.to_period('M')

In [21]:
cust_purh.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,num_of_weeks,Year_month
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,4.428571,2017-10
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,af07308b275d755c9edb36a90c618231,47813,barreiras,BA,4.428571,2018-07
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO,4.428571,2018-08
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,7c142cf63193a1473d2e66489a9ae977,59296,sao goncalo do amarante,RN,4.285714,2017-11
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,72632f0f9dd73dfee390c9b22eb56dd6,9195,santo andre,SP,4.0,2018-02


In [22]:
cust_purh_in_week = cust_purh.groupby(['customer_unique_id', 'Year_month', 'num_of_weeks'], as_index = False).order_id.count()

In [23]:
cust_purh_in_week['purh_in_week']=cust_purh_in_week.order_id/cust_purh_in_week.num_of_weeks

In [24]:
cust_purh_in_week.sort_values('purh_in_week', ascending=False)

Unnamed: 0,customer_unique_id,Year_month,num_of_weeks,order_id,purh_in_week
7214,12f5d6e1cbf93dafd9dcc19095df0b3d,2017-01,4.428571,6,1.354839
68340,b4e4f24de1e8725b74e4a1f4975116ed,2018-02,4.000000,4,1.000000
23576,3e43e6105506432c953e165fb2acf44c,2018-02,4.000000,4,1.000000
61252,a239b8e2fbce33780f1f1912e2ee5275,2017-02,4.000000,4,1.000000
14263,25a560b9a6006157838aab1bdbd68624,2017-04,4.285714,4,0.933333
...,...,...,...,...,...
40372,6ac4c2b8476a9b14dc824e2cda804dfc,2018-08,4.428571,1,0.225806
40373,6ac52814155aa5b8da64bb30a14c6b3b,2018-05,4.428571,1,0.225806
40374,6ac5a10275f1f5885f87a85356c980b9,2018-03,4.428571,1,0.225806
40376,6ac6902b78ed854e42f75c8e25234e8c,2018-08,4.428571,1,0.225806


### 5. Write a python function that allows you to build a cohort analysis. Between January and December, identify the cohort with the highest retention for the 3rd month.

**Let's prepare the data - we need 3 columns - order_id, order_purchase_timestamp, customer_unique_id for completed purchases.**

In [25]:
cust_purh_coh = purh.merge(customers, how='left', on='customer_id')[['order_id', 'order_purchase_timestamp', 'customer_unique_id']]

In [26]:
cust_purh_coh.head()

Unnamed: 0,order_id,order_purchase_timestamp,customer_unique_id
0,e481f51cbdc54678b7cc49136f2d6af7,2017-10-02 10:56:33,7c396fd4830fd04220f754e42b4e5bff
1,53cdb2fc8bc7dce0b6741e2150273451,2018-07-24 20:41:37,af07308b275d755c9edb36a90c618231
2,47770eb9100c2d0c44946d9cf07ec65d,2018-08-08 08:38:49,3a653a41f6f9fc3d2a113cf8398680e8
3,949d5b44dbf5de918fe9c16f97b45f8a,2017-11-18 19:28:06,7c142cf63193a1473d2e66489a9ae977
4,ad21c59c0840e6cb83a9ceb5573f8159,2018-02-13 21:18:39,72632f0f9dd73dfee390c9b22eb56dd6


Calculation method:
For each user, we define its **CohortGroup** as the minimum (first) month of activity - the name of the cohort.
From each user activity record, we subtract **CohortGroup** and get an indicator of the number of months that have passed since the month of the first purchase **amount_of_month**.
Grouping by **CohortGroup** and **amount_of_month**, we calculate the size of the cohort (the number of unique users) for the nth month. The resulting table is turned over into a pivot table.
Convert from absolute values to relative values (relative to 0 month).

**First, we will write a function to create df on the basis of which we can build a cohort analysis with retention for all cohorts, then to identify the cohort with the highest retention for n month.**

In [27]:
def _cohort_analysis(df, order_purchase_timestamp, customer_unique_id, start="1900-01-01",end="2025-12-31"):
    
    df = df.loc[:,[order_purchase_timestamp, customer_unique_id]]
    
    start = datetime.strptime(start, '%Y-%m-%d')
    end = datetime.strptime(end, '%Y-%m-%d')
    
    # Filter by date
  
    df = df.query('order_purchase_timestamp >= @start and order_purchase_timestamp <= @end')
    
    # Formation of cohorts on the first order
    
    df['CohortGroup'] = df.groupby('customer_unique_id').order_purchase_timestamp \
                                                .transform('min')\
                                                .dt.to_period('M')
    df['purchase_month'] = df['order_purchase_timestamp'].dt.to_period('M') 
    
    # Subtract CohortGroup from each user activity record and get the number of months,
    #elapsed since the first purchase month amount_of_month
    
    df['amount_of_month'] = ((df.order_purchase_timestamp.dt.year - df.CohortGroup.dt.year)*12
                                    + df.order_purchase_timestamp.dt.month - df.CohortGroup.dt.month)
    
    # Getting the size of each cohort
   
    df['Size_of_coh'] = df.groupby('CohortGroup', as_index=False)\
                                .customer_unique_id.transform('nunique')
    
    # We count unique users by cohorts and months of purchases
    
    Cohort_an = df.groupby(['CohortGroup','amount_of_month', 'Size_of_coh'], as_index=False)\
                                    .customer_unique_id.nunique()
    # Calculating retention
    
    Cohort_an['ret_ratio_%'] = round(Cohort_an.customer_unique_id / Cohort_an.Size_of_coh *100,2)
    
    return Cohort_an

In [28]:
def Cohort_matrix(df, order_purchase_timestamp, customer_unique_id, start="1900-01-01",end="2025-12-31"):
    """
    for start use the "first date" of report period
    for end use the "last date" of report period
    """
    Cohort_an = _cohort_analysis(df, order_purchase_timestamp, customer_unique_id, start="1900-01-01",end="2025-12-31")
    
    Matrix = Cohort_an.pivot(index = ['CohortGroup', 'Size_of_coh'],
                                     columns = 'amount_of_month',
                                     values = 'ret_ratio_%')
    Matrix = Matrix.style\
                   .set_caption('Monthly cohorts: user retention')\
                   .background_gradient(axis=0, low=0, high=1.0)\
                   .highlight_null('white')\
                   .format("{:}", na_rep="")
     
    return Matrix

In [29]:
def Cohort_max_n_month(df, order_purchase_timestamp, customer_unique_id, n, start="1900-01-01",end="2025-12-31"):
    """
    for start use the "first date" of report period
    for end use the "last date" of report period
    for n use the number of month of retention which you are interested in
    """
    Cohort_an = _cohort_analysis(df, order_purchase_timestamp, customer_unique_id, start="1900-01-01",end="2025-12-31")
    
    name_of_cohort = Cohort_an.query('amount_of_month == @n').set_index('CohortGroup')['ret_ratio_%'].idxmax()
    
    return str(name_of_cohort)

In [30]:
_cohort_analysis(cust_purh_coh, 'customer_unique_id', 'order_purchase_timestamp', '2017-01-01', '2018-01-01')

Unnamed: 0,CohortGroup,amount_of_month,Size_of_coh,customer_unique_id,ret_ratio_%
0,2017-01,0,741,741,100.00
1,2017-01,1,741,3,0.40
2,2017-01,2,741,2,0.27
3,2017-01,3,741,1,0.13
4,2017-01,4,741,3,0.40
...,...,...,...,...,...
72,2017-10,1,4378,31,0.71
73,2017-10,2,4378,11,0.25
74,2017-11,0,7156,7156,100.00
75,2017-11,1,7156,40,0.56


In [31]:
Cohort_matrix(cust_purh_coh, 'customer_unique_id', 'order_purchase_timestamp', '2017-01-01', '2018-01-01')

Unnamed: 0_level_0,amount_of_month,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20
CohortGroup,Size_of_coh,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-09,2,100.0,,,,,,,,,,,,,,,,,,,
2016-10,272,100.0,,,,,,0.37,,,0.37,,0.37,,0.37,,0.37,,0.37,0.74,0.74
2016-12,1,100.0,100.0,,,,,,,,,,,,,,,,,,
2017-01,740,100.0,0.41,0.27,0.14,0.41,0.14,0.41,0.14,0.14,,0.41,0.14,0.68,0.41,0.14,0.14,0.27,0.41,0.14,
2017-02,1678,100.0,0.24,0.3,0.12,0.42,0.12,0.24,0.18,0.12,0.18,0.12,0.3,0.12,0.18,0.12,0.06,0.06,0.24,,
2017-03,2568,100.0,0.51,0.35,0.39,0.35,0.16,0.16,0.31,0.35,0.08,0.35,0.12,0.23,0.12,0.16,0.23,0.08,0.16,,
2017-04,2310,100.0,0.61,0.22,0.17,0.3,0.26,0.35,0.3,0.3,0.17,0.26,0.09,0.04,0.04,0.09,0.09,0.13,,,
2017-05,3525,100.0,0.48,0.48,0.4,0.31,0.34,0.43,0.14,0.26,0.26,0.26,0.34,0.26,0.03,0.2,0.23,,,,
2017-06,3094,100.0,0.45,0.36,0.39,0.26,0.36,0.36,0.23,0.13,0.23,0.32,0.32,0.16,0.13,0.19,,,,,
2017-07,3815,100.0,0.52,0.34,0.24,0.29,0.21,0.31,0.1,0.18,0.26,0.21,0.29,0.13,0.26,,,,,,


In [32]:
Cohort_max_n_month(cust_purh_coh, 'customer_unique_id', 'order_purchase_timestamp', 3,'2017-01-01', '2018-01-01')

'2017-05'

###  6. Create RFM clusters for users. Output for each cluster the average values of the metrics R, F, M

**In this task let's use [RFMizer](https://github.com/Slony/rfmizer)**  
RFMizer is a Python script that takes a complete log of users' orders exported from CRM system and outputs user ID to RFMxyz segments mapping and RFMxyz segments to bid multipliers mapping.

**Let's prepare data**

Each line of file consists of three mandatory fields and arbitrary number of optional fields.  
Mandatory fields are:  

order_date — date of an order made by the user  
user_id — an internal ID of a user in advertiser's CRM system  
order_value — monetary value of the order  

In [33]:
orders_csv = order.merge(order_items, how='left', on='order_id')\
                        .merge(customers, how='left', on='customer_id')\
                        [['order_purchase_timestamp', 'customer_unique_id', 'order_id', 'price' ]]

In [34]:
orders_csv = orders_csv.groupby(['order_purchase_timestamp','customer_unique_id', 'order_id'], as_index=False)\
            .price.sum()\
            .rename(columns={'order_purchase_timestamp':'order_date', 'customer_unique_id':'user_id', 'price':'order_value'})

In [35]:
orders_csv.drop(columns='order_id', axis=1, inplace=True)

In [36]:
orders_csv.to_csv('orders.csv',sep=',',index=False,encoding='utf-8')

In [37]:
orders_csv.head()

Unnamed: 0,order_date,user_id,order_value
0,2016-09-04 21:15:19,b7d76e111c89f7ebf14761390f0f7d17,72.89
1,2016-09-05 00:15:34,4854e9b3feff728c13ee5fc7d1547e92,59.5
2,2016-09-13 15:24:19,009b0127b727ab0ba422f6d9604487c7,0.0
3,2016-09-15 12:16:38,830d5b7aaa3b6f1e9ad63703bec97d23,134.97
4,2016-10-02 22:07:52,0eb1ee9dba87f5b36b4613a65074337c,100.0


To calculate using RFMizer, you need to know the calculation period look_back_period - the number of days ago from the date of the last order
The size of the period for which the calculation of the value of each segment of buyers is most likely to be selected empirically. Let's take as suggested in the documentation prediction_period = 182

In [38]:
look_back_period = orders_csv.order_date.max()-orders_csv.order_date.min() - pd.to_timedelta(182, unit='D')
look_back_period

Timedelta('590 days 20:14:59')

As a result, we get a csv file that allows us to join the results of the RFM analysis and the user's personal data.

In [39]:
mapping = pd.read_csv('RFM_3-3-3-591-182_mapping.csv')
mapping.tail()

Unnamed: 0,user_id,frequency,monetary,recency
96091,0a5dadb73145d29f42f703c97c377ae8,1,1,3
96092,09687a7b7431a93b5c53b50ba779bf94,1,1,3
96093,c1ee153508c6b785b491443a95ff364e,1,1,3
96094,9bb92bebd4cb7511e1a02d5e50bc4655,1,1,3
96095,87ab9fec999db8bd5774917de3cdf01c,1,1,3


In [40]:
orders_csv['order_value'] = orders_csv.order_value.replace(0,np.NAN)

In [41]:
orders_result = orders_csv.groupby('user_id', as_index=False)\
.agg({'order_date':'max', 'order_date':'count', 'order_value':'mean'})\
.rename(columns={'order_date':'num_of_orders', 'order_value':'mean_order_value'})

In [42]:
orders_result = orders_csv.groupby('user_id', as_index=False)\
.agg({'order_date':['max','count'] , 'order_value':'mean'})

In [43]:
orders_result.columns = ['_'.join(col).rstrip('_') for col in orders_result.columns.values]

In [44]:
orders_result['recency_mean'] = (orders_csv.order_date.max() - orders_result.order_date_max).dt.days

In [45]:
orders_result = orders_result.drop(columns=['order_date_max'], axis=1)\
.rename(columns={'order_date_count':'frequency_mean', 'order_value_mean':'monetary_mean'})

In [46]:
orders_result = orders_result.merge(mapping, on='user_id')

As a result of the transformations, we received a dataset with marked segments and absolute values of indicators for each client.  
Now we can calculate the average values for each indicator segment.

In [47]:
orders_result.head()

Unnamed: 0,user_id,frequency_mean,monetary_mean,recency_mean,frequency,monetary,recency
0,0000366f3b9a7992bf8c76cfdf3221e2,1,129.9,160,1,3,3
1,0000b849f77a49e4a4ce2b2a4ca5be3f,1,18.9,163,1,1,3
2,0000f46a3911fa3c0805444483337064,1,69.0,585,1,2,1
3,0000f6ccb0745a6a4b88665a16c9f078,1,25.99,369,1,1,1
4,0004aac84e0df4da2b147fca70cf8255,1,180.0,336,1,3,1


The zero segment is assigned to users with a purchase date outside the considered period.

In [48]:
orders_result['RFM_cluster'] = (orders_result.frequency.map(str) 
                                + orders_result.monetary.map(str) 
                                + orders_result.recency.map(str))

In [49]:
orders_result.groupby('RFM_cluster',as_index=False)\
            .agg({'frequency_mean':'mean', 'monetary_mean': 'mean', 'recency_mean': 'mean'})\
            .round(2)

Unnamed: 0,RFM_cluster,frequency_mean,monetary_mean,recency_mean
0,0,1.03,150.23,627.22
1,111,1.01,34.6,445.17
2,112,1.0,34.86,262.24
3,113,1.0,34.63,120.4
4,121,1.01,85.98,443.98
5,122,1.01,88.26,263.39
6,123,1.0,89.11,122.45
7,131,1.02,303.95,442.51
8,132,1.02,275.21,261.9
9,133,1.01,299.73,123.14


Obviously the most valuable users are in the cluster 333 