# Preprocessing transactions.csv Part 2

In [2]:
import numpy as np
import pandas as pd
import boto3
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime

pd.options.display.float_format = '{:.4f}'.format

### Loading Data via S3

In [None]:
bucket='ads-508-group-6-raw'
data_key = 'transactions.csv'

s3 = boto3.client('s3')
transactions_obj = s3.get_object(Bucket = bucket, Key = data_key)

transactions_df = pd.read_csv(transactions_obj['Body'])

### Loading Data Locally

In [3]:
transactions_df = pd.read_csv("C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\historical_transactions.csv")

In [4]:
transactions_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.0508,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.0305,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.0152,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.0169,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.0169,2


### Recasting Data Types

In [5]:
transactions_df['t_dat'] = pd.to_datetime(transactions_df['t_dat'], format='%Y-%m-%d')

In [6]:
transactions_df = transactions_df.astype({'article_id':'str',
                                          'sales_channel_id':'str'})

### Data Mining Historical Transaction Data

In [7]:
def max_min(x):
    return x.max() - x.min()

In [8]:
df=transactions_df.groupby('customer_id').agg({'t_dat':max_min, 
                                               'article_id':'count', 
                                               'price':'sum', 
                                               'sales_channel_id':lambda val: (val == '1').count()})

In [9]:
df['t_dat'] = pd.to_numeric(df['t_dat'].dt.days, downcast='integer')

In [10]:
df.rename(columns={"t_dat": "days_min_max", 
                   "article_id": "total_articles_purchased",
                   'price':"total_amount_spent",
                   "sales_channel_id":"sales_channel_1_count"})

Unnamed: 0_level_0,days_min_max,total_articles_purchased,total_amount_spent,sales_channel_1_count
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,450,20,0.5982,20
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,527,71,2.2378,71
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,501,7,0.2151,7
00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,0,2,0.0610,2
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,362,6,0.2139,6
...,...,...,...,...
ffffaff3905b803d1c7e153a1378a5151e1f34f236ba5451afc7cf3f699c5690,0,1,0.1220,1
ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e4747568cac33e8c541831,350,28,0.7521,28
ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab53481233731b5c4f8b7,519,58,1.2996,58
ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1778d0116cffd259264,489,35,0.6467,35


In [11]:
df.reset_index().to_csv("C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\historical_transaction_agg.csv", 
                    index = False)

### Data Mining Current Period Data

In [12]:
population_df = pd.read_csv("C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\transactions_population.csv")

In [13]:
population_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-01-01,0034b3dced3e565a43438bdfb5447e7321fea65388b398...,835247001,0.0339,2
1,2020-01-01,00410b91d62eefa76958fa5cac12f5daa7cfc0556e417d...,802930002,0.0678,2
2,2020-01-01,00410b91d62eefa76958fa5cac12f5daa7cfc0556e417d...,760084008,0.0254,2
3,2020-01-01,004b0fb384bcab2f8e1059dd5ca68c17580365ab95c05a...,804662002,0.0339,2
4,2020-01-01,004b0fb384bcab2f8e1059dd5ca68c17580365ab95c05a...,801554002,0.0169,2


In [14]:
population_df['t_dat'] = pd.to_datetime(population_df['t_dat'], format='%Y-%m-%d')

In [15]:
population_df = population_df.astype({'article_id':'str',
                                      'sales_channel_id':'str'})

In [16]:
df=population_df.groupby('customer_id').agg({'t_dat':max_min, 
                                             'article_id':'count', 
                                             'price':'sum', 
                                             'sales_channel_id':lambda val: (val == '1').count()})

In [17]:
df['t_dat'] = pd.to_numeric(df['t_dat'].dt.days, downcast='integer')

In [18]:
df.rename(columns={"t_dat": "days_min_max", 
                   "article_id": "total_articles_purchased",
                   'price':"total_amount_spent",
                   "sales_channel_id":"sales_channel_1_count"})

Unnamed: 0_level_0,days_min_max,total_articles_purchased,total_amount_spent,sales_channel_1_count
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0,5,0.0936,5
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,40,19,0.4726,19
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0,3,0.0694,3
00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77334eaec4ffccd7ebcc,0,2,0.0534,2
00009d946eec3ea54add5ba56d5210ea898def4b46c68570cf0096d962cacc75,71,21,0.6582,21
...,...,...,...,...
ffff4c4e8b57b633c1ddf8fbd53db16b962cf831baf9ed67c6a53d86e167a35b,0,3,0.0677,3
ffff61677073258d461e043cc9ed4ed97be5617a920640ff61024f4619bf41c4,21,8,0.1751,8
ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e4747568cac33e8c541831,0,2,0.0712,2
ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab53481233731b5c4f8b7,41,18,0.4566,18


In [19]:
df.reset_index().to_csv("C:\\Users\\garyb\\OneDrive\\Documents\\GitHub\\recommendation-engine\\data\\population_transaction_agg.csv", 
                    index = False)