# Preprocessing Part 0 - Performing Feature Selection on articles.csv and Joining to transactions_train.csv

### Run on ml.t3.xlarge instance

In [1]:
%%capture
!pip install numpy
!pip install pandas
!pip install boto3
!pip install matplotlib
!pip install seaborn
!pip install awswrangler

In [2]:
import numpy as np
import pandas as pd
import boto3
import matplotlib.pyplot as plt
import seaborn as sns
import awswrangler as wr

# Render floats with four decimal places in every DataFrame display below.
pd.set_option('display.float_format', '{:.4f}'.format)

### Loading Article Data via S3

In [3]:
# Fetch articles.csv from the raw-data bucket and parse it into a DataFrame.
s3 = boto3.client('s3')

bucket = 'ads-508-group-6-raw'
data_key = 'articles.csv'

# The 'Body' entry of the get_object response is handed to read_csv as the CSV source.
articles_obj = s3.get_object(Bucket=bucket, Key=data_key)
articles_df = pd.read_csv(articles_obj['Body'])

### Loading Article Data Locally

In [4]:
#articles_df = pd.read_csv("..\\data\\articles.csv")

In [5]:
# Preview the first rows of the article table to see which columns were loaded.
articles_df.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric article columns.
articles_df.describe()

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_group_no,section_no,garment_group_no
count,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0
mean,698424569.0969,698424.5634,234.8619,1009515.0757,32.2338,3.2062,7.808,4532.7778,3.1715,42.6642,1010.4383
std,128462381.3362,128462.3844,75.0493,22413.5858,28.0862,1.5638,5.3767,2712.692,4.3532,23.2601,6.731
min,108775015.0,108775.0,-1.0,-1.0,-1.0,-1.0,-1.0,1201.0,1.0,2.0,1001.0
25%,616992501.0,616992.5,252.0,1010008.0,9.0,2.0,4.0,1676.0,1.0,20.0,1005.0
50%,702213001.5,702213.0,259.0,1010016.0,14.0,4.0,5.0,4222.0,2.0,46.0,1009.0
75%,796703001.75,796703.0,272.0,1010016.0,52.0,4.0,11.0,7389.0,4.0,61.0,1017.0
max,959461001.0,959461.0,762.0,1010029.0,93.0,7.0,20.0,9989.0,26.0,97.0,1025.0


### Dropping String Variables

In [7]:
# Descriptive text columns to remove from the article table; the numeric/code
# columns that remain are the features joined onto the transactions later.
string_columns = ['prod_name',
                  'product_type_name',
                  'product_group_name',
                  'graphical_appearance_name',
                  'colour_group_name',
                  'perceived_colour_value_name',
                  'perceived_colour_master_name',
                  'department_name',
                  'index_name',
                  'index_group_name',
                  'section_name',
                  'garment_group_name',
                  'detail_desc']

# Reassign instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
articles_df = articles_df.drop(columns=string_columns, errors='ignore')

### Loading Transaction Data via S3

In [8]:
# Fetch transactions_train.csv from the raw-data bucket and parse it into a DataFrame.
s3 = boto3.client('s3')

bucket = 'ads-508-group-6-raw'
data_key = 'transactions_train.csv'

# The 'Body' entry of the get_object response is handed to read_csv as the CSV source.
transactions_obj = s3.get_object(Bucket=bucket, Key=data_key)
transactions_df = pd.read_csv(transactions_obj['Body'])

### Loading Transaction Data Locally

In [9]:
#transactions_df = pd.read_csv("..\\data\\transactions_train.csv")

In [10]:
# Summary statistics (count, mean, std, quartiles) for the numeric transaction columns.
transactions_df.describe()

Unnamed: 0,article_id,price,sales_channel_id
count,31788324.0,31788324.0,31788324.0
mean,696227219.1338,0.0278,1.704
std,133448003.4873,0.0192,0.4565
min,108775015.0,0.0,1.0
25%,632803008.0,0.0158,1.0
50%,714582003.0,0.0254,2.0
75%,786524001.0,0.0339,2.0
max,956217002.0,0.5915,2.0


## Joining Selected Article Features onto Transaction Data

In [11]:
# Left-join the article features onto every transaction row, keyed on article_id.
#
# Only article columns that transactions_df does not already have are joined
# (plus the key itself). On a first run that is every article column; if the
# cell is re-run after the join has already happened, no columns are added
# again, so the frame does not grow duplicate "_x"/"_y" columns.
feature_cols = [col for col in articles_df.columns
                if col == 'article_id' or col not in transactions_df.columns]

# validate='many_to_one' makes pandas raise a MergeError if article_id is not
# unique in the article table, instead of silently multiplying transaction rows.
transactions_df = pd.merge(transactions_df,
                           articles_df[feature_cols],
                           how='left',
                           on=['article_id'],
                           validate='many_to_one')

In [12]:
# Preview the first rows of the joined table to confirm the article columns were added.
transactions_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.0508,2,663713,283,1010016,9,4,5,1338,B,1,61,1017
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.0305,2,541518,306,1010016,51,1,4,1334,B,1,61,1017
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.0152,2,505221,252,1010010,52,2,4,5963,D,2,58,1003
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.0169,2,685687,252,1010010,52,7,4,3090,A,1,15,1023
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.0169,2,685687,252,1010010,93,4,19,3090,A,1,15,1023


### Saving to S3

In [13]:
# Write the joined transactions to the processed-data bucket as a single CSV,
# without the DataFrame index. The call's return value is the cell output.
enriched_path = "s3://ads-508-group-6-processed/transactions_enriched.csv"
wr.s3.to_csv(df=transactions_df, path=enriched_path, index=False)

{'paths': ['s3://ads-508-group-6-processed/transactions_enriched.csv'],
 'partitions_values': {}}

### Saving Locally

In [14]:
#transactions_df.to_csv("..\\data\\transactions_enriched.csv", index = False)

### Shutting Down Kernel To Release Resources

In [15]:
%%html
<!--
  Shows a notice, renders a hidden button, and then clicks that button from script.
  NOTE(review): the data-commandlinker-command="kernelmenu:shutdown" attribute is
  presumably picked up by the JupyterLab / SageMaker Studio front-end to run its
  kernel-shutdown command; confirm against the front-end in use.
-->

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    // Look up every element with the button's class and click the first match.
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // Best effort: any error (e.g. no matching element) is deliberately ignored.
    // NoOp
}    
</script>

In [16]:
%%javascript

// Saves a checkpoint and then deletes the notebook's session through the
// `Jupyter.notebook` object.
// NOTE(review): `Jupyter` is presumably the classic Notebook front-end global;
// where it is not defined the resulting error is caught below and nothing happens.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // Best effort: failures are deliberately ignored.
    // NoOp
}

<IPython.core.display.Javascript object>