# Scaling down for the Streamlit App

So far, we have been working with over 100k items. However, we need to scale down in order to upload the data to GitHub and deploy it as a Streamlit app.

We will randomly sample around 20k items for our recommender app:

In [1]:
import sqlite3
import pandas as pd
import numpy as np

Now that we have picked the articles randomly, we need to pick images for these items:

In [2]:
# Query selecting the id, name and description columns of every row in the
# `name_desc_id` table.
q = """
SELECT article_id,
prod_name, detail_desc
FROM name_desc_id;
"""

conn = sqlite3.connect('../4. Data/mini_product_card.db')

# Read the table into a pandas DataFrame. The connection is closed explicitly
# as soon as the data is in memory: sqlite3 connections are not closed
# automatically, and `conn` is rebound to another database further down.
try:
    df1 = pd.read_sql_query(q, conn)
finally:
    conn.close()

df1

Unnamed: 0,article_id,prod_name,detail_desc
0,693575010,Ellen shorts,"Shorts in stretch twill with a zip fly, front ..."
1,611584001,Piper top,Short top in soft jersey with narrow shoulder ...
2,510407031,MAXI,3/4-length leggings in soft organic cotton jer...
3,623698001,Inkery dress,"V-neck dress in ribbed jersey with narrow, adj..."
4,661888002,Milly onesie BG,All-in-one pyjamas in patterned stretch cotton...
...,...,...,...
20934,735749016,SALT regular tee,"Straight-cut, relaxed-fit T-shirt in soft pima..."
20935,705428001,Cool Stream earring,Long metal earrings decorated with sparkly sto...
20936,485726001,TP Adam 2-p long john BB,Two pairs of longjohns in soft cotton jersey w...
20937,579121002,LITTLE COLIN 2- pack tee,"T-shirts in soft, printed cotton jersey."


In [3]:
# Query selecting the purchase counts plus the descriptive product columns of
# every row in the `popular` table.
p = """
SELECT article_id,
       total_purchases,
       day_type,
       prod_name,
       product_type_name,
       product_group_name,
       department_name,
       section_name,
       garment_group_name,
       index_group_name
  FROM popular;
"""

conn = sqlite3.connect('../4. Data/popularity_V2.db')

# Read the table from the popularity database into a pandas DataFrame and
# close the connection once the data is in memory, so the file handle is not
# left open for the rest of the session.
try:
    df2 = pd.read_sql_query(p, conn)
finally:
    conn.close()

df2

Unnamed: 0,article_id,total_purchases,day_type,prod_name,product_type_name,product_group_name,department_name,section_name,garment_group_name,index_group_name
0,211143037,366.0,weekday,Janet SL-set (W),Pyjama set,Nightwear,Nightwear,"Womens Nightwear, Socks & Tigh","Under-, Nightwear",Ladieswear
1,255396006,77.0,weekday,Nora Cardigan,Cardigan,Garment Upper body,Jersey Fancy DS,Divided Selected,Jersey Fancy,Divided
2,358483006,46.0,weekday,Stressan light knit jumper,Sweater,Garment Upper body,Tops Knitwear DS,Divided Selected,Knitwear,Divided
3,211143022,33.0,weekday,Janet SL (W),Pyjama set,Nightwear,Nightwear,"Womens Nightwear, Socks & Tigh","Under-, Nightwear",Ladieswear
4,297067002,2167.0,weekday,Small dot 1p Tights,Leggings/Tights,Garment Lower body,Tights basic,"Womens Nightwear, Socks & Tigh",Socks and Tights,Ladieswear
...,...,...,...,...,...,...,...,...,...,...
36740,910949002,25.0,weekend,sidney bonded highsupport bra,Bra,Underwear,Ladies Sport Bras,Ladies H&M Sport,Jersey Fancy,Sport
36741,850795002,3.0,weekend,Sun skirt,Skirt,Garment Lower body,Kids Girl Dresses,Kids Girl,Dresses/Skirts girls,Baby/Children
36742,910949002,46.0,weekday,sidney bonded highsupport bra,Bra,Underwear,Ladies Sport Bras,Ladies H&M Sport,Jersey Fancy,Sport
36743,850795002,4.0,weekday,Sun skirt,Skirt,Garment Lower body,Kids Girl Dresses,Kids Girl,Dresses/Skirts girls,Baby/Children


In [4]:
# Inner-join the two frames on their shared 'article_id' key; only ids present
# in both df1 and df2 are kept.
merged_df = df1.merge(df2, how='inner', on='article_id')


In [5]:
# Show the merged frame as the cell's output.
merged_df

Unnamed: 0,article_id,prod_name_x,detail_desc,total_purchases,day_type,prod_name_y,product_type_name,product_group_name,department_name,section_name,garment_group_name,index_group_name
0,693575010,Ellen shorts,"Shorts in stretch twill with a zip fly, front ...",44.0,weekday,Ellen shorts,Shorts,Garment Lower body,Young Girl Trouser,Young Girl,Trousers,Baby/Children
1,693575010,Ellen shorts,"Shorts in stretch twill with a zip fly, front ...",23.0,weekend,Ellen shorts,Shorts,Garment Lower body,Young Girl Trouser,Young Girl,Trousers,Baby/Children
2,611584001,Piper top,Short top in soft jersey with narrow shoulder ...,114.0,weekday,Piper top,Vest top,Garment Upper body,Projects Jersey & Knitwear,Divided Projects,Jersey Fancy,Divided
3,611584001,Piper top,Short top in soft jersey with narrow shoulder ...,74.0,weekend,Piper top,Vest top,Garment Upper body,Projects Jersey & Knitwear,Divided Projects,Jersey Fancy,Divided
4,510407031,MAXI,3/4-length leggings in soft organic cotton jer...,2.0,weekend,MAXI,Leggings/Tights,Garment Lower body,Kids Girl Jersey Basic,Girls Underwear & Basics,Jersey Basic,Baby/Children
...,...,...,...,...,...,...,...,...,...,...,...,...
36740,485726001,TP Adam 2-p long john BB,Two pairs of longjohns in soft cotton jersey w...,23.0,weekend,TP Adam 2-p long john BB,Long John,Underwear,Young Boy UW/NW,Boys Underwear & Basics,"Under-, Nightwear",Baby/Children
36741,579121002,LITTLE COLIN 2- pack tee,"T-shirts in soft, printed cotton jersey.",23.0,weekday,LITTLE COLIN 2- pack tee,T-shirt,Garment Upper body,Kids Boy Jersey Fancy,Kids Boy,Jersey Fancy,Baby/Children
36742,579121002,LITTLE COLIN 2- pack tee,"T-shirts in soft, printed cotton jersey.",12.0,weekend,LITTLE COLIN 2- pack tee,T-shirt,Garment Upper body,Kids Boy Jersey Fancy,Kids Boy,Jersey Fancy,Baby/Children
36743,865568006,Tiblisi paperpag,"Trousers in an airy viscose weave with a high,...",6.0,weekday,Tiblisi paperpag,Trousers,Garment Lower body,Young Girl Trouser,Young Girl,Trousers,Baby/Children


In [7]:
# Number of distinct article ids present in the merged frame.
merged_df['article_id'].nunique()


19468

In [8]:
# Count the rows of merged_df whose article_id also appears in df1.
# NOTE(review): if merged_df is still the inner join of df1 and df2 on
# 'article_id', every row satisfies this by construction - confirm whether a
# different check was intended.
merged_df['article_id'].isin(df1['article_id']).sum()

36745

In [9]:
# Stack the article_id columns of both frames and keep each id once, in order
# of first appearance (this is the union of the ids in df1 and df2).
all_article_ids = pd.concat([df1['article_id'], df2['article_id']])
unique_ids = pd.unique(all_article_ids)

In [10]:
# Show the array of de-duplicated article ids as the cell's output.
unique_ids

array([693575010, 611584001, 510407031, ..., 485726001, 579121002,
       865568006], dtype=int64)

In [13]:
import os
import shutil

# Input directory holding the processed images, and output directory that
# receives only the images belonging to the selected articles.
# NOTE(review): these are absolute, machine-specific paths - adjust them
# before running this cell on another machine.
input_dir = r"C:\Users\alvee\OneDrive\Documents\DS Project\Outbrain Click Prediction\Processed_Images"
output_dir =r"C:\Users\alvee\OneDrive\Documents\DS Project\Outbrain Click Prediction\Processed_Images_2"

# shutil.copy does not create missing directories, so make sure the
# destination exists before copying anything into it.
os.makedirs(output_dir, exist_ok=True)

# List the input directory ONCE. A set gives constant-time membership checks,
# instead of re-listing and scanning the whole directory for every article id.
available_files = set(os.listdir(input_dir))

# Article ids for which no image file was found.
missing_ids = []

for i in unique_ids:
    # Image files are looked up as '0' + <article_id> + '.jpg'
    # (presumably the ids lost a leading zero when stored as integers - TODO confirm).
    input_file_name = '0' + str(i) + '.jpg'
    if input_file_name not in available_files:
        missing_ids.append(i)
        continue
    # construct the full path of the input and output files
    input_file_path = os.path.join(input_dir, input_file_name)
    output_file_path = os.path.join(output_dir, input_file_name)
    # copy the file from input directory to output directory
    shutil.copy(input_file_path, output_file_path)

# Report the outcome so that ids without an image do not go unnoticed.
print(f"Copied {len(unique_ids) - len(missing_ids)} images; "
      f"{len(missing_ids)} article ids had no matching image.")
