In [1]:
import kaggle
import pandas as pd
import boto3
from io import StringIO

In [2]:
def access_data(kaggle_path: str, storage_path: str, full_storage_path: str):
    """
    Downloads a dataset from Kaggle onto the local machine and loads one CSV from it

    Args:
        kaggle_path (str): identifier of the Kaggle dataset to download
        storage_path (str): directory the dataset is downloaded and unzipped into
        full_storage_path (str): path of the CSV file to read once the download finishes

    Returns:
        df (Pandas DataFrame): contents of the CSV at full_storage_path
    """
    kaggle_api = kaggle.api
    kaggle_api.authenticate()
    kaggle_api.dataset_download_files(kaggle_path, path=storage_path, unzip=True)

    # Read through an explicitly UTF-8 text handle rather than the platform default encoding.
    with open(full_storage_path, mode='r', encoding='utf-8') as csv_file:
        return pd.read_csv(csv_file)

In [3]:
# Read the CSV from the same relative directory the download unzips into, instead of a
# user-specific absolute path. Assumes the notebook is run from the project root (the
# directory that contains `data/`) -- TODO confirm for other run locations.
DATA_DIR = 'data'
df = access_data(
    "justinpakzad/vestiaire-fashion-dataset",
    DATA_DIR,
    f"{DATA_DIR}/vestiaire.csv",
)

Dataset URL: https://www.kaggle.com/datasets/justinpakzad/vestiaire-fashion-dataset


In [5]:
def write_to_s3(profile_name:str, bucketname:str, prefix:str, data:pd.DataFrame) -> None:
    """
    Serializes a DataFrame to CSV in memory and uploads it as one object in s3

    Args:
        profile_name (str): name of the aws profile used to create the boto3 session
        bucketname (str): name of bucket to push data to
        prefix (str): object key within the bucket (location plus file name)
        data (pd.DataFrame): dataset to be written to s3; its index is not written

    Returns:
        None
    """
    csv_buffer = StringIO()
    data.to_csv(csv_buffer, index=False)

    session = boto3.Session(profile_name=profile_name)
    s3_client = session.client('s3')
    # put_object documents Body as bytes or a file-like object, so encode the CSV
    # text explicitly instead of handing over a str.
    s3_client.put_object(
        Bucket=bucketname,
        Key=prefix,
        Body=csv_buffer.getvalue().encode('utf-8'),
    )
    

In [6]:
# Upload the downloaded dataset as a single CSV object using the 'default' aws profile.
write_to_s3(
    profile_name='default',
    bucketname="akfashiontrendsproject",
    prefix="data/vestaire_collective_data.csv",
    data=df,
)