# IMPORTS

In [1]:
import pandas as pd
import s3fs
import joblib
import pipeline_class as pc
import os

from sklearn.pipeline    import Pipeline

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


## Loading Data From Cloud

In [3]:
# AWS credentials come from the environment so they never appear in the notebook.
# A missing variable raises KeyError here.
key, secret = (
    os.environ[env_name]
    for env_name in ('MY_ACCESS_KEY_AWS', 'MY_SECRET_KEY_AWS')
)

In [4]:
# Load the dataset from S3.
fs = s3fs.S3FileSystem(key=key, secret=secret)
bucket_name = 'customer-insiders-dataset'  # Replace with the name of your bucket
# Kept separate from `key` (the AWS access key): reusing that name here would make
# a re-run of this cell pass the file name to S3FileSystem as the access key.
csv_key = 'ecommerce.csv'  # Replace with the path and name of your CSV file

# Read the CSV file from S3 into a DataFrame and drop the 'Unnamed: 8' column.
with fs.open(f'{bucket_name}/{csv_key}', 'rb') as csv_file:
    df_raw = pd.read_csv(csv_file, encoding='iso-8859-1').drop(columns='Unnamed: 8')

## Load Pipeline From Cloud

In [5]:
# Load the fitted pipeline from S3.
# NOTE(security): joblib.load is pickle-based and can execute arbitrary code, so
# only load artifacts from a bucket you control.
# NOTE(review): presumably the pickled steps reference classes from
# pipeline_class (imported as pc) — confirm that import is required for loading.
# The `with` block closes the S3 handle once the pipeline has been read.
with fs.open(f'{bucket_name}/pipe_final/pipe_final.joblib', 'rb') as pipe_file:
    pipe_aws = joblib.load(pipe_file)

## Apply Pipeline on Data from Cloud

In [None]:
# Run the pipeline loaded from S3 on a copy of the raw data, so that df_raw
# itself is not the object handed to the pipeline.
data = df_raw.copy(deep=True)
labels = pipe_aws.transform(data)

## Clean Data

In [None]:
# Working copy of the raw data for the cluster-profile analysis.
df_profile = df_raw.copy(deep=True)

In [None]:
# One single-step pipeline per preparation stage, each wrapping a transformer
# from pipeline_class (pc).
rename_pipe = Pipeline(steps=[('Rename Columns', pc.RenameColumns())])

data_cleaning_pipe = Pipeline(steps=[('Data Cleaning', pc.DataCleaning())])

feature_pipe = Pipeline(steps=[('Feature Engineering', pc.FeatureEngineering())])

In [None]:
# Clean the data: rename columns, clean, then engineer features, in that order.
# The chain restarts from a fresh copy of df_raw so this cell can be re-run
# safely; otherwise a second run would feed already-transformed data back into
# the rename stage.
df_profile = df_raw.copy()
for stage in (rename_pipe, data_cleaning_pipe, feature_pipe):
    df_profile = stage.fit_transform(df_profile)

## Applying Labels from Pipeline and Results

In [None]:
# Attach the labels returned by pipe_aws.transform to the profile frame as the
# 'clusters' column.
# NOTE(review): assumes `labels` has one entry per row of df_profile, in the same
# order — the two come from separate pipeline runs over df_raw; confirm.
df_profile['clusters'] = labels

In [None]:
# Cluster profile: build one small frame per metric, then join them on 'clusters'.

def _cluster_mean(column):
    """Return a frame holding 'clusters' and the per-cluster mean of `column`."""
    return (df_profile.loc[:, ['clusters', column]]
            .groupby('clusters')
            .mean()
            .reset_index())

# Per monetary
df_monetary = _cluster_mean('monetary')

# Per frequency
df_frequency = _cluster_mean('frequency')

# Per recency
df_recency = _cluster_mean('recency')

# Per product quantity
df_prods = _cluster_mean('qt_prods')

# Per returns
df_returns = _cluster_mean('returns_count')

# Per relationship days
df_duration = _cluster_mean('relationship_duration')

# Per customer: number of non-null customer_id entries in each cluster
df_customer = (df_profile.loc[:, ['clusters', 'customer_id']]
               .groupby('clusters')
               .count()
               .reset_index())

# Per percentage: each cluster's share of the counted entries, in percent.
# Computed with vectorised arithmetic so the total is summed once, not once per row.
df_client_perc = df_customer.rename(columns={'customer_id': 'percent'})
df_client_perc['percent'] = 100 * df_client_perc['percent'] / df_client_perc['percent'].sum()

# Join every metric onto the monetary frame; the column order follows the merge order.
df_resume = (df_monetary
             .merge(df_frequency, on='clusters', how='left')
             .merge(df_recency, on='clusters', how='left')
             .merge(df_client_perc, on='clusters', how='left')
             .merge(df_prods, on='clusters', how='left')
             .merge(df_returns, on='clusters', how='left')
             .merge(df_duration, on='clusters', how='left')
             .merge(df_customer, on='clusters', how='left')
             )

In [None]:
# Show the cluster summary, highest mean monetary value first.
df_resume.sort_values('monetary', ascending=False)