# IMPORTS

In [1]:
import pandas as pd
import s3fs
import joblib
import pipeline_class as pc
import os

from sklearn.pipeline    import Pipeline

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


## Loading Data From Cloud

In [2]:
# AWS credentials are read from environment variables so they are not stored in the notebook.
# A missing variable raises KeyError on the corresponding line.
key = os.environ['MY_ACCESS_KEY_AWS']
secret = os.environ['MY_SECRET_KEY_AWS']

In [3]:
# Load the raw e-commerce dataset from S3.
fs = s3fs.S3FileSystem(key=key, secret=secret)
bucket_name = 'customer-insiders-dataset'  # S3 bucket that holds the project files
# The object path gets its own name: rebinding `key` here would overwrite the AWS
# access key, and re-running this cell would then authenticate with the file name.
dataset_key = 'ecommerce.csv'  # path of the CSV file inside the bucket

# Read the CSV from S3 into a DataFrame, then drop the 'Unnamed: 8' column.
with fs.open(f'{bucket_name}/{dataset_key}', 'rb') as file:
    df_raw = pd.read_csv(file, encoding='iso-8859-1')
df_raw = df_raw.drop('Unnamed: 8', axis=1)

## Load Pipeline From Cloud

In [4]:
# Load the serialized pipeline from S3.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a bucket you control.
# The context manager closes the S3 file handle once the pipeline is loaded.
with fs.open(f'{bucket_name}/pipe_final/pipe_final.joblib', 'rb') as file:
    pipe_aws = joblib.load(file)

## Apply Pipeline on Data from Cloud

In [5]:
# Run the pipeline loaded from AWS on a copy of the raw data, so the pipeline
# receives its own frame rather than df_raw itself.
data = df_raw.copy()
# The result is later stored as the 'clusters' column of df_profile.
labels = pipe_aws.transform(data)

## Clean Data

In [6]:
# Start the profiling frame from an independent copy of the raw data.
df_profile = df_raw.copy()

In [7]:
# Three single-step pipelines, one per preparation stage, each wrapping a
# transformer from the project's pipeline_class module.
rename_pipe = Pipeline(steps=[('Rename Columns', pc.RenameColumns())])
data_cleaning_pipe = Pipeline(steps=[('Data Cleaning', pc.DataCleaning())])
feature_pipe = Pipeline(steps=[('Feature Engineering', pc.FeatureEngineering())])

In [8]:
# Run the three preparation stages in order: rename columns, clean, build features.
# df_profile is rebound at every step, so each stage consumes the previous stage's output.
df_profile = rename_pipe.fit_transform(df_profile)
df_profile = data_cleaning_pipe.fit_transform(df_profile)
df_profile = feature_pipe.fit_transform(df_profile)

## Applying Labels from Pipeline and Results

In [9]:
# Store the labels produced by the AWS pipeline as the 'clusters' column.
# NOTE(review): assumes labels line up row-for-row with df_profile — TODO confirm.
df_profile['clusters'] = labels

In [10]:
# Segment name for each numeric cluster label.
name_clusters = {
    4: 'Insiders',
    5: 'Potentials',
    1: 'At Risk',
    2: 'Sleeping',
    3: 'Occasional',
}

# Replace the numeric labels with their segment names.
# Series.map yields NaN for any label that is not a key of the mapping.
df_profile['clusters'] = df_profile['clusters'].map(name_clusters)

In [11]:
# Build a one-row-per-cluster summary (df_resume) from df_profile.
# A single groupby object is shared by every metric below instead of re-slicing
# and re-grouping df_profile for each one.
by_cluster = df_profile.groupby('clusters')

# Mean monetary value per cluster
df_monetary = by_cluster[['monetary']].mean().reset_index()

# Mean frequency per cluster
df_frequency = by_cluster[['frequency']].mean().reset_index()

# Mean recency per cluster
df_recency = by_cluster[['recency']].mean().reset_index()

# Percentage of customers in each cluster
df_client_perc = by_cluster[['customer_id']].count().reset_index()
# Vectorized: the column total is computed once instead of once per row in a lambda.
df_client_perc['customer_id'] = (
    100 * df_client_perc['customer_id'] / df_client_perc['customer_id'].sum()
)
df_client_perc = df_client_perc.rename(columns={'customer_id': 'percent'})

# Mean product quantity per cluster
df_prods = by_cluster[['qt_prods']].mean().reset_index()

# Mean number of returns per cluster
df_returns = by_cluster[['returns_count']].mean().reset_index()

# Mean relationship duration per cluster
df_duration = by_cluster[['relationship_duration']].mean().reset_index()

# Mean average order value per cluster
df_avg_value = by_cluster[['avg_order_value']].mean().reset_index()

# Number of customers per cluster
df_customer = by_cluster[['customer_id']].count().reset_index()

# Join every metric onto the monetary frame; the merge order fixes the column order.
df_resume = (pd.merge(df_monetary, df_frequency, on='clusters', how='left')
             .merge(df_recency, on='clusters', how='left')
             .merge(df_client_perc, on='clusters', how='left')
             .merge(df_prods, on='clusters', how='left')
             .merge(df_returns, on='clusters', how='left')
             .merge(df_duration, on='clusters', how='left')
             .merge(df_customer, on='clusters', how='left')
             .merge(df_avg_value, on='clusters', how='left')
             )

In [12]:
# Display the cluster summary ordered by mean monetary value, highest first.
df_resume.sort_values(by='monetary', ascending=False)

Unnamed: 0,clusters,monetary,frequency,recency,percent,qt_prods,returns_count,relationship_duration,customer_id,avg_order_value
1,Insiders,6987.83236,0.092111,36.78882,14.137689,317.621118,53.301863,261.679503,805,1194.733488
3,Potentials,1846.939369,0.026235,91.954521,32.824025,111.091493,27.903692,136.232745,1869,790.826792
0,At Risk,588.866232,0.020077,134.425121,18.177028,32.921739,5.098551,55.720773,1035,421.573256
4,Sleeping,226.495361,0.018749,152.509537,25.781524,19.269074,2.192779,29.196866,1468,183.296059
2,Occasional,16.186228,0.0,195.524178,9.079733,3.353965,0.584139,0.0,517,16.186228


# Connect to the AWS SQL Database

In [13]:
import psycopg2
import sqlite3
from sqlalchemy import create_engine, text

In [14]:
# Connect to the PostgreSQL database hosted on AWS.
db_host = 'insiders-database.co05ecdga4gg.us-east-2.rds.amazonaws.com'
db_user = 'insiders'
# The password is read from the environment, the same way the AWS keys are, so it
# is never stored in the notebook. Set INSIDERS_DB_PASSWORD before starting the
# kernel; a missing variable raises KeyError here.
db_password = os.environ['INSIDERS_DB_PASSWORD']
db_name = 'postgres'

endpoint = f'postgresql://{db_user}:{db_password}@{db_host}/{db_name}'
engine = create_engine(endpoint)
# Connection kept open for the manual DDL statements (create/drop table).
conn = engine.connect()

## Create Table

In [16]:
# query_create_table_insiders = """
#     CREATE TABLE insiders (
#         customer_id                 REAL,
#            monetary                 REAL,
#            unique_prods             INTEGER,
#            qt_prods                 INTEGER,
#            avg_basket_size          REAL,
#            recency                  INTEGER,
#            relationship_duration    INTEGER,
#            purchase_count           INTEGER,
#            returns_count            REAL,
#            monetary_returns         REAL,
#            avg_unit_price           REAL,
#            return_rate              REAL,
#            avg_purchase_interval    REAL,
#            frequency                REAL,
#            avg_order_value          REAL,
#            clusters                 TEXT
#     )
#     """

# conn.execute(text(query_create_table_insiders))
# conn.commit()

## Drop Table

In [15]:
# #drop table
# query_drop_insiders = """
#     DROP TABLE insiders
# """
# conn.execute(text(query_drop_insiders))
# conn.commit()

## Insert Data

In [17]:
# Insert data - append.
# Reuses the engine created with the connection settings instead of building a
# second one and rebinding `conn` (a Connection) to an Engine.
# NOTE: if_exists='append' adds the rows again every time this cell is run.
df_profile.to_sql('insiders', con=engine, if_exists='append', index=False)

694

In [None]:
# #insert data - replace
# conn = create_engine(endpoint)
# df_profile.to_sql('insiders', con=conn, if_exists='replace', index=False)

## Querying the Database

In [18]:
# Query the database: read the whole 'insiders' table back into a DataFrame.
query = """
    SELECT * FROM insiders
"""

# Reuse the existing engine; the context manager releases the connection as soon
# as the read finishes instead of leaving it open.
with engine.connect() as db_connection:
    df = pd.read_sql_query(sql=text(query), con=db_connection)


In [19]:
# Display the rows read back from the 'insiders' table.
df

Unnamed: 0,customer_id,monetary,unique_prods,qt_prods,avg_basket_size,recency,relationship_duration,purchase_count,returns_count,monetary_returns,avg_unit_price,return_rate,avg_purchase_interval,frequency,avg_order_value,clusters
0,12347.0,4310.00,103,182,26.00,2,365,7,0.0,0.00,2.644011,0.0,0.019178,0.019178,615.71430,Insiders
1,12348.0,1437.24,21,27,6.75,75,283,4,0.0,0.00,0.692963,0.0,0.014134,0.014134,359.31000,Potentials
2,12349.0,1457.55,72,72,72.00,18,0,1,0.0,0.00,4.237500,0.0,0.000000,0.000000,1457.55000,Potentials
3,12350.0,294.40,16,16,16.00,310,0,1,0.0,0.00,1.581250,0.0,0.000000,0.000000,294.40000,Sleeping
4,12352.0,1385.74,57,77,11.00,36,260,7,63.0,-120.33,4.075455,9.0,0.026923,0.026923,197.96286,Potentials
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5689,22705.0,3.35,2,2,2.00,1,0,1,0.0,0.00,1.675000,0.0,0.000000,0.000000,3.35000,Occasional
5690,22706.0,5699.00,634,634,634.00,1,0,1,0.0,0.00,4.320946,0.0,0.000000,0.000000,5699.00000,Insiders
5691,22707.0,6756.06,730,730,730.00,0,0,1,0.0,0.00,4.175904,0.0,0.000000,0.000000,6756.06000,Insiders
5692,22708.0,3217.20,56,59,59.00,0,0,1,0.0,0.00,6.269661,0.0,0.000000,0.000000,3217.20000,Potentials
