In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
# Importando as bibliotecas necessárias
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import numpy as np

In [3]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under UCI id 352.
online_retail = fetch_ucirepo(id=352)

# Split the payload into features and targets (pandas dataframes).
retail_data = online_retail.data
X, y = retail_data.features, retail_data.targets

# Dataset-level metadata first, then the per-variable description.
print(online_retail.metadata)
print(online_retail.variables)

{'uci_id': 352, 'name': 'Online Retail', 'repository_url': 'https://archive.ics.uci.edu/dataset/352/online+retail', 'data_url': 'https://archive.ics.uci.edu/static/public/352/data.csv', 'abstract': 'This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.', 'area': 'Business', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate', 'Sequential', 'Time-Series'], 'num_instances': 541909, 'num_features': 6, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': None, 'index_col': ['InvoiceNo', 'StockCode'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2015, 'last_updated': 'Mon Oct 21 2024', 'dataset_doi': '10.24432/C5BW33', 'creators': ['Daqing Chen'], 'intro_paper': {'ID': 361, 'type': 'NATIVE', 'title': 'Data mining for the online retail industry: A case study of RFM model-based customer segmenta

In [4]:
# Remove the purchase date/time column.
# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
X = X.drop(columns=['InvoiceDate'], errors='ignore')

In [5]:
# Encode each country name as its international calling code, with these
# hand-picked exceptions:
# - Channel Islands has no calling code of its own, so 4 is used.
# - Canada shares its calling code with the USA, so it is changed to 11.
# - European Community has no calling code of its own, so 3 is used.
# - Unspecified is mapped to 0.

country_mapping = {
    'United Kingdom': 44,
    'Germany': 49,
    'France': 33,
    'EIRE': 353,
    'Spain': 34,
    'Netherlands': 31,
    'Belgium': 32,
    'Switzerland': 41,
    'Portugal': 351,
    'Australia': 61,
    'Norway': 47,
    'Italy': 39,
    'Channel Islands': 4,
    'Finland': 358,
    'Cyprus': 357,  # was 657; the calling code of Cyprus is +357
    'Sweden': 46,
    'Unspecified': 0,
    'Austria': 43,
    'Denmark': 45,
    'Japan': 81,
    'Poland': 48,
    'Israel': 972,
    'USA': 1,
    'Hong Kong': 852,
    'Singapore': 65,
    'Iceland': 354,
    'Canada': 11,
    'Greece': 30,
    'Malta': 356,
    'United Arab Emirates': 971,
    'European Community': 3,
    'RSA': 27,
    'Lebanon': 961,
    'Lithuania': 370,
    'Brazil': 55,
    'Czech Republic': 420,
    'Bahrain': 973,
    'Saudi Arabia': 966
}

# Any value without an entry in the mapping becomes NaN under .map(), and the
# integer cast below then fails without saying which value caused it. Check
# up front and name the offending values instead. This also triggers when the
# cell is re-run on an already encoded column.
unmapped_countries = set(X['Country'].unique()) - set(country_mapping)
if unmapped_countries:
    raise ValueError(
        "Values in 'Country' missing from country_mapping: "
        + ", ".join(sorted(map(str, unmapped_countries)))
    )

X['Country'] = X['Country'].map(country_mapping).astype(int)

In [None]:
# Downcast the integer columns BEFORE sampling so that the sample inherits the
# smaller dtypes (running the casts after sampling only affects X).
X['Quantity'] = X['Quantity'].astype('int32')
X['Country'] = X['Country'].astype('int16')

X_sample = X.sample(frac=0.5, random_state=42)  # keep 50% of the rows, reproducibly

# One-hot encode Description into 0/1 columns.
# dtype='uint8' stores each indicator in one byte and leaves the remaining
# columns untouched. Casting the whole frame with .astype(int) instead would
# truncate the non-integer columns and widen every indicator column.
df_encoded = pd.get_dummies(
    X_sample,
    columns=['Description'],
    prefix='',
    prefix_sep='',
    dtype='uint8',
)

# Preview the result
print(df_encoded.head())

In [None]:
# Number of sampled rows per encoded country value.
country_counts = X_sample['Country'].value_counts()
print(country_counts)

In [None]:
# Column names, non-null counts and dtypes of the sampled frame.
X_sample.info()

In [None]:
# Summary statistics of the sampled frame.
summary_stats = X_sample.describe()
print(summary_stats)

In [None]:
# Min-max scale every numeric column of the sample; the fitted scaler is kept
# in `scaler`.
scaler = MinMaxScaler()
numerical_cols = X_sample.select_dtypes(include=['number']).columns
scaled_values = scaler.fit_transform(X_sample[numerical_cols])
X_sample[numerical_cols] = scaled_values

In [None]:
# Text (object-dtype) columns of the sample that still need a numeric encoding.
string_cols = X_sample.select_dtypes(include=['object']).columns

# Missing values per column of the frame that is actually used for training.
# NOTE(review): nothing in the X_sample pipeline fills NaNs in the numeric
# columns, so any non-zero count here would reach the autoencoder - confirm
# that the counts are all zero.
print(X_sample.isnull().sum())

# Integer-encode each text column; the fitted encoders are kept by column name.
label_encoders = {}
for col in string_cols:
    label_encoders[col] = LabelEncoder()
    X_sample[col] = label_encoders[col].fit_transform(X_sample[col])

# The numeric columns were min-max scaled before the text columns were encoded,
# so the freshly encoded columns are the only ones still on a raw integer
# scale. Scale them as well: the full frame X is encoded and then scaled
# column by column before prediction, and the training data must be on the
# same scale. A separate scaler object is used so that `scaler` is not replaced.
if len(string_cols) > 0:
    encoded_scaler = MinMaxScaler()
    X_sample[string_cols] = encoded_scaler.fit_transform(X_sample[string_cols])

In [None]:
# Width of the autoencoder input = number of columns in the preprocessed sample.
# NOTE(review): the dataset metadata reports 6 features and only InvoiceDate is
# dropped, so this is presumably 5 - confirm.
input_dim = len(X_sample.columns)

In [None]:
# Build the autoencoder: input -> bottleneck -> reconstruction of the input.
# (Input, Dense and Model come from the import cell at the top of the notebook;
# the extra `from tensorflow import keras` here was redundant and unused.)
ENCODING_DIM = 3  # width of the bottleneck layer

input_layer = Input(shape=(input_dim,))
encoder = Dense(ENCODING_DIM, activation="relu")(input_layer)  # consider 'tanh' or 'linear'
decoder = Dense(input_dim, activation="linear")(encoder)  # linear output for regression
autoencoder = Model(inputs=input_layer, outputs=decoder)

In [None]:
# Compile the model: Adam optimizer with mean-squared-error loss (regression).
autoencoder.compile(loss='mse', optimizer='adam')

In [None]:
# Train the autoencoder to reproduce its own input, using the sampled frame.
# verbose=0 hides the per-epoch log, so the History object is kept and the last
# loss value is printed; otherwise there is no way to tell how training went.
history = autoencoder.fit(X_sample, X_sample, epochs=30, batch_size=64, verbose=0)
print(f"Final training loss (MSE): {history.history['loss'][-1]:.6f}")

In [None]:
# Make predictions (reconstruction of the inputs) for the sampled frame
reconstructed = autoencoder.predict(X_sample)
print(reconstructed)

In [None]:
# Non-numeric (object-dtype) columns of the full frame.
non_numeric_cols = X.select_dtypes(include=['object']).columns

# Replace each of them with integer labels from a freshly fitted LabelEncoder.
# NOTE(review): these encoders are fitted on X, not taken from `label_encoders`
# (fitted on X_sample), so the integer codes need not agree - confirm this is intended.
for col in non_numeric_cols:
    X[col] = LabelEncoder().fit_transform(X[col])

# Check the result
print(X.head())

In [None]:
# Min-max scale every numeric column of the full frame X with a newly fitted scaler.
# NOTE(review): this rebinds `scaler`, replacing the one fitted on X_sample.
numerical_cols = X.select_dtypes(include=['number']).columns
scaler = MinMaxScaler()
scaler.fit(X[numerical_cols])
X[numerical_cols] = scaler.transform(X[numerical_cols])

In [None]:
# Make predictions (reconstruction of the inputs): after training, the
# autoencoder tries to reconstruct the rows of the full, preprocessed frame X.
reconstructed = autoencoder.predict(X)

# Print the frame that was actually passed to predict(). Printing X_sample here
# would pair the reconstruction with a different set of rows.
print("Dados originais:")
print(X)
print("\nDados reconstruídos:")
print(reconstructed)  # reconstructed data, expected to be close to the original data

In [None]:
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder

# # Assuming X is a pandas DataFrame
# # ... your existing code for creating the autoencoder model ...

# # 1. Identify string columns in X
# string_cols = X.select_dtypes(include=['object']).columns

# # 2. Create a LabelEncoder for each string column
# label_encoders = {}
# for col in string_cols:
#     label_encoders[col] = LabelEncoder()
#     X[col] = label_encoders[col].fit_transform(X[col])

# # 3. Now you can train the model
# autoencoder.fit(X, X, epochs=100, verbose=0)