In [3]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [4]:
# Load the raw sales report (semicolon-separated CSV)
data = pd.read_csv('Amazon Sale Report.csv', delimiter=";")

# Quick look at the first five rows; data.info() and data.describe()
# can be called here for dtype and summary-statistic overviews.
print(data.head(5))


       Date Fulfilment       Category Size  Qty currency  Amount    ship-city
0  04-30-22   Merchant            Set    S    0      INR  647.62       MUMBAI
1  04-30-22   Merchant          kurta  3XL    1      INR  406.00    BENGALURU
2  04-30-22     Amazon          kurta   XL    1      INR  329.00  NAVI MUMBAI
3  04-30-22   Merchant  Western Dress    L    0      INR  753.33   PUDUCHERRY
4  04-30-22     Amazon            Top  3XL    1      INR  574.00      CHENNAI


In [8]:
# Carregar e pré-processar dados
def load_and_preprocess():
    """Load the sales report and prepare it for modelling.

    Reads ``'Amazon Sale Report.csv'`` (semicolon-separated) from the current
    working directory, drops every row that has a missing value, parses
    ``Date``, derives ``Month`` and ``Year`` from it, and replaces the
    ``Fulfilment`` and ``Category`` columns with integer codes.

    Rows whose ``Date`` cannot be parsed are dropped, so ``Month`` and
    ``Year`` never contain missing values.

    Returns
    -------
    tuple
        ``(data, le)`` where ``data`` is the cleaned ``DataFrame`` and ``le``
        is the ``LabelEncoder`` fitted on ``Category``. It can only decode
        ``Category`` codes; ``Fulfilment`` uses its own encoder, which is not
        returned (``LabelEncoder`` numbers classes in sorted order, so that
        mapping can be rebuilt from the sorted unique raw values).
    """
    # .copy() gives an independent frame so the column assignments below
    # never write into a view of the freshly read data.
    data = pd.read_csv('Amazon Sale Report.csv', sep=';').dropna().copy()

    # The sample rows look like "04-30-22" (month-day-two-digit-year). Giving
    # the format explicitly avoids the slow per-element parsing fallback and
    # the warning pandas emits when it cannot infer a format.
    raw_dates = data['Date']
    parsed = pd.to_datetime(raw_dates, format='%m-%d-%y', errors='coerce')

    # Anything that does not match the expected layout gets one more chance
    # with the permissive parser, so no row that used to parse is lost.
    leftover = parsed.isna()
    if leftover.any():
        parsed = parsed.fillna(pd.to_datetime(raw_dates[leftover], errors='coerce'))

    # Drop rows whose date is still unparseable instead of letting NaN
    # Month/Year values leak into the model features.
    valid = parsed.notna()
    data = data.loc[valid].copy()
    data['Date'] = parsed.loc[valid]
    data['Month'] = data['Date'].dt.month
    data['Year'] = data['Date'].dt.year

    # One encoder per column: refitting a single encoder would overwrite the
    # classes learned for the first column.
    fulfilment_encoder = LabelEncoder()
    data['Fulfilment'] = fulfilment_encoder.fit_transform(data['Fulfilment'])

    le = LabelEncoder()
    data['Category'] = le.fit_transform(data['Category'])

    return data, le

In [9]:
# Treinar e salvar modelo
def train_and_save_model():
    """Tune a random-forest regressor for ``Amount``, persist it and export a prediction grid.

    The function:

    1. obtains the preprocessed frame and encoder from ``load_and_preprocess``;
    2. holds out 25% of the rows (``random_state=42``);
    3. grid-searches a scaler + ``RandomForestRegressor`` pipeline with
       3-fold cross-validation scored by R2;
    4. writes the best pipeline to ``best_model.pkl`` and the encoder to
       ``label_encoder.pkl``;
    5. predicts every combination of observed ``Fulfilment`` code, observed
       ``Category`` code and month 1-12, and writes the result to
       ``vendas_futuras.csv``.

    Returns
    -------
    tuple
        ``(best_estimator, X_test, y_test)``: the refitted best pipeline and
        the held-out features and target.
    """
    frame, encoder = load_and_preprocess()

    # The same three columns are model inputs, scaler inputs and grid columns.
    feature_cols = ['Fulfilment', 'Category', 'Month']
    features = frame[feature_cols]
    target = frame['Amount']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    # Scale all features, then fit the forest.
    scaling = ColumnTransformer(
        transformers=[('scaler', StandardScaler(), feature_cols)]
    )
    estimator = Pipeline(steps=[
        ('preprocessor', scaling),
        ('model', RandomForestRegressor(random_state=42)),
    ])

    search_space = {
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10],
    }
    search = GridSearchCV(estimator, search_space, cv=3, scoring='r2')
    search.fit(X_train, y_train)
    best = search.best_estimator_

    # Persist the tuned pipeline and the encoder.
    joblib.dump(best, 'best_model.pkl')
    joblib.dump(encoder, 'label_encoder.pkl')

    # Build the prediction grid: every observed code pair for each calendar month.
    grid_rows = list(itertools.product(
        frame['Fulfilment'].unique(),
        frame['Category'].unique(),
        np.arange(1, 13),
    ))
    future_sales = pd.DataFrame(grid_rows, columns=feature_cols)
    predicted = search.predict(future_sales)
    future_sales['predicted_sales'] = predicted
    future_sales.to_csv('vendas_futuras.csv', index=False)

    return best, X_test, y_test

if __name__ == "__main__":
    # Train the model and get back the held-out split for evaluation.
    model, X_test, y_test = train_and_save_model()
    y_pred = model.predict(X_test)

    # Report the hold-out regression metrics in a fixed order.
    for label, metric in (
        ("MAE:", mean_absolute_error),
        ("MSE:", mean_squared_error),
        ("R2:", r2_score),
    ):
        print(label, metric(y_test, y_pred))

  data['Date'] = pd.to_datetime(data['Date'], errors='coerce')


MAE: 152.233824789727
MSE: 48055.16892858606
R2: 0.39047592407622045
