# Parameters

In [1]:
BASE_DIR = '/Users/efraflores/Desktop/EF/Corner/Catalog/Stock_fcst/data'
BRANCH_ID = 9988

# Class

In [2]:
# Control de datos
from time import sleep
from pathlib import Path
from pickle import dump as save_pkl
from IPython.display import clear_output, display

# Ingeniería de variables
from re import sub, UNICODE
from numpy import nan, array
from datetime import datetime
from unicodedata import normalize
from nltk.corpus import stopwords
from string import ascii_uppercase
from pandas import DataFrame, read_csv

# Modelos
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import RobustScaler, MinMaxScaler


class StockForecast:
    def __init__(self, base_dir: str, branch_id: int) -> None:
        '''
        Obtener un directorio como texto y convertirlo a tipo Path para unir directorios, buscar archivos, etc.
        '''
        self.base_dir = Path(base_dir)
        self.branch_id = branch_id
        self.file_name = f'{self.branch_id}_op.csv'

        # Definir la ruta completa para leer el archivo
        self.file_path = self.base_dir.joinpath(self.file_name)

        # Verificar que existe el archivo en el directorio
        if not self.file_path.is_file():
            print(f'Debería haber un archivo llamado: "{self.file_name}" en el directorio:\n{self.base_dir}\n\nAgrégalo e intenta de nuevo!\n')

    def export_csv(self, df: DataFrame, name_suffix=None, **kwargs) -> None: 
        '''
        Exportar un archivo en formato csv
        '''
        export_name = f'{self.branch_id}.csv' if name_suffix==None else f'{self.branch_id}_{name_suffix}.csv'
        df.to_csv(self.base_dir.joinpath(export_name), **kwargs)
        print(f'Archivo: {export_name} fue exportado exitosamente en:\n{self.base_dir}')

    def get_file(self, testing: bool=True, id_col: str='product_id', cols: list=['product','category','brand']) -> DataFrame:
        '''
        Importar el csv a nivel orden-producto y construir la tabla de productos
        '''
        if testing:
            # Leer sólo una fracción del archivo para confirmar que el flujo corre sin problemas
            chunks = read_csv(self.file_path, chunksize=1000)
            df = DataFrame()
            for i,chunk in enumerate(chunks):
                # Solamente se usará el primer chunk de 1K renglones
                if i > 0: break
                else: df = df.append(chunk)
            self.df = df

        else: self.df = read_csv(self.file_path)
        
        # Una tabla con los productos, para crear las familias en un método posterior
        self.prod = self.df.drop_duplicates(id_col).set_index(id_col)[cols]

    def clean_text(self, text: str, pattern: str="[^a-zA-Z0-9\s]", lower: bool=True, rem_stopw: bool=False, stopwords_list: list=[]) -> str: 
        '''
        Limpieza de texto
        '''
        # Reemplazar acentos: áàäâã --> a
        clean = normalize('NFD', str(text).replace('\n', ' \n ')).encode('ascii', 'ignore')
        # Omitir caracteres especiales !"#$%&/()=...
        clean = sub(pattern, ' ', clean.decode('utf-8'), flags=UNICODE)
        # Mantener sólo un espacio
        clean = sub(r'\s{2,}', ' ', clean.strip())
        # Minúsculas si el parámetro lo indica
        if lower: clean = clean.lower()
        if rem_stopw: clean = ' '.join(filter(lambda x: x not in stopwords_list, clean.split()))
        # Si el registro estaba vacío, indicar nulo
        if clean in ('','nan'): clean = nan
        return clean

    def create_families(self, n_families: int=11, export_result: bool=False, **kwargs) -> None:

        X = self.prod.apply(lambda x: self.clean_text(''.join(str(x)), rem_stopw=True, stopwords_list=stopwords.words('spanish')+sf.prod.columns.tolist()+['nan']), axis=1)
        
        family_model_one = Pipeline(steps=[('countv', CountVectorizer(**kwargs)),('cluster',KMeans(n_families, random_state=22))])
        self.prod['family_one'] = family_model_one.fit_predict(X)
        top_fam_one = self.prod['family_one'].value_counts(1, dropna=False)
        print(top_fam_one,'\n'*2)

        X = X.to_frame().rename({0:'text'}, axis=1)
        X['family_one'] = self.prod['family_one']

        X_sub = X[X['family_one']==top_fam_one.index[0]].copy()

        family_model_two = Pipeline(steps=[('countv', CountVectorizer(**kwargs)),('cluster',KMeans(n_families, random_state=22))])
        X_sub['family_two'] = family_model_two.fit_predict(X_sub['text'])
        self.prod = self.prod.join(X_sub[['family_two']])

        top_fam_two = self.prod['family_two'].value_counts(1, dropna=False)
        print(top_fam_two,'\n'*2)

        X['family_two'] = self.prod['family_two']
        X_sub_sub = X[X['family_two']==top_fam_two.index[0]].copy()

        family_model_three = Pipeline(steps=[('countv', CountVectorizer(**kwargs)),('cluster',KMeans(n_families, random_state=22))])
        X_sub_sub['family_three'] = family_model_three.fit_predict(X_sub_sub['text'])
        self.prod = self.prod.join(X_sub_sub[['family_three']])

        top_fam_three = self.prod['family_three'].value_counts(1, dropna=False)
        print(top_fam_three,'\n'*2)
        
        if export_result: self.export_csv(self.prod, name_suffix='fam')

sf = StockForecast(BASE_DIR, BRANCH_ID)
sf.get_file(testing=False)
sf.create_families(max_features=10000, export_result=True)

3     0.667215
1     0.070872
9     0.050511
4     0.040288
2     0.039854
8     0.032230
0     0.025212
10    0.024866
6     0.018801
5     0.016202
7     0.013949
Name: family_one, dtype: float64 


2.0     0.471062
NaN     0.332785
7.0     0.034396
0.0     0.025559
9.0     0.023220
6.0     0.022267
4.0     0.020707
3.0     0.017328
10.0    0.015335
8.0     0.015075
1.0     0.013343
5.0     0.008924
Name: family_two, dtype: float64 


NaN     0.528938
2.0     0.276988
9.0     0.035609
4.0     0.027465
0.0     0.022180
8.0     0.021920
6.0     0.019754
5.0     0.018801
3.0     0.014556
10.0    0.012736
1.0     0.011350
7.0     0.009704
Name: family_three, dtype: float64 


Archivo: 9988_fam.csv fue exportado exitosamente en:
/Users/efraflores/Desktop/EF/Corner/Catalog/Stock_fcst/data
