In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
# Read the full M5 evaluation sales table from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv")

# Reproducible sample of 300 rows, indexed by the 'id' column, keeping only the
# trailing 1941 columns (presumably the daily sales columns - TODO confirm).
datax = (
    df.sample(n=300, random_state=123)
    .set_index('id')
    .iloc[:, -1941:]
)

In [None]:
import matplotlib.pyplot as plt
def compare_lists(list1, list2, umbral=3):
    """
    Compare two series position by position and classify the trend.

    Both inputs are flattened to 1-D float arrays and truncated to the length
    of the shorter one. Positions where either value is NaN (for example the
    warm-up rows of a rolling mean) are never counted, because every ordering
    comparison against NaN is false.

    Parameters
    ----------
    list1, list2 : array-like
        pandas Series, single-column DataFrame, NumPy array or plain sequence
        of numbers.
    umbral : int, optional
        Minimum absolute difference between the two counts required to report
        a trend. Defaults to 3.

    Returns
    -------
    list
        ``[label, count_a, count_b]`` where ``count_a`` is the number of
        positions with ``list1 > list2`` and ``count_b`` the number with
        ``list2 > list1``. ``label`` is "tendencia al alza" when ``list1``
        wins by at least ``umbral``, "tendencia a la baja" when ``list2``
        does, and "no tiene tendencia" otherwise.
    """
    first = np.asarray(list1, dtype=float).ravel()
    second = np.asarray(list2, dtype=float).ravel()

    # Only the overlapping prefix of the two inputs is compared.
    common = min(first.size, second.size)
    first, second = first[:common], second[:common]

    # NaN comparisons evaluate to False; silence the warning some NumPy
    # versions emit for them.
    with np.errstate(invalid="ignore"):
        count_a = int(np.count_nonzero(first > second))
        count_b = int(np.count_nonzero(second > first))

    # The explicit tie check guarantees a label even when umbral <= 0.
    if count_a == count_b or abs(count_a - count_b) < umbral:
        return ["no tiene tendencia", count_a, count_b]
    if count_a > count_b:
        return ["tendencia al alza", count_a, count_b]
    return ["tendencia a la baja", count_a, count_b]
    
def plot_moving_averages(data, window1, window2):
    """
    Plot a time series together with two rolling means and return the means.

    data: the time series to process (anything ``pd.DataFrame`` accepts).
    window1: window size of the first rolling mean (drawn in green).
    window2: window size of the second rolling mean (drawn in red).

    Returns the two rolling means as DataFrames, in the order
    ``(mean over window1, mean over window2)``.
    """
    frame = pd.DataFrame(data)

    # Rolling means over each requested window.
    first_ma = frame.rolling(window=window1).mean()
    second_ma = frame.rolling(window=window2).mean()

    # Plot against positions 0..n-1 rather than the original index labels.
    positions = np.arange(len(frame))

    curves = (
        (frame, {"label": 'Original'}),
        (first_ma, {"color": "green", "label": f'MA({window1})'}),
        (second_ma, {"color": "red", "label": f'MA({window2})'}),
    )
    for values, style in curves:
        plt.plot(positions, values, **style)

    plt.xlabel('Horizonte de Tiempo')
    plt.ylabel('Valor de la serie')
    plt.legend()
    plt.show()

    return first_ma, second_ma

In [None]:
# Trend-detection settings: number of trailing columns inspected and the two
# moving-average windows that are compared against each other.
TREND_WINDOW = 30
MA_SHORT, MA_LONG = 3, 5

# One entry per sampled series: [[label, count_a, count_b], series id].
rax = []
datax2 = datax.iloc[:, -TREND_WINDOW:]

# Iterate over whatever rows are present instead of assuming the sample size.
for serie_id, serie in datax2.iterrows():
    m1, m2 = plot_moving_averages(serie, MA_SHORT, MA_LONG)
    rax.append([compare_lists(m1, m2), serie_id])
    print(rax[-1])

# Keep only the series that were given an upward or downward trend label.
nrax = [entry for entry in rax if entry[0][0] != 'no tiene tendencia']

# Share of series with a trend; guarded so an empty sample does not divide by zero.
pt = (len(nrax) / len(rax)) * 100 if rax else 0.0
print(f"{len(nrax)} series fuenron clasificadas con tendencia es decir {pt}%.")

In [None]:
def train_test_split(data, n_test):
    """
    Split a 2-D array chronologically into train and test rows.

    Parameters
    ----------
    data : 2-D array
        Rows ordered in time; must support ``data[rows, :]`` indexing.
    n_test : int
        Number of trailing rows that go to the test set. ``0`` keeps every
        row in the train set.

    Returns
    -------
    tuple
        ``(train, test)`` where ``test`` holds the last ``n_test`` rows and
        ``train`` everything before them.

    Raises
    ------
    ValueError
        If ``n_test`` is negative or larger than the number of rows.
    """
    n_rows = data.shape[0]
    if not 0 <= n_test <= n_rows:
        raise ValueError(
            f"n_test must be between 0 and {n_rows}, got {n_test}"
        )
    # An explicit split index avoids the data[:-0] pitfall, which would give
    # an empty train set when n_test is 0.
    split = n_rows - n_test
    return data[:split, :], data[split:, :]

def calculate_zeros_percentage(arr):
    """
    Return the percentage (0-100) of elements in ``arr`` that equal zero.

    ``arr`` may be any array-like (NumPy array, list, pandas Series). An
    empty input yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    values = np.asarray(arr)
    if values.size == 0:
        return 0.0
    num_zeros = np.count_nonzero(values == 0)
    return (num_zeros / values.size) * 100
    
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised-learning table of lags and leads.

    Parameters
    ----------
    data : array-like
        Observations ordered in time; 1-D or 2-D, anything ``pd.DataFrame``
        accepts.
    n_in : int, optional
        Number of lagged copies (t-n_in ... t-1) placed first in each row.
    n_out : int, optional
        Number of forward copies (t ... t+n_out-1) placed after the lags.
    dropnan : bool, optional
        Drop the rows that contain NaN introduced by the shifting.

    Returns
    -------
    numpy.ndarray
        One row per usable time step, columns ordered from the oldest lag to
        the furthest lead.
    """
    frame = pd.DataFrame(data)
    # Oldest lag first, then the current value and any future steps.
    lags = [frame.shift(lag) for lag in range(n_in, 0, -1)]
    leads = [frame.shift(-step) for step in range(n_out)]
    agg = pd.concat(lags + leads, axis=1)
    if dropnan:
        agg = agg.dropna()
    return agg.values

import seaborn as sns

import matplotlib.pyplot as plt
def prediction_plot(testY, test_predict):
    """
    Plot actual values against predictions over the test horizon.

    testY: actual values; its length defines the number of plotted steps.
    test_predict: predicted values; only the first ``len(testY)`` are drawn.
    """
    n_points = len(testY)
    steps = list(range(n_points))

    plt.figure(figsize=(8, 4))
    plt.plot(steps, testY[:n_points], marker='.', label="actual")
    plt.plot(steps, test_predict[:n_points], 'r', label="prediction")

    # Layout and cosmetics.
    plt.tight_layout()
    sns.despine(top=True)
    plt.subplots_adjust(left=0.07)
    plt.ylabel('Pred Trend', size=15)
    plt.xlabel('Time step', size=15)
    plt.legend(fontsize=15)
    plt.show()
    
def smape(a, f):
    """
    Symmetric mean absolute percentage error, on a 0-200 scale.

    Parameters
    ----------
    a : array-like
        Actual values.
    f : array-like
        Forecast values, same number of elements as ``a``.

    Returns
    -------
    float
        ``100 / n * sum(2 * |f - a| / (|a| + |f|))``. Positions where both the
        actual and the forecast are zero contribute 0 (a perfect forecast)
        instead of the NaN that a literal 0/0 would produce.
    """
    actual = np.asarray(a, dtype=float).ravel()
    forecast = np.asarray(f, dtype=float).ravel()

    numerator = 2 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)

    # Divide only where the denominator is non-zero; the remaining terms keep
    # the 0 they were initialised with.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )

    return 100 / len(actual) * np.sum(terms)

In [None]:
# LightGBM training parameters shared by every per-series model.
params = dict(
    objective="poisson",
    metric="poisson",
    learning_rate=0.09,
    sub_feature=0.9,
    sub_row=0.75,
    bagging_freq=1,
    lambda_l2=0.1,
    verbosity=1,
    num_iterations=2000,
    num_leaves=32,
    min_data_in_leaf=50,
)


In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Supervised framing: number of lagged inputs and size of the hold-out window.
N_LAGS = 6
N_TEST = 28

smapeList = []  # sMAPE on the hold-out window, one value per series
predsList = []  # hold-out predictions, one array per series

for i, (_, serie_id) in enumerate(nrax):
    print("-----------------------",i,"-------------------------------")

    # Raw values of the series, taken from the sampled table.
    nMA = datax.loc[serie_id]
    values = np.array(nMA.values).reshape(-1, 1)

    # Transform the time series into a supervised-learning table: the first
    # N_LAGS columns are the lagged inputs, the last column is the target.
    data = series_to_supervised(values, n_in=N_LAGS)
    train, test = train_test_split(data, N_TEST)
    train = np.asarray(train)
    trainX, trainY = train[:, :-1], train[:, -1]
    testX, testY = test[:, :-1], test[:, -1]

    train_data = lgb.Dataset(trainX, label=trainY)
    valid_data = lgb.Dataset(testX, label=testY)

    # NOTE(review): the hold-out window is also the early-stopping validation
    # set, so the sMAPE reported below is optimistic. Consider carving out a
    # separate validation window.
    #
    # The number of boosting rounds comes from params["num_iterations"]; a
    # num_boost_round argument would be overridden by it, so none is passed.
    # Early stopping and periodic logging use callbacks (LightGBM >= 3.3)
    # because the early_stopping_rounds / verbose_eval keyword arguments were
    # removed in LightGBM 4.0.
    m_lgb = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=30),
            lgb.log_evaluation(period=20),
        ],
    )

    preds = m_lgb.predict(testX)
    smapeList.append(smape(testY, preds))

    print('----------------------predic test----------------------------')
    predsList.append(preds)
    prediction_plot(testY, preds)

In [None]:
# Build a table of predictions: one row per series, indexed by the series id.
ids = [serie_id for _, serie_id in nrax]

df = (
    pd.DataFrame(predsList)
    .assign(id=ids)
    .set_index('id')
)

# Persist the predictions (index included) as CSV.
df.to_csv('preds_lgbm_130sOrginal.csv', index=True)

In [None]:
# Average sMAPE over all forecasted series, reported with two decimals.
p = sum(smapeList) / len(smapeList)
sp = round(p, 2)
print('sMape promedio: ', sp)

In [None]:
# Persist the per-series sMAPE values (one row per series) as CSV.
# NOTE(review): the file name mentions "MA5" while the models above are fitted
# on the raw series - confirm the intended name.
dfr = pd.DataFrame(smapeList)
dfr.to_csv("Results_lgbm_130sMA5.csv")