## Define functions

In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

def _to_feature_matrix(features):
    """Return ``features`` in a 2-D layout suitable for fitting/predicting.

    A pandas Series becomes a one-column DataFrame and a 1-D list/ndarray
    becomes a single-column array. Any other input (DataFrame, 2-D array, ...)
    is handed back without reshaping.
    """
    if isinstance(features, pd.Series):
        return features.to_frame()
    if isinstance(features, (list, np.ndarray)):
        matrix = np.array(features)
        # Only a flat vector needs reshaping; a 2-D input already has one
        # column per feature and must keep that layout.
        return matrix.reshape(-1, 1) if matrix.ndim == 1 else matrix
    return features


def linear_regression_model(X, y, X_test=None):
    """Fit a LinearRegression on ``X``/``y`` and optionally predict ``X_test``.

    Parameters
    ----------
    X : pandas.Series, pandas.DataFrame, list or numpy.ndarray
        Training features. 1-D inputs are treated as a single feature.
    y : pandas.DataFrame, pandas.Series, list or numpy.ndarray
        Training target. A DataFrame is squeezed along its column axis;
        anything else is converted with ``np.array``.
    X_test : same kinds as ``X``, optional
        Features to predict on. Converted with the same rules as ``X`` so
        multi-feature test data keeps its columns.

    Returns
    -------
    The result of ``model.predict`` for ``X_test``, or ``None`` when
    ``X_test`` is not given.

    Side effects
    ------------
    Prints the fitted intercept and the first coefficient.
    """
    # --- Convert X ---
    X = _to_feature_matrix(X)

    # --- Convert y ---
    if isinstance(y, pd.DataFrame):
        # Squeeze only the column axis: a plain squeeze() would collapse a
        # one-row, one-column frame to a scalar, which cannot be fitted.
        y = y.squeeze(axis=1)
    else:
        y = np.array(y)

    # --- Train model ---
    model = LinearRegression()
    model.fit(X, y)

    # --- Predictions ---
    y_test_pred = None
    if X_test is not None:
        y_test_pred = model.predict(_to_feature_matrix(X_test))

    # intercept_/coef_ may be scalars, 1-D or 2-D depending on the shape of y;
    # flattening first keeps the formatting from failing on array values.
    # Only the first coefficient is reported.
    intercept = np.ravel(model.intercept_)[0]
    first_coef = np.ravel(model.coef_)[0]
    print(f"\n📈 Ecuación: y = {intercept:.2f} + {first_coef:.2f}x")

    return y_test_pred

def min_max_scaling(X, feature_range=(0, 1)):
    """Rescale ``X`` into ``feature_range`` using a freshly fitted MinMaxScaler.

    The scaler is fitted on ``X`` itself and is not returned, so only the
    transformed data is available to the caller.
    """
    return MinMaxScaler(feature_range=feature_range).fit_transform(X)
    
def train_test_split_data(X, y, test_size=0.2, random_state=None):
    """Split ``X`` and ``y`` into train and test parts.

    Thin wrapper around ``train_test_split`` that forwards ``test_size`` and
    ``random_state`` and returns ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return tuple(parts)

## Retrieve Data

In [None]:
# Load the two cleaned yearly extracts. The paths are relative to the
# notebook's working directory; forward slashes keep them valid on Windows as
# well as on Linux/macOS (backslashes only resolve on Windows).
retail_2009_2010 = pd.read_excel("../../utils/cleaned_retail_2009.xlsx")
retail_2010_2011 = pd.read_excel("../../utils/cleaned_retail_2010.xlsx")

# Stack both periods into a single frame with a fresh 0..n-1 index.
retail_2009_2011 = pd.concat([retail_2009_2010, retail_2010_2011], ignore_index=True)

print(retail_2009_2011)

# Keep only the rows for stock code 85099 (noted by the author as the most
# sold stock code) and the quantity/price columns.
lr_data = retail_2009_2011.loc[retail_2009_2011["stockcode"] == 85099, ["quantity", "price"]]




In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own line instead of being passed to print() (which
# would print a stray "None" after the header).
print("## INFO: ")
lr_data.info()
print("DETAILS: \n", lr_data.describe())
print(f"""
## Details:
- Number of records: {lr_data.shape[0]}
- Number of features: {lr_data.shape[1]}
- Missing values:\n{lr_data.isnull().sum()}   
""")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689289 entries, 0 to 689288
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   invoice      689289 non-null  int64         
 1   stockcode    689289 non-null  int64         
 2   description  689289 non-null  object        
 3   quantity     689289 non-null  int64         
 4   invoicedate  689289 non-null  datetime64[ns]
 5   price        689289 non-null  float64       
 6   customer_id  689289 non-null  int64         
 7   country      689289 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(4), object(2)
memory usage: 42.1+ MB
## INFO: 
 None
DETAILS: 
              invoice      stockcode       quantity  \
count  689289.000000  689289.000000  689289.000000   
mean   537403.341519   32996.878826       7.323673   
min    489434.000000       1.000000       1.000000   
25%    514714.000000   21786.000000       2.000000   
50%    5369

In [None]:
# Hold out 20% of the rows as test data.
# NOTE(review): the original note quoted 137.857 test records; confirm that
# figure against len(lr_data), since lr_data is filtered to one stock code.
# NOTE(review): the original call passed no arguments, which raises a
# TypeError because X and y are required. Assumes price is the feature and
# quantity is the target — confirm before relying on the results.
X = lr_data[["price"]]
y = lr_data["quantity"]

# A fixed random_state makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split_data(X, y, test_size=0.2, random_state=42)