In [1]:
# Python modules.
import os
import sys
sys.path.insert(0, "..")
sys.path.insert(0, "../data")


# Other modules.
from dotenv import load_dotenv
load_dotenv(dotenv_path=".env", override=True)
import kaggle
import pandas as pd
from sklearn.preprocessing import (
    LabelEncoder,
    MinMaxScaler,
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    root_mean_squared_error,
)


# Library.
from src.utils import (
    prepare_submission,
    submit_file,
    get_submission_scores,
)



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/picx/kaggle-swag-competition/e/KAG-5


In [2]:
# Loading data
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/train.csv")

In [3]:
# Preprocessing
def fillna(df: pd.DataFrame) -> pd.DataFrame:
    """Fillign na.
    Super simple rules.

    :param df:
    :return df:
    """
    # Object
    df["Brand"] = df["Brand"].fillna(value="Unknown")
    df["Material"] = df["Material"].fillna(value="Unknown")
    df["Size"] = df["Size"].fillna(value="Unknown")
    df["Laptop Compartment"] = df["Laptop Compartment"].fillna(value="No")
    df["Waterproof"] = df["Waterproof"].fillna(value="No")
    df["Style"] = df["Style"].fillna(value="Unknown")
    df["Color"] = df["Color"].fillna(value="Unknown")
    # Float
    df["Compartments"] = df["Compartments"].fillna(value=0.0)
    df["Weight Capacity (kg)"] = df["Weight Capacity (kg)"].fillna(value=0.0)
    return df

df_train_filled = fillna(df=df_train)
df_train_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 300000 non-null  object 
 2   Material              300000 non-null  object 
 3   Size                  300000 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    300000 non-null  object 
 6   Waterproof            300000 non-null  object 
 7   Style                 300000 non-null  object 
 8   Color                 300000 non-null  object 
 9   Weight Capacity (kg)  300000 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [4]:
scalers = {
    "object": {},
    "float64": {},
}
df_train_filled_n = df_train_filled.copy()
for column_name in df_train_filled_n.columns:
    if column_name == "id":
        continue
    if df_train_filled_n[column_name].dtype == "object":
        enc = LabelEncoder()
        df_train_filled_n[column_name] = enc.fit_transform(df_train_filled_n.loc[:, [column_name]])
        scalers["object"][column_name] = enc
    elif df_train_filled_n[column_name].dtype == "float64":
        enc = MinMaxScaler()
        df_train_filled_n[column_name] = enc.fit_transform(df_train_filled_n.loc[:, [column_name]])
        scalers["float64"][column_name] = enc
    else:
        raise TypeError(f"{df_train_filled_n[column_name].dtype}")

df_train_filled_n.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 300000 non-null  int64  
 2   Material              300000 non-null  int64  
 3   Size                  300000 non-null  int64  
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    300000 non-null  int64  
 6   Waterproof            300000 non-null  int64  
 7   Style                 300000 non-null  int64  
 8   Color                 300000 non-null  int64  
 9   Weight Capacity (kg)  300000 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(8)
memory usage: 25.2 MB


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [5]:
column_names = ["Brand", "Material", "Size", "Color"]
X = df_train_filled_n.loc[:, column_names]
y = df_train_filled_n.loc[:, ["Price"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
def get_lr_parameters():
    """To retrieve LR parameters."""
    lasso_setup = {
        "alpha": 0.1,      # L1 regularization strength
        "fit_intercept": True,
        "max_iter": 1000,  # Maximum iterations for convergence
        "tol": 0.0001,
        "selection": 'cyclic',
    }
    return lasso_setup


model = LinearRegression()
model.fit(X_train, y_train)

In [7]:
score = model.score(X_test, y_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=model.predict(X_test))
intercept = model.intercept_
coeff = model.coef_

In [8]:
df_test_filled = fillna(df=df_test)
df_test_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 300000 non-null  object 
 2   Material              300000 non-null  object 
 3   Size                  300000 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    300000 non-null  object 
 6   Waterproof            300000 non-null  object 
 7   Style                 300000 non-null  object 
 8   Color                 300000 non-null  object 
 9   Weight Capacity (kg)  300000 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [9]:
df_test_filled_n = df_test_filled.copy()
for column_name in df_test_filled_n.columns:
    if column_name == "id":
        continue
    if df_test_filled_n[column_name].dtype == "object":
        df_test_filled_n[column_name] = (
            scalers["object"][column_name]
            .fit_transform(df_test_filled_n.loc[:, [column_name]])
        )
    elif df_test_filled_n[column_name].dtype == "float64":
        df_test_filled_n[column_name] = (
            scalers["float64"][column_name]
            .fit_transform(df_test_filled_n.loc[:, [column_name]])
        )
    else:
        raise TypeError(f"{df_train_filled_n[column_name].dtype}")

df_test_filled_n.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 300000 non-null  int64  
 2   Material              300000 non-null  int64  
 3   Size                  300000 non-null  int64  
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    300000 non-null  int64  
 6   Waterproof            300000 non-null  int64  
 7   Style                 300000 non-null  int64  
 8   Color                 300000 non-null  int64  
 9   Weight Capacity (kg)  300000 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(8)
memory usage: 25.2 MB


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [10]:
df_test_filled_n["prediction"] = model.predict(df_test_filled_n.loc[:, column_names])
df_test_filled_n["prediction"] = scalers["float64"]["Price"].inverse_transform(df_test_filled_n.loc[:, ["prediction"]])

In [11]:
df_test_filled_n

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price,prediction
0,0,1,1,1,0.666667,1,0,2,0,0.387057,0.719694,80.992915
1,1,1,0,2,1.000000,1,1,1,3,0.902618,0.399115,80.937102
2,2,4,1,2,0.111111,1,0,1,5,0.554792,0.179061,81.736622
3,3,2,2,2,0.777778,1,0,1,3,0.431241,0.485985,81.243675
4,4,0,0,1,0.000000,1,1,1,3,0.591645,0.526097,80.985871
...,...,...,...,...,...,...,...,...,...,...,...,...
299995,299995,0,1,2,0.888889,0,0,2,1,0.424360,0.851833,80.637827
299996,299996,1,1,0,0.555556,0,1,2,1,0.887773,0.035987,81.316383
299997,299997,3,0,0,0.888889,1,1,0,4,0.396608,0.714175,81.890357
299998,299998,0,2,2,0.000000,0,1,2,4,0.205858,0.747339,80.937356


In [12]:
prepare_submission(df_predictions=df_test_filled_n)

Generating output: data/my_submission.csv.


In [13]:
submit_file(message="Only using Brand")

Submitting data/my_submission.csv to competition:playground-series-s5e2


100%|██████████| 2.29M/2.29M [00:01<00:00, 1.77MB/s]


Submission not allowed.
(400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'Date': 'Fri, 07 Feb 2025 15:23:06 GMT', 'Access-Control-Allow-Credentials': 'true', 'Access-Control-Allow-Origin': '*', 'Set-Cookie': 'ka_sessionid=babb25800f487332f8abbfd3a9ec7566; max-age=2626560; path=/, GCLB=CMzUnZ_Y1fyfIRAD; path=/; HttpOnly', 'Vary': 'Accept-Encoding', 'X-Kaggle-MillisecondsElapsed': '157', 'X-Kaggle-RequestId': '413209ea703872d3461fdd5fb9319eec', 'X-Kaggle-ApiVersion': '1.6.17', 'X-Kaggle-HubVersion': '0.3.7', 'X-Frame-Options': 'SAMEORIGIN', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains; preload', 'Content-Security-Policy': "object-src 'none'; script-src 'nonce-TiygFC+vEtSnX7ph5QTH1A==' 'report-sample' 'unsafe-inline' 'unsafe-eval' 'strict-dynamic' https: http:; base-uri 'none'; report-uri https://csp.withgoogle.com/csp/kaggle/20201130; frame-src 'self' https://www.kaggleusercontent.com https://www.youtube.com/embed/ 

In [14]:
get_submission_scores()

Retrieving scores from competition:playground-series-s5e2
42753436 - 90.27426 -  - 2025-02-07 14:09:37
42753425 -  -  - 2025-02-07 14:08:43
42753373 -  -  - 2025-02-07 14:05:28
42751414 - 47.83047 -  - 2025-02-07 11:28:51
42751405 - 61.44125 -  - 2025-02-07 11:27:19
42751147 - 39.16456 -  - 2025-02-07 11:01:47
42751114 - 39.16456 -  - 2025-02-07 10:59:07
