In [2]:
import pandas as pd
import glob

files = glob.glob("data/*.parquet")
df = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)

In [5]:
df.head()

Unnamed: 0,brand,model,color,seats,year,fuel,capacity,power,body_type,gearbox,...,accident_free,country_of_origin,title,price_pln,price_net_info,location,equipment,posted_date,description,url
0,Fiat,Ducato,Biały,3.0,2024,Diesel,2 197 cm3,140 KM,Furgon (blaszak),Manualna,...,Tak,Polska,Fiat Ducato,119 310,(97 000PLN-Netto),"Sulechowska 26a - 65-735 Zielona Góra, Lubuski...",[],16 października 2025 13:10,Opis\nZgłoś\nGrupa Gezet \nFIAT DUCATO SERIA 2...,https://www.otomoto.pl/dostawcze/oferta/fiat-d...
1,Fiat,DUCATO,Biały,3.0,2016,Diesel,2 287 cm3,130 KM,Furgon (blaszak),Manualna,...,Tak,Francja,Fiat DUCATO,35 900,,"CHYŻNE 207 - 34-481 Chyżne, nowotarski, Małopo...","[Dodatkowe wyposażenie, Wspomaganie kierownicy...",16 października 2025 13:07,"Opis\nZgłoś\nTELEFON :\nWyświetl numer\n,\nWyś...",https://www.otomoto.pl/dostawcze/oferta/fiat-d...
2,Fiat,Ducato,Biały,3.0,2016,Diesel,3 000 cm3,170 KM,Chłodnia/izoterma,Manualna,...,Tak,Polska,Fiat Ducato,75 900,,"WIGURY 7 - 26-021 Daleszyce, kielecki, Świętok...","[Dodatkowe wyposażenie, Alarm, ASR (kontrola t...",16 października 2025 13:05,Opis\nZgłoś\nMarka Iveco Daily\nModel : 70c17\...,https://www.otomoto.pl/dostawcze/oferta/fiat-d...
3,Citroën,jumper,Biały,3.0,2014,Diesel,2 189 cm3,130 KM,Furgon (blaszak),Manualna,...,,,Citroën jumper,29 700,,"Starożytna 71, Kalisz, Polska - 62-800 Kalisz,...","[Dodatkowe wyposażenie, Klimatyzacja manualna]",16 października 2025 13:07,Opis\nZgłoś\nWitam !\nSprzedam Citroena Jumper...,https://www.otomoto.pl/dostawcze/oferta/citroe...
4,Fiat,Ducato,Biały,,2020,Diesel,2 300 cm3,150 KM,Kontener,Manualna,...,,Niemcy,Fiat Ducato,69 700,,"Starożytna 71, Kalisz, Polska - 62-800 Kalisz,...","[Dodatkowe wyposażenie, Klimatyzacja manualna]",16 października 2025 13:05,Opis\nZgłoś\nWitam !\nSprzedam Fiata ducato sp...,https://www.otomoto.pl/dostawcze/oferta/fiat-d...


In [13]:
df['price_pln'] = pd.to_numeric(df['price_pln'].astype(str).str.replace(r'[^0-9\.-]','', regex=True), errors='coerce')
pd.options.display.float_format = '{:.2f}'.format
df['price_pln'].describe()

count      25537.00
mean     1579023.15
std      4920553.06
min          500.00
25%        36900.00
50%        63900.00
75%       144525.00
max     65412015.00
Name: price_pln, dtype: float64

In [16]:
# requires: scikit-learn, pandas, numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# 1) Make sure price is numeric
df['price_pln'] = pd.to_numeric(df['price_pln'].astype(str).str.replace(r'[^0-9\.-]', '', regex=True), errors='coerce')

# 2) Drop rows with missing target or description
df2 = df.loc[df['price_pln'].notna() & df['description'].notna()].copy()

# optional: short descriptions -> keep or remove
df2 = df2[df2['description'].str.strip() != '']

# 3) split
X = df2['description']
y = df2['price_pln'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4) pipeline: TF-IDF -> SVD (dense) -> Decision Tree regressor
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=20000,     # cap vocabulary
        ngram_range=(1,2),      # unigrams + bigrams
        min_df=3,               # drop very rare tokens
        stop_words='english'    # optional
    )),
    ("svd", TruncatedSVD(n_components=100, random_state=42)),  # reduce dimensionality -> dense output
    ("dt", DecisionTreeRegressor(random_state=42, max_depth=10))  # tune max_depth as needed
])

# 5) fit
pipeline.fit(X_train, y_train)

# 6) evaluate
y_pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:.2f}")
print(f"Test R^2:  {r2:.3f}")

# 7) use the pipeline to predict price for a new description:
new_descriptions = ["Example listing description goes here"]
predicted_prices = pipeline.predict(new_descriptions)
print(predicted_prices)


Test RMSE: 3755893.14
Test R^2:  0.387
[287748.98995816]


# Training with parameters 
start = 500
stop = 65412015
step = 1000

and min_samples_per_class = 5 

In [17]:
# requires: scikit-learn, pandas, numpy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# --- 1) Prepare price and description ---
# ensure numeric prices
df['price_pln'] = pd.to_numeric(
    df['price_pln'].astype(str).str.replace(r'[^0-9\.-]', '', regex=True),
    errors='coerce'
)

# drop missing
df2 = df.loc[df['price_pln'].notna() & df['description'].notna()].copy()
df2 = df2[df2['description'].str.strip().astype(bool)]

# --- 2) Make bins from 500 to 65_412_015 step 1000 ---
start = 500
stop = 65412015
step = 1000

# create bin edges (inclusive last edge)
bins = np.arange(start, stop + step, step)  # will generate many bins (~65k)
# use pd.cut to assign bin index (0..n_bins-1). right=False means intervals like [edge, next)
df2['price_bin'] = pd.cut(df2['price_pln'], bins=bins, right=False, labels=False)

# drop rows that fell outside bins (NaN)
df2 = df2[df2['price_bin'].notna()].copy()
df2['price_bin'] = df2['price_bin'].astype(int)

# --- 3) Remove very rare classes so StratifiedKFold works ---
min_samples_per_class = 5  # adjust: minimum samples required per class
class_counts = df2['price_bin'].value_counts()
valid_classes = class_counts.loc[class_counts >= min_samples_per_class].index
df2 = df2[df2['price_bin'].isin(valid_classes)].copy()

# If too many classes remain, consider increasing min_samples_per_class or coarsening step.
print("Number of samples after filtering:", len(df2))
print("Number of classes after filtering:", df2['price_bin'].nunique())

# --- 4) split ---
X = df2['description'].values
y = df2['price_bin'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- 5) pipeline for text -> dense -> classifier ---
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=50000,   # cap vocabulary
        ngram_range=(1,2),
        min_df=2,             # drop very rare tokens
        stop_words='english'  # optional, change if not english
    )),
    ("svd", TruncatedSVD(n_components=100, random_state=42)),  # reduced dense features
    ("clf", DecisionTreeClassifier(random_state=42))
])

# --- 6) cross-validation & grid search ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid = {
    "svd__n_components": [50, 100, 200],    # SVD dims to try (reduce/increase as needed)
    "clf__max_depth": [6, 10, 20, None],    # tune complexity of tree
    "clf__min_samples_leaf": [1, 3, 5]      # optional regularization
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring='accuracy',   # multi-class accuracy; change if you prefer another metric
    verbose=2,
    n_jobs=-1,
    refit=True
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

# --- 7) Evaluate on test set ---
best = grid.best_estimator_
y_pred = best.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

# --- 8) Predict for new descriptions (example) ---
new_descs = ["Example product text here."]
pred_bins = best.predict(new_descs)
# convert bin index back to price range (lower_edge, upper_edge)
pred_ranges = [(int(bins[b]), int(bins[b+1]-1)) for b in pred_bins]  # upper edge -1 to show inclusive
print("Predicted bins:", pred_bins)
print("Predicted price ranges (approx):", pred_ranges)


Number of samples after filtering: 22906
Number of classes after filtering: 374
Fitting 5 folds for each of 36 candidates, totalling 180 fits




[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=50; total time=  12.2s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=200; total time=  32.7s
[CV] END clf__max_depth=6, clf__min_samples_leaf=3, svd__n_components=200; total time=  32.2s
[CV] END clf__max_depth=6, clf__min_samples_leaf=5, svd__n_components=200; total time=  31.6s




[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=100; total time=  18.6s
[CV] END clf__max_depth=6, clf__min_samples_leaf=3, svd__n_components=50; total time=  14.0s
[CV] END clf__max_depth=6, clf__min_samples_leaf=3, svd__n_components=100; total time=  18.0s
[CV] END clf__max_depth=6, clf__min_samples_leaf=5, svd__n_components=50; total time=  13.7s
[CV] END clf__max_depth=6, clf__min_samples_leaf=5, svd__n_components=100; total time=  17.0s
[CV] END clf__max_depth=10, clf__min_samples_leaf=1, svd__n_components=50; total time=  15.5s
[CV] END clf__max_depth=10, clf__min_samples_leaf=1, svd__n_components=100; total time=  19.9s
[CV] END clf__max_depth=10, clf__min_samples_leaf=3, svd__n_components=50; total time=  14.7s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=100; total time=  18.8s
[CV] END clf__max_depth=6, clf__min_samples_leaf=3, svd__n_components=50; total time=  14.4s
[CV] END clf__max_depth=6, clf__min_samples_leaf=3, svd__n_com

## change parameters

change step = 10000 and decrease number of jobs in order to prevent timeout error n_jobs=1,

In [20]:
# requires: scikit-learn, pandas, numpy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# --- 1) Prepare price and description ---
# ensure numeric prices
df['price_pln'] = pd.to_numeric(
    df['price_pln'].astype(str).str.replace(r'[^0-9\.-]', '', regex=True),
    errors='coerce'
)

# drop missing
df2 = df.loc[df['price_pln'].notna() & df['description'].notna()].copy()
df2 = df2[df2['description'].str.strip().astype(bool)]

# --- 2) Make bins from 500 to 65_412_015 step 1000 ---
start = 500
stop = 65412015
step = 10000

# create bin edges (inclusive last edge)
bins = np.arange(start, stop + step, step)  # will generate many bins (~65k)
# use pd.cut to assign bin index (0..n_bins-1). right=False means intervals like [edge, next)
df2['price_bin'] = pd.cut(df2['price_pln'], bins=bins, right=False, labels=False)

# drop rows that fell outside bins (NaN)
df2 = df2[df2['price_bin'].notna()].copy()
df2['price_bin'] = df2['price_bin'].astype(int)

# --- 3) Remove very rare classes so StratifiedKFold works ---
min_samples_per_class = 5  # adjust: minimum samples required per class
class_counts = df2['price_bin'].value_counts()
valid_classes = class_counts.loc[class_counts >= min_samples_per_class].index
df2 = df2[df2['price_bin'].isin(valid_classes)].copy()

# If too many classes remain, consider increasing min_samples_per_class or coarsening step.
print("Number of samples after filtering:", len(df2))
print("Number of classes after filtering:", df2['price_bin'].nunique())

# --- 4) split ---
X = df2['description'].values
y = df2['price_bin'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- 5) pipeline for text -> dense -> classifier ---
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=50000,   # cap vocabulary
        ngram_range=(1,2),
        min_df=2,             # drop very rare tokens
        stop_words='english'  # optional, change if not english
    )),
    ("svd", TruncatedSVD(n_components=100, random_state=42)),  # reduced dense features
    ("clf", DecisionTreeClassifier(random_state=42))
])

# --- 6) cross-validation & grid search ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid = {
    "svd__n_components": [50, 100, 200],    # SVD dims to try (reduce/increase as needed)
    "clf__max_depth": [6, 10, 20, None],    # tune complexity of tree
    "clf__min_samples_leaf": [1, 3, 5]      # optional regularization
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring='accuracy',   # multi-class accuracy; change if you prefer another metric
    verbose=2,
    n_jobs=1,
    refit=True
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

# --- 7) Evaluate on test set ---
best = grid.best_estimator_
y_pred = best.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

# --- 8) Predict for new descriptions (example) ---
new_descs = ["Example product text here."]
pred_bins = best.predict(new_descs)
# convert bin index back to price range (lower_edge, upper_edge)
pred_ranges = [(int(bins[b]), int(bins[b+1]-1)) for b in pred_bins]  # upper edge -1 to show inclusive
print("Predicted bins:", pred_bins)
print("Predicted price ranges (approx):", pred_ranges)


Number of samples after filtering: 23637
Number of classes after filtering: 179
Fitting 5 folds for each of 36 candidates, totalling 180 fits




[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=50; total time=   7.2s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=50; total time=   7.0s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=50; total time=   7.1s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=50; total time=   6.9s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=50; total time=   7.2s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=100; total time=  10.8s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=100; total time=  10.7s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=100; total time=  10.6s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=100; total time=  10.8s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_components=100; total time=  10.8s
[CV] END clf__max_depth=6, clf__min_samples_leaf=1, svd__n_compon

In [2]:
with pd.option_context('display.max_colwidth', None, 'display.width', None):
    print(df.loc[2063, 'description'])

Opis
Zgłoś
ON ROAD TRUCK SERVICES S.A.
ul. Obodrzycka 76, 61-249 Poznań
Niepruszewo, Ul. Świerkowa 2, 64-320 Buk
Witamy na stronie autoryzowanego dealera pojazdów Iveco Daily.
Jesteś zainteresowany? Zadzwoń
Wyświetl numer
Nie odbieramy? Napisz: telemarketing@grupadbk.com skontaktujemy się z Tobą najszybciej jak to będzie możliwe.
VIN: ZCFCE35B5S5705353
Iveco Daily 35S16HD + zabudowa wywrotka
Rozstaw osi 3750
Moc 156KM silnik 2,3L
Skrzynia biegów manualna
Wyposażenie:
Amortyzowany komfortowy fotel kierowcy (3 stopnie regulacji, podparcie lędźwiowe, podłokietnik)
Kamera cofania z liniami pomocniczymi
Okno w tylnej ścianie kabiny
Przełożenie mostu 3.59
Dodatkowy nastawny załączany ogranicznik prędkości
Podwójne resory paraboliczne tylne wzmocnione dla modeli S
Złącza elektryczne przyczepy 12V 13pin DIN
Wzmocnienie specjalne
Zaczep kulowy do przyczepy D-50 (klasa A-50X)
Interfejs danych FMS (do systemów telematycznych firm trzecich)
Układ elektronicznej blokady mechanizmu różnicowego (Trac

In [3]:
with pd.option_context('display.max_colwidth', None, 'display.width', None):
    print(df.loc[2063, 'url'])

https://www.otomoto.pl/dostawcze/oferta/iveco-daily-35s16h-d-ID6Hzw0u.html


In [4]:
mask = df['url'] == 'https://www.otomoto.pl/dostawcze/oferta/iveco-daily-35s16h-d-ID6Hzw0u.html'
df[mask]

Unnamed: 0,brand,model,color,seats,year,fuel,capacity,power,body_type,gearbox,...,accident_free,country_of_origin,title,price_pln,price_net_info,location,equipment,posted_date,description,url
2063,Iveco,Daily 35S16H D,Czerwony,7,2025,Diesel,2 887 cm3,156 KM,Wywrotka,Manualna,...,Tak,,Iveco Daily 35S16H D,242 802,(197 400PLN-Netto),"Świerkowa 2 - 64-320 Buk, poznański, Wielkopol...","[Dodatkowe wyposażenie, ABS, ASR (kontrola tra...",28 października 2025 9:54,Opis\nZgłoś\nON ROAD TRUCK SERVICES S.A.\nul. ...,https://www.otomoto.pl/dostawcze/oferta/iveco-...


In [5]:
# with pd.option_context('display.max_colwidth', None, 'display.max_rows', None):
temp_df = df[df['description'].str.contains('cena', case=False, na=False)]['description']
print(temp_df.head())
temp_df.to_csv(r'.\price_in_ads.txt')


2     Opis\nZgłoś\nMarka Iveco Daily\nModel : 70c17\...
12    Opis\nZgłoś\n★OPIS POJAZDU★\n▬▬▬▬▬▬▬▬▬▬▬▬▬▬\nW...
13    Opis\nZgłoś\nWitam\nPEUGEOT BOXER 2.0 BlueHDI ...
16    Opis\nZgłoś\nFIAT DOBLO CARGO MAXI DŁUGI 1,6 M...
17    Opis\nZgłoś\nGrupa GEZET Autoryzowany Salon FI...
Name: description, dtype: object


In [6]:
df[df['description'].str.contains('cena', case=False, na=False)][['description']]

Unnamed: 0,description
2,Opis\nZgłoś\nMarka Iveco Daily\nModel : 70c17\...
12,Opis\nZgłoś\n★OPIS POJAZDU★\n▬▬▬▬▬▬▬▬▬▬▬▬▬▬\nW...
13,Opis\nZgłoś\nWitam\nPEUGEOT BOXER 2.0 BlueHDI ...
16,"Opis\nZgłoś\nFIAT DOBLO CARGO MAXI DŁUGI 1,6 M..."
17,Opis\nZgłoś\nGrupa GEZET Autoryzowany Salon FI...
...,...
25524,"Opis\nZgłoś\nWitam, do sprzedania Vivaro kupio..."
25528,Opis\nZgłoś\nOFEROWANY SAMOCHÓD:\n- Volkswagen...
25531,Opis\nZgłoś\nSalon Samochodów Używanych RENEW ...
25532,Opis\nZgłoś\nAuto zakupione w polskim salonie!...
