In [1]:
import os
import donedeal

import pandas as pd
import numpy as np
import datetime as dt

In [23]:
# Pull the listings through the project's scraper, 750 per batch.
scraper = donedeal.CarScraper()
scraper.scrape(batch_size=750)

# Copy the recommended columns, normalise the raw strings, then filter out junk rows.
df = (
    scraper.DataFrame[donedeal.constants.RECOMMENDED_COLUMNS]
    .copy()
    .pipe(donedeal.data_cleaning.assign_lat_lon)   # 'x,y' string -> (float(x), float(y))
    .pipe(donedeal.data_cleaning.assign_mileage)   # 'x km' / 'x mi' string -> float kilometres (miles converted)
    .pipe(donedeal.data_cleaning.price_to_float)   # 'abc,def' string -> float(abcdef)
    # discard placeholder prices such as 1234, 123456, 123456789, 111111111
    .loc[lambda frame: ~frame['price'].apply(donedeal.data_cleaning.isBsPrice)]
    # the mileage is sometimes made up the same way (sequential digits), so it gets the same check
    .loc[lambda frame: ~frame['kilometers'].apply(donedeal.data_cleaning.isBsPrice)]
)

# Falsy years (empty string, None, 0) become NaN; everything else is cast to float.
df['year'] = df['year'].apply(lambda value: float(value) if value else float('nan'))

# Stamp every row with the moment this cell ran, as the first column.
df.insert(0, 'ScrapeDate', dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

In [None]:
# Scatter of listing coordinates, coloured by county.
# -abs() forces every longitude to be negative.
# NOTE(review): presumably this repairs coordinates that were scraped without their sign - confirm.
(
    df.assign(lon=-df['lon'].abs())
      .plot(
          kind='scatter',
          x='lon',
          y='lat',
          color='county',
          backend='plotly',
          width=500,
          height=500,
      )
)

In [353]:
# Modelling frame: drop the columns not used from here on, then keep complete rows only.
# Every falsy cell ('', 0, None, ...) is turned into None so that dropna() removes its row;
# NaN is truthy, passes through unchanged, and is removed by dropna() as well.
ds = (
    df.copy()
      .drop(columns=['ScrapeDate', 'countyTown', 'friendlyUrl', 'header', 'description'])
      .apply(lambda record: record.apply(lambda cell: cell if cell else None), axis=1)
      .dropna()
)

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
ds

Unnamed: 0,make,model,price,fuelType,trim,transmission,engine,enginePower,acceleration,seats,numDoors,colour,country,owners,roadTax,NCTExpiry,bodyType,year,currency,sellerType,county,lat,lon,kilometers
1,Skoda,Citigo,11495.0,Petrol,Ambition,Manual,1.0 L,60 hp,14.4 sec,4,5,Black,Ireland,2,190,Aug 2024,Hatchback,2018.0,EUR,PRO,Co. Cork,51.873600,-8.470710,40300.00000
3,BMW,5-Series,29750.0,Diesel,M Sport,Automatic,2.0 L,190 hp,7.6 sec,5,4,Blue,Ireland,4,270,Jan 2026,Saloon,2018.0,EUR,PRO,Co. Sligo,54.269414,-8.474957,151277.96000
4,Skoda,Octavia,5950.0,Diesel,Elegance,Manual,2.0 L,150 hp,8.5 sec,5,4,Blue,Ireland,1,190,Jul 2025,Saloon,2015.0,EUR,PRO,Co. Donegal,54.889246,-7.960024,334799.00000
6,Toyota,RAV4,16950.0,Diesel,Luna,Manual,2.0 L,143 hp,9.6 sec,5,4,Grey,Ireland,1,270,Dec 2024,SUV,2017.0,EUR,PRO,Co. Waterford,52.243320,-7.157566,178867.00000
8,Nissan,Note,5950.0,Petrol,SV,Manual,1.2 L,80 hp,13.7 sec,5,5,Black,Ireland,3,190,Jun 2025,MPV,2014.0,EUR,PRIVATE,Co. Kerry,52.446050,-9.485366,212000.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90124,Toyota,Corolla,12450.0,Diesel,Terra,Manual,1.4 L,90 hp,12.5 sec,5,4,Red,Ireland,2,180,Nov 2025,Saloon,2015.0,EUR,PRO,Co. Sligo,54.157609,-8.635683,156500.00000
90125,Ford,C-Max,12950.0,Diesel,Edition,Manual,1.6 L,95 hp,13.3 sec,5,4,Brown,Ireland,2,200,May 2023,MPV,2015.0,EUR,PRO,Co. Sligo,54.157609,-8.635683,152413.00000
90127,Ford,C-Max,15500.0,Diesel,Zetec,Manual,1.5 L,95 hp,13.3 sec,5,4,Blue,Ireland,1,190,Nov 2020,MPV,2016.0,EUR,PRO,Co. Mayo,53.949621,-9.385500,114800.00000
90132,Ford,Focus,15750.0,Diesel,Style,Manual,1.5 L,95 hp,10.8 sec,5,4,Blue,Ireland,3,180,Sep 2021,Hatchback,2017.0,EUR,PRO,Co. Limerick,52.399820,-8.575387,60790.00000


In [358]:
import re
import datetime as dt

# Local, timezone-naive "now", captured once when this cell runs; parse_NCTExpiry measures against it.
today = dt.datetime.now()

def parse_engine(s:str)->float:
    '''
    extracts the number in front of an 'L' unit (case-insensitive), e.g. '1.6 L' -> 1.6 and '2 L' -> 2.0

    s       : text to search
    returns : the first '<number> L' value found as a float, or NaN when there is none
    '''
    # the fractional part is optional, so whole-litre values such as '2 L' are kept
    # instead of falling through to NaN
    found = re.search(r'(\d+(?:\.\d+)?)\s*L',s,re.IGNORECASE)
    return float(found.group(1)) if found else float('nan')
def parse_enginePower(s:str)->float:
    '''
    extracts the number in front of an 'hp' / 'bhp' unit (case-insensitive), e.g. '60 hp' -> 60.0

    s       : text to search
    returns : the first '<number> hp|bhp' value found as a float, or NaN when there is none
    '''
    # an optional fractional part is captured together with the integer part; matching
    # digits only would read '187.5 bhp' as 5.0 (just the digits after the dot)
    found = re.search(r'(\d+(?:\.\d+)?)\s*b?hp',s,re.IGNORECASE)
    return float(found.group(1)) if found else float('nan')
def parse_acceleration(s:str)->float:
    '''
    extracts the number in front of 'sec' (case-insensitive), e.g. '14.4 sec' -> 14.4

    s       : text to search
    returns : the first '<number> sec' value found as a float, or NaN when there is none
    '''
    hit = re.search(r'(\d+\.?\d*)\s*sec',s,re.IGNORECASE)
    if hit is None:
        return float('nan')
    return float(hit.group(1))
def parse_NCTExpiry(s:str, reference:'dt.datetime | None'=None)->int:
    '''
    converts an expiry label such as 'Aug 2024' into a whole-day offset from a reference moment

    s         : month/year text in '%b %Y' form; strptime places it on the 1st of that month at 00:00.
                '%b' is the locale's abbreviated month name
    reference : moment to measure from. when omitted, the notebook-level `today` is used, exactly as
                before; pass a fixed datetime to make the feature reproducible
    returns   : `.days` of (expiry - reference); negative once the expiry lies in the past.
                timedelta.days rounds towards minus infinity, so an expiry 12 hours ago gives -1, not 0
    raises    : ValueError when `s` does not match '%b %Y'
    '''
    if reference is None:
        reference = today
    return (dt.datetime.strptime(s,'%b %Y') - reference).days
def map_strings(column:pd.Series)->pd.DataFrame:
    '''
    given a column of strings, returns a dataframe where each unique string is mapped to a column of 1s and 0s

    column  : named Series; its name prefixes every output column as '<name>_is_<value>'
    returns : DataFrame on `column`'s index with one integer 0/1 column per distinct value,
              ordered by first appearance. an empty `column` gives an empty DataFrame
    '''
    name:str          = column.name
    unique:list[str]  = column.unique().tolist()
    print(name)  # trace of which column is being encoded
    # one vectorised comparison per distinct value, instead of building a dict and a Series
    # for every single row
    indicators = {f'{name}_is_{var}':(column == var).to_numpy().astype(int) for var in unique}
    return pd.DataFrame(indicators,index=column.index)

# Count-like columns are cast to int in a single call.
ds = ds.astype({col:'int' for col in ['seats','numDoors','owners','roadTax','year']})

# Text measurements -> numbers, using the parsers defined earlier in this cell.
ds = ds.assign(
    NCTExpiry    = ds.NCTExpiry.apply(parse_NCTExpiry),
    engine       = ds.engine.apply(parse_engine),
    enginePower  = ds.enginePower.apply(parse_enginePower),
    acceleration = ds.acceleration.apply(parse_acceleration),
)

# Python type name of each column, read from its first row only; sorting by that name
# groups the columns by type, and `ds` is reordered to match.
types = ds.apply(lambda col: col.head(1).apply(lambda x: type(x).__name__).iloc[0],axis=0).sort_values().to_dict()
ds = ds[list(types)]

# type name -> columns of that type
inverse_types = dict()
for col,dtype in types.items():
    inverse_types.setdefault(dtype,[]).append(col)

ds = ds.reset_index(drop=True)

# The modelling cells below read `exploded`, so it has to be built here: every string column
# is replaced by its '<column>_is_<value>' indicator columns from map_strings.
string_columns = inverse_types.get('str',[])
exploded = pd.concat([ds]+[map_strings(ds[col]) for col in string_columns],axis=1).drop(columns=string_columns)

In [344]:
import FFnet

# Two hidden layer sizes; the activation lists carry one entry per hidden size plus a final one.
# NOTE(review): presumably the final (linear) entry belongs to the output layer - confirm against FFnet.
hidden_sizes = [250, 25]
hidden_count = len(hidden_sizes)
activation_functions = [FFnet.ActivationFunctions.tanh for _ in range(hidden_count)] + [FFnet.ActivationFunctions.linear]
activation_function_derivs = [FFnet.ActivationFunctions.d_tanh for _ in range(hidden_count)] + [FFnet.ActivationFunctions.d_linear]

# Every column except 'price' is an input; both arrays are reshaped into stacks of column vectors.
# NOTE(review): the prediction table saved further down shows the same price_p on every row, so the
# fit may have collapsed to a constant - check whether the inputs need scaling.
nn = FFnet.NeuralNetwork(
    Inputs=exploded.drop(columns='price').values.reshape(-1, exploded.shape[1] - 1, 1),
    Targets=exploded.price.values.reshape(-1, 1, 1),
    hidden_sizes=hidden_sizes,
    activation_functions=activation_functions,
    activation_function_derivs=activation_function_derivs,
    CostFunction=FFnet.CostFunctions.MSE,
    lbound=-0.95,
    ubound=0.95,
)

# Two training passes with the same optimizer, the second at a ten times smaller learning rate.
optim = FFnet.Optimizer.Adam(nn, method=dict(Mini=10000))
nn.Train(epochs=2800, learning_rate=0.01, Optimizer=optim, report_every=1)
nn.Train(epochs=2800, learning_rate=0.001, Optimizer=optim, report_every=1)

Epoch: 50 Loss: 0.01009344694859482 Gradient Norm: 0.00038731405892811212

KeyboardInterrupt: 

In [346]:
# Optional extra epochs before plotting (left disabled):
# nn.Train(epochs=10,learning_rate=0.001,Optimizer=optim,report_every=1)

# Scatter of price_t against price_p.
# NOTE(review): judging by the saved table below, price_t is the target and price_p the prediction - confirm.
(
    nn.Predict_to_pandas(
        input_names=exploded.drop(columns='price').columns.tolist(),
        target_names=['price'],
    )
    .plot(
        x='price_t',
        y='price_p',
        kind='scatter',
        backend='plotly',
        width=500,
        height=500,
    )
)

In [348]:
# Network output as a DataFrame, labelled with the input column names and the 'price' target.
pred = nn.Predict_to_pandas(
    input_names=exploded.drop(columns='price').columns.tolist(),
    target_names=['price'],
)
def vector_to_strings(pred:pd.DataFrame)->pd.DataFrame:
    '''
    collapses indicator columns named '<column>_is_<value>' back into one '<column>' holding the values

    for every row and every encoded group, the value whose indicator column holds the largest
    number wins. columns whose names do not follow the pattern pass through untouched.

    pred    : frame that may contain '<column>_is_<value>' columns; it is not modified
    returns : new frame without the indicator columns and with one decoded column per group,
              appended in order of first appearance (or overwriting a column of the same name).
              a frame without indicator columns comes back as an unchanged copy
    '''
    # non-greedy column part: the name ends at the FIRST '_is_', so a value that itself
    # contains '_is_' is kept whole
    pattern = re.compile(r'(?P<column>.*?)_is_(?P<value>.*)')

    value_of:dict = {}   # indicator column name -> value it stands for
    members:dict  = {}   # decoded column name   -> its indicator column names
    for col in pred.columns:
        found = pattern.match(col) if isinstance(col,str) else None
        if found:
            value_of[col] = found.group('value')
            members.setdefault(found.group('column'),[]).append(col)

    decoded = pred.copy()   # leave the caller's frame as it was
    for column,names in members.items():
        # label of the strongest indicator per row, translated back to its value
        decoded[column] = decoded[names].idxmax(axis=1).map(value_of)

    return decoded.drop(columns=list(value_of))

# Fold the '<column>_is_<value>' indicator columns back into readable columns, then display.
pred = pred.pipe(vector_to_strings)
pred

Unnamed: 0,kilometers,lat,engine,enginePower,acceleration,lon,seats,numDoors,year,owners,roadTax,NCTExpiry,price_t,price_p,county,sellerType,currency,make,country,transmission,trim,fuelType,model,bodyType,colour
0,151277.96000,54.269414,2.0,190.0,7.6,-8.474957,5.0,4.0,2018.0,4.0,270.0,517.0,29750.0,20500.059989,Co. Sligo,PRO,EUR,BMW,Ireland,Automatic,M Sport,Diesel,5-Series,Saloon,Blue
1,162543.34000,53.992247,3.0,313.0,5.9,-7.359321,7.0,5.0,2018.0,1.0,600.0,668.0,59995.0,20500.059989,Co. Cavan,PRO,EUR,BMW,Ireland,Automatic,M Sport,Diesel,X5,SUV,White
2,131177.30340,53.992247,2.0,190.0,7.7,-7.359321,5.0,4.0,2016.0,1.0,200.0,91.0,24995.0,20500.059989,Co. Cavan,PRO,EUR,BMW,Ireland,Automatic,M Sport,Diesel,5-Series,Saloon,Grey
3,107825.78000,53.992247,2.0,190.0,6.8,-7.359321,5.0,4.0,2019.0,3.0,200.0,211.0,39995.0,20500.059989,Co. Cavan,PRO,EUR,BMW,Ireland,Automatic,M Sport,Diesel,3-Series,Saloon,Blue
4,112251.46500,53.366943,3.0,265.0,6.5,-6.340699,7.0,4.0,2019.0,2.0,600.0,211.0,68950.0,20500.059989,Co. Dublin,PRO,EUR,BMW,Ireland,Automatic,M Sport,Diesel,X5,SUV,Grey
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8103,170000.00000,54.244934,2.0,110.0,11.9,-6.970013,5.0,5.0,2013.0,1.0,400.0,-913.0,13950.0,20500.059989,Co. Monaghan,PRO,EUR,Volkswagen,Ireland,Manual,SPORT & STYLE,Diesel,Tiguan,SUV,Brown
8104,175418.06000,54.165650,1.6,115.0,10.2,-7.998476,5.0,5.0,2018.0,1.0,190.0,-214.0,17500.0,20500.059989,Co. Leitrim,PRO,EUR,Volkswagen,Ireland,Manual,Trendline,Diesel,Golf,Estate,Green
8105,150000.00000,53.456810,2.0,163.0,8.2,-7.104996,5.0,5.0,2016.0,1.0,190.0,152.0,17000.0,20500.059989,Co. Westmeath,PRO,EUR,BMW,Ireland,Manual,EfficientDynamics Plus,Diesel,3-Series,Estate,Blue
8106,193120.80000,53.456810,2.0,190.0,7.6,-7.104996,5.0,5.0,2016.0,1.0,270.0,183.0,20500.0,20500.059989,Co. Westmeath,PRO,EUR,BMW,Ireland,Manual,xLine,Diesel,X1,SUV,Grey


In [325]:
# Rows per currency.
# NOTE(review): the saved output lists 2 GBP rows next to the EUR ones; no currency conversion
# is visible in this notebook - confirm whether `price` is comparable across them.
pred['currency'].value_counts()

currency
EUR    5017
GBP       2
Name: count, dtype: int64