In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv("Houses.csv")
df = data.copy()
df['price'] = df['price'].replace('[^\d.]', '', regex=True).astype(int)
df['area'] = df['area'].replace('[^\d]', '', regex=True).astype(int)
df['rooms'] = df['rooms'].replace('[^\d]', '', regex=True).astype(int)
df['title'] = df['title'].str.strip()
df = df.drop(columns=['Unnamed: 0'])
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608 entries, 0 to 607
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     608 non-null    object
 1   price     608 non-null    int32 
 2   area      608 non-null    int32 
 3   rooms     608 non-null    int32 
 4   interior  532 non-null    object
dtypes: int32(3), object(2)
memory usage: 16.8+ KB


Unnamed: 0,title,price,area,rooms,interior
0,Flat Rijnstraat,2500,81,3,Part-furnished
1,Flat Julius Pergerstraat,2100,85,3,Furnished
2,Flat Westerdok,1950,70,2,Furnished
3,Flat Reestraat,2750,85,4,Part-furnished
4,Flat Frederiksplein,2500,85,3,Furnished
...,...,...,...,...,...
603,Flat Victoria Welbystraat,1190,55,2,
604,Flat Herengracht 120 F,3000,93,4,
605,Flat Jan Evertsenstraat 649,2350,95,3,Furnished
606,Flat Derde Egelantiersdwarsstraat,1650,40,2,Furnished


In [74]:
for i in ['title','area','rooms','interior']:
      print("column: ",i)
      print(df[i].unique())

column:  title
['Flat Rijnstraat' 'Flat Julius Pergerstraat' 'Flat Westerdok'
 'Flat Reestraat' 'Flat Frederiksplein' 'Flat Martini van Geffenstraat'
 'Flat Bloemgracht' 'Flat Willem de Zwijgerlaan' 'Flat Amstelkade 180 3'
 'Flat Willem Parelstraat 182' 'Flat Backershagen'
 'Flat Nieuwegrachtje 2 2' 'Flat Dirk van Hasseltssteeg 34'
 'Flat Wiborgeiland 121' 'Flat Prinsengracht 474 A' 'Flat Vrolikstraat'
 'Flat Egidiusstraat' 'Flat Groenmarktkade' 'Flat Mosveld'
 'Flat Ceintuurbaan' 'Flat Hudsonstraat 67 3' 'Flat Keizersgracht'
 'Flat Lamarckhof' 'Flat Emmalaan 19 C' 'Flat Spuistraat'
 'Flat Henri Polaklaan' 'Flat Victoria Welbystraat'
 'Flat Herengracht 120 F' 'Flat Jan Evertsenstraat 649'
 'Flat Derde Egelantiersdwarsstraat' 'Flat Eerste Weteringdwarsstraat'
 'Flat Nieuwezijds Voorburgwal' 'Flat Rustenburgerstraat']
column:  area
[ 81  85  70 116  56  65  51  35  40 110  95 100 118  68  59  57  84  69
  55  93  60  77]
column:  rooms
[3 2 4 5 1]
column:  interior
['Part-furnished' 'Fur

In [75]:
LE = LabelEncoder()
label_dict = {}
for i in ['title','area','rooms','interior']:
    df[i] = LE.fit_transform(df[i])        
    label_dict[i] = {label:encoded_label for label, encoded_label in zip(LE.classes_,LE.transform(LE.classes_))}

In [76]:
df

Unnamed: 0,title,price,area,rooms,interior
0,24,2500,13,2,1
1,15,2100,15,2,0
2,29,1950,11,1,0
3,23,2750,15,3,1
4,9,2500,15,2,0
...,...,...,...,...,...
603,27,1190,3,1,3
604,12,3000,16,3,3
605,14,2350,17,2,0
606,4,1650,1,1,0


In [77]:
label_dict

{'title': {'Flat Amstelkade 180 3': 0,
  'Flat Backershagen': 1,
  'Flat Bloemgracht': 2,
  'Flat Ceintuurbaan': 3,
  'Flat Derde Egelantiersdwarsstraat': 4,
  'Flat Dirk van Hasseltssteeg 34': 5,
  'Flat Eerste Weteringdwarsstraat': 6,
  'Flat Egidiusstraat': 7,
  'Flat Emmalaan 19 C': 8,
  'Flat Frederiksplein': 9,
  'Flat Groenmarktkade': 10,
  'Flat Henri Polaklaan': 11,
  'Flat Herengracht 120 F': 12,
  'Flat Hudsonstraat 67 3': 13,
  'Flat Jan Evertsenstraat 649': 14,
  'Flat Julius Pergerstraat': 15,
  'Flat Keizersgracht': 16,
  'Flat Lamarckhof': 17,
  'Flat Martini van Geffenstraat': 18,
  'Flat Mosveld': 19,
  'Flat Nieuwegrachtje 2 2': 20,
  'Flat Nieuwezijds Voorburgwal': 21,
  'Flat Prinsengracht 474 A': 22,
  'Flat Reestraat': 23,
  'Flat Rijnstraat': 24,
  'Flat Rustenburgerstraat': 25,
  'Flat Spuistraat': 26,
  'Flat Victoria Welbystraat': 27,
  'Flat Vrolikstraat': 28,
  'Flat Westerdok': 29,
  'Flat Wiborgeiland 121': 30,
  'Flat Willem Parelstraat 182': 31,
  'Flat

In [78]:
df.isnull()

Unnamed: 0,title,price,area,rooms,interior
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
603,False,False,False,False,False
604,False,False,False,False,False
605,False,False,False,False,False
606,False,False,False,False,False


In [79]:
df.isnull().sum()

title       0
price       0
area        0
rooms       0
interior    0
dtype: int64

In [80]:
X = df.drop('price',axis=1)

In [81]:
y = df['price']

In [83]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import time
import warnings
warnings.filterwarnings('ignore')

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=37)

models = [LinearRegression(), Ridge(), Lasso(),
          ElasticNet(), DecisionTreeRegressor(),
          RandomForestRegressor(), GradientBoostingRegressor(),
          SVR(),BaggingRegressor(), CatBoostRegressor()
]

scores_df = pd.DataFrame(columns=['Model','Score','Time'])

for model in models:
    start = time.time()
    model.fit(X_train,y_train)
    end = time.time()
    
    new_data = pd.DataFrame({'Model': [type(model).__name__],
                         'Score': [model.score(X_test, y_test)],
                         'Time': [end - start]})
    
    scores_df = pd.concat([scores_df, new_data], ignore_index=True)

    
    print(f"{type(model).__name__} - {model.score(X_test,y_test)}")

LinearRegression - 0.5754035432308406
Ridge - 0.5752729950556281
Lasso - 0.5754863577579945
ElasticNet - 0.5230743725449868
DecisionTreeRegressor - 1.0
RandomForestRegressor - 1.0
GradientBoostingRegressor - 0.9989085486909156
SVR - 0.08064134199144246
BaggingRegressor - 1.0
Learning rate set to 0.036532
0:	learn: 719.2938224	total: 497us	remaining: 497ms
1:	learn: 701.2289586	total: 931us	remaining: 465ms
2:	learn: 681.4045555	total: 1.36ms	remaining: 451ms
3:	learn: 663.9798767	total: 1.79ms	remaining: 447ms
4:	learn: 647.0998837	total: 2.23ms	remaining: 443ms
5:	learn: 628.0413137	total: 2.66ms	remaining: 441ms
6:	learn: 610.0149091	total: 3.08ms	remaining: 437ms
7:	learn: 593.7208965	total: 3.52ms	remaining: 436ms
8:	learn: 577.8813113	total: 3.86ms	remaining: 425ms
9:	learn: 562.0025479	total: 4.3ms	remaining: 426ms
10:	learn: 547.6733936	total: 4.71ms	remaining: 424ms
11:	learn: 531.8797799	total: 5.37ms	remaining: 442ms
12:	learn: 518.1989157	total: 5.86ms	remaining: 445ms
13:	l

205:	learn: 13.4012148	total: 80.9ms	remaining: 312ms
206:	learn: 13.1883663	total: 81.4ms	remaining: 312ms
207:	learn: 12.9868661	total: 81.8ms	remaining: 311ms
208:	learn: 12.7973470	total: 82.2ms	remaining: 311ms
209:	learn: 12.5895267	total: 82.6ms	remaining: 311ms
210:	learn: 12.4296221	total: 83ms	remaining: 310ms
211:	learn: 12.2904832	total: 83.4ms	remaining: 310ms
212:	learn: 12.1071316	total: 83.8ms	remaining: 310ms
213:	learn: 11.9976696	total: 84.2ms	remaining: 309ms
214:	learn: 11.7417736	total: 84.5ms	remaining: 309ms
215:	learn: 11.4958272	total: 84.9ms	remaining: 308ms
216:	learn: 11.3936105	total: 85.2ms	remaining: 308ms
217:	learn: 11.2327096	total: 85.6ms	remaining: 307ms
218:	learn: 11.0384313	total: 86ms	remaining: 307ms
219:	learn: 10.8356513	total: 86.4ms	remaining: 306ms
220:	learn: 10.6884481	total: 86.8ms	remaining: 306ms
221:	learn: 10.5030997	total: 87.2ms	remaining: 306ms
222:	learn: 10.3785873	total: 87.6ms	remaining: 305ms
223:	learn: 10.1894544	total: 88

677:	learn: 0.0609021	total: 252ms	remaining: 119ms
678:	learn: 0.0605282	total: 252ms	remaining: 119ms
679:	learn: 0.0601484	total: 253ms	remaining: 119ms
680:	learn: 0.0596134	total: 253ms	remaining: 119ms
681:	learn: 0.0585735	total: 253ms	remaining: 118ms
682:	learn: 0.0581433	total: 254ms	remaining: 118ms
683:	learn: 0.0573078	total: 254ms	remaining: 117ms
684:	learn: 0.0568312	total: 255ms	remaining: 117ms
685:	learn: 0.0561607	total: 255ms	remaining: 117ms
686:	learn: 0.0557204	total: 255ms	remaining: 116ms
687:	learn: 0.0547781	total: 256ms	remaining: 116ms
688:	learn: 0.0543820	total: 256ms	remaining: 116ms
689:	learn: 0.0539926	total: 256ms	remaining: 115ms
690:	learn: 0.0530859	total: 257ms	remaining: 115ms
691:	learn: 0.0527906	total: 257ms	remaining: 114ms
692:	learn: 0.0523822	total: 257ms	remaining: 114ms
693:	learn: 0.0520102	total: 258ms	remaining: 114ms
694:	learn: 0.0517157	total: 258ms	remaining: 113ms
695:	learn: 0.0510677	total: 259ms	remaining: 113ms
696:	learn: 

In [84]:
scores_df.sort_values(by = "Score",ascending = False)

Unnamed: 0,Model,Score,Time
4,DecisionTreeRegressor,1.0,0.004954
5,RandomForestRegressor,1.0,0.289257
8,BaggingRegressor,1.0,0.015954
9,CatBoostRegressor,1.0,0.474976
6,GradientBoostingRegressor,0.998909,0.046662
2,Lasso,0.575486,0.005953
0,LinearRegression,0.575404,0.006965
1,Ridge,0.575273,0.007971
3,ElasticNet,0.523074,0.006974
7,SVR,0.080641,0.01097


In [85]:
best = CatBoostRegressor()
best.fit(X_train,y_train)

Learning rate set to 0.036532
0:	learn: 719.2938224	total: 2.56ms	remaining: 2.56s
1:	learn: 701.2289586	total: 4.64ms	remaining: 2.31s
2:	learn: 681.4045555	total: 7ms	remaining: 2.33s
3:	learn: 663.9798767	total: 8.99ms	remaining: 2.24s
4:	learn: 647.0998837	total: 11.8ms	remaining: 2.34s
5:	learn: 628.0413137	total: 13.8ms	remaining: 2.29s
6:	learn: 610.0149091	total: 15.7ms	remaining: 2.22s
7:	learn: 593.7208965	total: 17.8ms	remaining: 2.21s
8:	learn: 577.8813113	total: 19.6ms	remaining: 2.16s
9:	learn: 562.0025479	total: 21.4ms	remaining: 2.12s
10:	learn: 547.6733936	total: 23.2ms	remaining: 2.09s
11:	learn: 531.8797799	total: 25.1ms	remaining: 2.07s
12:	learn: 518.1989157	total: 27ms	remaining: 2.05s
13:	learn: 506.2409296	total: 28.9ms	remaining: 2.04s
14:	learn: 491.3041994	total: 31ms	remaining: 2.04s
15:	learn: 478.1294562	total: 32.9ms	remaining: 2.02s
16:	learn: 465.0427015	total: 34.7ms	remaining: 2.01s
17:	learn: 453.0773827	total: 36.4ms	remaining: 1.99s
18:	learn: 442.

273:	learn: 4.9495503	total: 339ms	remaining: 899ms
274:	learn: 4.8897159	total: 340ms	remaining: 896ms
275:	learn: 4.8056520	total: 340ms	remaining: 892ms
276:	learn: 4.7123053	total: 341ms	remaining: 889ms
277:	learn: 4.6295371	total: 341ms	remaining: 886ms
278:	learn: 4.5493578	total: 341ms	remaining: 882ms
279:	learn: 4.4872203	total: 342ms	remaining: 879ms
280:	learn: 4.4221540	total: 342ms	remaining: 875ms
281:	learn: 4.3463968	total: 342ms	remaining: 872ms
282:	learn: 4.2943213	total: 343ms	remaining: 869ms
283:	learn: 4.2379868	total: 343ms	remaining: 865ms
284:	learn: 4.1656481	total: 344ms	remaining: 862ms
285:	learn: 4.1113126	total: 344ms	remaining: 859ms
286:	learn: 4.0531958	total: 344ms	remaining: 855ms
287:	learn: 4.0020918	total: 345ms	remaining: 852ms
288:	learn: 3.9560823	total: 345ms	remaining: 849ms
289:	learn: 3.8945396	total: 345ms	remaining: 846ms
290:	learn: 3.8283469	total: 346ms	remaining: 842ms
291:	learn: 3.7910057	total: 346ms	remaining: 839ms
292:	learn: 

777:	learn: 0.0233614	total: 511ms	remaining: 146ms
778:	learn: 0.0229710	total: 512ms	remaining: 145ms
779:	learn: 0.0228318	total: 512ms	remaining: 144ms
780:	learn: 0.0227029	total: 512ms	remaining: 144ms
781:	learn: 0.0223629	total: 513ms	remaining: 143ms
782:	learn: 0.0221128	total: 513ms	remaining: 142ms
783:	learn: 0.0219607	total: 513ms	remaining: 141ms
784:	learn: 0.0217133	total: 514ms	remaining: 141ms
785:	learn: 0.0215702	total: 514ms	remaining: 140ms
786:	learn: 0.0213272	total: 514ms	remaining: 139ms
787:	learn: 0.0211240	total: 515ms	remaining: 138ms
788:	learn: 0.0210142	total: 515ms	remaining: 138ms
789:	learn: 0.0209208	total: 515ms	remaining: 137ms
790:	learn: 0.0207855	total: 516ms	remaining: 136ms
791:	learn: 0.0204476	total: 516ms	remaining: 135ms
792:	learn: 0.0202582	total: 516ms	remaining: 135ms
793:	learn: 0.0200037	total: 517ms	remaining: 134ms
794:	learn: 0.0198217	total: 517ms	remaining: 133ms
795:	learn: 0.0195701	total: 517ms	remaining: 133ms
796:	learn: 

<catboost.core.CatBoostRegressor at 0x267fa76ce10>

In [70]:
best.save_model("model.cbm")