In [26]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import auc, f1_score, accuracy_score, roc_auc_score, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, GridSearchCV
#from tqdm import tqdm_notebook
#from catboost import CatBoostRegressor
#from lightgbm import LGBMRegressor, LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

In [27]:
train_df = pd.read_csv("train1.csv")
train_df.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,2011-1,616,0.0,4,43,2,0,30.0,1.0,...,0,0,0,0,0,0,0,0,0,1738000
1,1,2011-1,112,0.0,3,33,1,0,15.0,1.0,...,0,0,0,0,0,0,0,0,0,1169000
2,2,2011-1,230,,9,34,1,0,25.0,,...,0,0,0,0,0,0,0,0,0,2821000
3,3,2011-1,302,1.0,4,60,3,0,15.0,0.0,...,0,0,0,0,0,0,0,0,0,5714000
4,4,2011-1,578,0.0,3,49,2,0,30.0,,...,0,0,0,0,0,0,0,0,0,1660000



Для построения модели помимо вышеперечисленного даны 22 переменные с вполне понятными названиями за исключением: kw1 ... kw13 - это факт наличия некоторых ключевых слов в тексте объявления.

In [28]:
train_df["street_id"].min()

0

In [29]:
def is_first(a):
    s=[]
    for b in a:
        if b==1:
            s.append(0)
        else:
            s.append(1)
    return s   

def is_big(rooms,area):
    res=[]
    for i in range (0,len(rooms)):
        if rooms[i]>=3 and area[i]>=70:
            res.append(1)
        elif rooms[i]==2 and area[i]>=50:
            res.append(1)
        elif rooms[i]==1 and area[i]>=35:
            res.append(1)
        else:
            res.append(0)
    return res       

def get_year(data):
    res=[]
    for d in data:
        res.append(d[0:4:])
    return res    
       

    

In [30]:
#вместо цены квартиры будем предсказывать цену метра^2
train_df=train_df[~train_df["price"].isnull()]
train_df=train_df[~train_df["metro_dist"].isnull()]
train_df=train_df[train_df["price"]<50000000]
train_df=train_df[train_df["price"]>700000]
train_df["price"]=train_df["price"]//train_df["area"]
train_df["floor"]=is_first(train_df["floor"].values)
train_df.columns

Index(['id', 'date', 'street_id', 'build_tech', 'floor', 'area', 'rooms',
       'balcon', 'metro_dist', 'g_lift', 'n_photos', 'kw1', 'kw2', 'kw3',
       'kw4', 'kw5', 'kw6', 'kw7', 'kw8', 'kw9', 'kw10', 'kw11', 'kw12',
       'kw13', 'price'],
      dtype='object')

In [31]:
train_df.isnull().sum(axis=0)

id                0
date              0
street_id         0
build_tech    28610
floor             0
area              0
rooms             0
balcon            0
metro_dist        0
g_lift        28306
n_photos          0
kw1               0
kw2               0
kw3               0
kw4               0
kw5               0
kw6               0
kw7               0
kw8               0
kw9               0
kw10              0
kw11              0
kw12              0
kw13              0
price             0
dtype: int64

In [32]:
train_df=train_df.fillna(train_df.median())
train_df

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,2011-1,616,0.0,1,43,2,0,30.0,1.0,...,0,0,0,0,0,0,0,0,0,40418
1,1,2011-1,112,0.0,1,33,1,0,15.0,1.0,...,0,0,0,0,0,0,0,0,0,35424
2,2,2011-1,230,0.0,1,34,1,0,25.0,0.0,...,0,0,0,0,0,0,0,0,0,82970
3,3,2011-1,302,1.0,1,60,3,0,15.0,0.0,...,0,0,0,0,0,0,0,0,0,95233
4,4,2011-1,578,0.0,1,49,2,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,33877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,2012-3,612,0.0,1,36,1,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,108277
99996,99996,2012-3,573,0.0,1,51,2,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,170549
99997,99997,2012-3,550,0.0,1,48,2,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,135375
99998,99998,2012-3,595,1.0,1,51,2,1,15.0,1.0,...,0,0,0,0,0,0,0,0,0,185019


In [33]:
Y=train_df.iloc[:, -1:].values
train_df["big_rooms"]=is_big(train_df["rooms"].values,train_df["area"].values)
train_df["year"]=get_year(train_df["date"].values)
train_df=train_df.drop(["price","n_photos"],axis=1)
train_df["ar_div_room"]=train_df["area"]//train_df["rooms"]
train_df

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw7,kw8,kw9,kw10,kw11,kw12,kw13,big_rooms,year,ar_div_room
0,0,2011-1,616,0.0,1,43,2,0,30.0,1.0,...,0,0,0,0,0,0,0,0,2011,21
1,1,2011-1,112,0.0,1,33,1,0,15.0,1.0,...,0,0,0,0,0,0,0,0,2011,33
2,2,2011-1,230,0.0,1,34,1,0,25.0,0.0,...,0,0,0,0,0,0,0,0,2011,34
3,3,2011-1,302,1.0,1,60,3,0,15.0,0.0,...,0,0,0,0,0,0,0,0,2011,20
4,4,2011-1,578,0.0,1,49,2,0,30.0,0.0,...,0,0,0,0,0,0,0,0,2011,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,2012-3,612,0.0,1,36,1,0,30.0,0.0,...,0,0,0,0,0,0,0,1,2012,36
99996,99996,2012-3,573,0.0,1,51,2,0,30.0,0.0,...,0,0,0,0,0,0,0,1,2012,25
99997,99997,2012-3,550,0.0,1,48,2,0,30.0,0.0,...,0,0,0,0,0,0,0,0,2012,24
99998,99998,2012-3,595,1.0,1,51,2,1,15.0,1.0,...,0,0,0,0,0,0,0,1,2012,25


In [34]:
#l=np.linspace(1,25,25)
#for i in l:
#    train_df[("floor"+str(i))]=0
#train_df    

In [35]:
X=train_df.iloc[: , 2:].values
#Y=train_df.iloc[:, -1:]
from sklearn.preprocessing import minmax_scale
X=minmax_scale(X)
X.shape

(94844, 24)

In [36]:
Y

array([[ 40418],
       [ 35424],
       [ 82970],
       ...,
       [135375],
       [185019],
       [ 95000]], dtype=int64)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.01, random_state=42)

In [38]:
"""
train_mae = []
test_mae = []
#l = [0.36,0.37, 0.38,0.39, 0.4,0.41, 0.42, 0.43]
min_s=[4,5,6,7,8,9,10,11,12,13,14,16]

for n in min_s:
    model=GradientBoostingRegressor( min_samples_leaf=10,loss='huber',learning_rate=0.4, n_estimators=150,min_samples_split=5,max_depth=7,random_state=131)
    temp_train_mae = []
    temp_test_mae = []
    model.fit(X_train, y_train)
    train_mae.append(mean_absolute_error(model.predict(X_train), y_train))
    test_mae.append(mean_absolute_error(model.predict(X_test), y_test))
    print("done", n)
"""

'\ntrain_mae = []\ntest_mae = []\n#l = [0.36,0.37, 0.38,0.39, 0.4,0.41, 0.42, 0.43]\nmin_s=[4,5,6,7,8,9,10,11,12,13,14,16]\n\nfor n in min_s:\n    model=GradientBoostingRegressor( min_samples_leaf=10,loss=\'huber\',learning_rate=0.4, n_estimators=150,min_samples_split=5,max_depth=7,random_state=131)\n    temp_train_mae = []\n    temp_test_mae = []\n    model.fit(X_train, y_train)\n    train_mae.append(mean_absolute_error(model.predict(X_train), y_train))\n    test_mae.append(mean_absolute_error(model.predict(X_test), y_test))\n    print("done", n)\n'

In [39]:
"""
plt.plot(min_s,test_mae,color='red')
plt.plot(min_s,train_mae,color='green')
plt.show 
"""

"\nplt.plot(min_s,test_mae,color='red')\nplt.plot(min_s,train_mae,color='green')\nplt.show \n"

In [40]:
#model2=RandomForestRegressor(max_depth=32,min_samples_split=5,max_features=20,n_estimators=100,random_state=42)


#model2=LGBMRegressor(learning_rate=0.2, n_estimators=550,min_samples_split=5,max_depth=100,random_state=142)

model2=GradientBoostingRegressor(min_samples_leaf=10,loss='huber',learning_rate=0.4, n_estimators=150,min_samples_split=5,max_depth=7,random_state=131)
#best entry GradientBoostingRegressor(learning_rate=0.25, n_estimators=150,min_samples_split=5,max_depth=10,random_state=142)
# L R = 0.33 и 0.4 хуже mae=926

In [41]:
#model1.fit(X_train,y_train)

In [42]:
model2.fit(X_train,y_train)

GradientBoostingRegressor(learning_rate=0.4, loss='huber', max_depth=7,
                          min_samples_leaf=10, min_samples_split=5,
                          n_estimators=150, random_state=131)

In [43]:
#y_pred1=model1.predict(X_test)
y_pred2=model2.predict(X_test)
#y_pred=(99*y_pred2+y_pred1)//100
y_pred=y_pred2

In [44]:
mean_absolute_error(y_pred,y_test)

12613.730540098451

In [45]:
test_df = pd.read_csv("test1.csv")
test_df.head()


Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
0,100000,2012-3,422,0.0,1,59,3,0,25.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,100001,2012-3,380,0.0,10,58,2,0,,,...,0,0,0,1,0,0,0,0,0,0
2,100002,2012-3,362,1.0,3,54,2,0,10.0,,...,0,0,0,0,0,0,0,0,0,0
3,100003,2012-3,34,0.0,4,35,1,0,25.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,100004,2012-3,562,0.0,4,56,3,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
test_df=test_df.fillna(test_df.median())
test_df.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
0,100000,2012-3,422,0.0,1,59,3,0,25.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,100001,2012-3,380,0.0,10,58,2,0,25.0,0.0,...,0,0,0,1,0,0,0,0,0,0
2,100002,2012-3,362,1.0,3,54,2,0,10.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,100003,2012-3,34,0.0,4,35,1,0,25.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,100004,2012-3,562,0.0,4,56,3,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
test_df["street_id"].max()

671

In [48]:
test_df["floor"]=is_first(test_df["floor"].values)
test_df["big_rooms"]=is_big(test_df["rooms"].values,test_df["area"].values)
test_df["year"]=get_year(test_df["date"].values)
test_df=test_df.drop(["n_photos"],axis=1)
test_df["ar_div_room"]=test_df["area"]//test_df["rooms"]
test_df


Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw7,kw8,kw9,kw10,kw11,kw12,kw13,big_rooms,year,ar_div_room
0,100000,2012-3,422,0.0,0,59,3,0,25.0,0.0,...,0,0,0,0,0,0,0,0,2012,19
1,100001,2012-3,380,0.0,1,58,2,0,25.0,0.0,...,1,0,0,0,0,0,0,1,2012,29
2,100002,2012-3,362,1.0,1,54,2,0,10.0,0.0,...,0,0,0,0,0,0,0,1,2012,27
3,100003,2012-3,34,0.0,1,35,1,0,25.0,1.0,...,0,0,0,0,0,0,0,1,2012,35
4,100004,2012-3,562,0.0,1,56,3,0,30.0,0.0,...,0,0,0,0,0,0,0,0,2012,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,199995,2013-9,89,0.0,1,43,2,0,30.0,0.0,...,0,0,0,0,0,0,0,0,2013,21
99996,199996,2013-9,664,1.0,1,62,3,1,30.0,0.0,...,0,0,0,0,0,0,0,0,2013,20
99997,199997,2013-9,358,0.0,1,35,1,1,30.0,0.0,...,0,0,0,0,0,0,0,1,2013,35
99998,199998,2013-9,224,1.0,1,52,2,1,30.0,0.0,...,0,0,0,0,0,0,0,1,2013,26


In [49]:
#test_df["area_room"]=test_df["area"]*test_df["rooms"]
#test_df["square_metro_dist"]=test_df["metro_dist"]**2

In [50]:
X_check=test_df.iloc[:,2:].values
X_check=minmax_scale(X_check)
X_check.shape

(100000, 24)

In [51]:
#y_check1=model1.predict(X_check)
y_check2=model2.predict(X_check)
#y_pred=(149*y_pred2+y_pred1)//150
y_check=y_check2

In [52]:
test_df["price"]=y_check*test_df["area"]

In [53]:
test_df.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw8,kw9,kw10,kw11,kw12,kw13,big_rooms,year,ar_div_room,price
0,100000,2012-3,422,0.0,0,59,3,0,25.0,0.0,...,0,0,0,0,0,0,0,2012,19,3496482.0
1,100001,2012-3,380,0.0,1,58,2,0,25.0,0.0,...,0,0,0,0,0,0,1,2012,29,5947983.0
2,100002,2012-3,362,1.0,1,54,2,0,10.0,0.0,...,0,0,0,0,0,0,1,2012,27,4343702.0
3,100003,2012-3,34,0.0,1,35,1,0,25.0,1.0,...,0,0,0,0,0,0,1,2012,35,1999030.0
4,100004,2012-3,562,0.0,1,56,3,0,30.0,0.0,...,0,0,0,0,0,0,0,2012,18,2763801.0


In [54]:
test_df[["id", "price"]].to_csv("ht5_results.csv", index=False)