In [1]:
import numpy as np
import pandas as pd
from scipy.stats import mode
import warnings
import pickle
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse, r2_score as r2
from IPython.display import Image

In [2]:
# Every artefact of this project lives in one directory. It is spelled out
# once here so that running the notebook on another machine means editing a
# single line instead of five.
PROJECT_DIR = 'C:/Users/ASUS/Desktop/Учеба/Библиотеки Python для Data Science/Estate_project_training'

DATASET_PATH = f'{PROJECT_DIR}/test.csv'                    # raw test set (input)
PREPARED_DATASET_PATH = f'{PROJECT_DIR}/test_done.csv'      # cleaned test set (output)
REPORTS_FILE_PATH = f'{PROJECT_DIR}/reports/'               # directory, keeps its trailing slash
SCALER_FILE_PATH = f'{PROJECT_DIR}/scaler.pkl'              # pickled scaler
MODEL_FILE_PATH = f'{PROJECT_DIR}/model.pkl'                # pickled regressor

In [3]:
# Load the raw test set. The file is comma-separated, which is already the
# default delimiter of read_csv.
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,4567,44,1.0,36.84763,19.094182,5.0,5,9.0,1970,0.036122,B,B,24,4378,0,1036.0,1,1,B
1,5925,62,1.0,42.493907,42.568133,10.0,7,17.0,2017,0.072158,B,B,2,629,1,,0,0,A
2,960,27,2.0,59.463678,,9.0,19,19.0,1977,0.211401,B,B,9,1892,0,,0,1,B
3,3848,23,3.0,49.64603,33.893825,6.0,2,2.0,1965,0.014073,B,B,2,475,0,,0,0,B
4,746,74,1.0,53.837056,,1.0,8,17.0,1977,0.309479,B,B,35,7715,4,990.0,0,6,B


In [4]:
# Show the (rows, columns) shape of the loaded frame.
df.shape

(5000, 19)

In [5]:
# Show the dtype pandas inferred for every column.
df.dtypes

Id                 int64
DistrictId         int64
Rooms            float64
Square           float64
LifeSquare       float64
KitchenSquare    float64
Floor              int64
HouseFloor       float64
HouseYear          int64
Ecology_1        float64
Ecology_2         object
Ecology_3         object
Social_1           int64
Social_2           int64
Social_3           int64
Healthcare_1     float64
Helthcare_2        int64
Shops_1            int64
Shops_2           object
dtype: object

In [6]:
# Cast HouseFloor to an integer dtype and Id to strings in one call, then
# show the resulting dtypes of both columns.
df = df.astype({'HouseFloor': 'int64', 'Id': 'str'})
df['HouseFloor'].dtype, df['Id'].dtype

(dtype('int64'), dtype('O'))

In [7]:
# Count how often each category occurs in Ecology_2.
df['Ecology_2'].value_counts()

B    4952
A      48
Name: Ecology_2, dtype: int64

In [8]:
# Binary-encode Ecology_2: 'B' -> 1, anything else -> 0.
# Caution: re-running this cell on the already encoded column yields all
# zeros, because no integer equals the string 'B'.
df['Ecology_2'] = df['Ecology_2'].eq('B').astype(int)
df['Ecology_2'].value_counts()

1    4952
0      48
Name: Ecology_2, dtype: int64

In [9]:
# Count how often each category occurs in Ecology_3.
df['Ecology_3'].value_counts()

B    4851
A     149
Name: Ecology_3, dtype: int64

In [10]:
# Binary-encode Ecology_3: 'B' -> 1, anything else -> 0.
# Caution: re-running this cell on the already encoded column yields all
# zeros, because no integer equals the string 'B'.
df['Ecology_3'] = df['Ecology_3'].eq('B').astype(int)
df['Ecology_3'].value_counts()

1    4851
0     149
Name: Ecology_3, dtype: int64

In [11]:
# Count how often each category occurs in Shops_2.
df['Shops_2'].value_counts()

B    4588
A     412
Name: Shops_2, dtype: int64

In [12]:
# Binary-encode Shops_2: 'B' -> 1, anything else -> 0.
# Caution: re-running this cell on the already encoded column yields all
# zeros, because no integer equals the string 'B'.
df['Shops_2'] = df['Shops_2'].eq('B').astype(int)
df['Shops_2'].value_counts()

1    4588
0     412
Name: Shops_2, dtype: int64

In [13]:
# Summary statistics of the numeric columns, used to spot implausible values.
df.describe()

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
count,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0,5000.0
mean,51.2792,1.91,56.4495,36.15881,5.9768,8.632,12.601,1984.3926,0.119874,0.9904,0.9702,24.9338,5406.9,8.2626,1146.657263,1.3194,4.2428,0.9176
std,44.179466,0.838594,19.092787,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,0.097518,0.170052,17.532202,4026.614773,23.863762,1044.744231,1.47994,4.777365,0.275001
min,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,0.0
25%,21.0,1.0,41.906231,23.092026,1.0,4.0,9.0,1973.0,0.019509,1.0,1.0,6.0,1564.0,0.0,325.0,0.0,1.0,1.0
50%,37.0,2.0,52.92134,32.925087,6.0,7.0,12.0,1977.0,0.072158,1.0,1.0,25.0,5285.0,2.0,900.0,1.0,3.0,1.0
75%,77.0,2.0,66.285129,45.174091,9.0,12.0,17.0,2000.0,0.195781,1.0,1.0,36.0,7287.0,5.0,1548.0,2.0,6.0,1.0
max,212.0,17.0,223.453689,303.071094,620.0,78.0,99.0,2020.0,0.521867,1.0,1.0,74.0,19083.0,141.0,4849.0,6.0,23.0,1.0


In [14]:
# Number of missing values per column.
df.isnull().sum()

Id                  0
DistrictId          0
Rooms               0
Square              0
LifeSquare       1041
KitchenSquare       0
Floor               0
HouseFloor          0
HouseYear           0
Ecology_1           0
Ecology_2           0
Ecology_3           0
Social_1            0
Social_2            0
Social_3            0
Healthcare_1     2377
Helthcare_2         0
Shops_1             0
Shops_2             0
dtype: int64

In [15]:
# Fill missing Healthcare_1 values with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `df['Healthcare_1']`: that chained in-place
# call works on a temporary Series and leaves `df` untouched once pandas
# Copy-on-Write is active.
median_hc = df['Healthcare_1'].median()
df['Healthcare_1'] = df['Healthcare_1'].fillna(median_hc)
df['Healthcare_1'].isna().sum()

0

In [16]:
# Fill missing LifeSquare with 60% of the same row's Square. The fill value
# is a per-row Series (aligned on the index), not a median, hence the name.
# Square has not been cleaned yet at this point, so the estimate inherits any
# outliers still present in it.
# The result is assigned back rather than using a chained
# fillna(..., inplace=True), which does not update `df` under pandas
# Copy-on-Write.
estimated_ls = df['Square'] * 0.6
df['LifeSquare'] = df['LifeSquare'].fillna(estimated_ls)
df['LifeSquare'].isna().sum()

0

In [17]:
# Replace DistrictId == 0 with the column median.
# NOTE(review): DistrictId looks like an identifier, so 0 may be a genuine
# district and a median of ids has no obvious meaning - confirm this matches
# the preprocessing used for training.
# The median is cast to int so the column keeps its int64 dtype (int()
# truncates a fractional median). The masked column is assigned back instead
# of calling where(..., inplace=True) on `df['DistrictId']`, which does not
# update `df` under pandas Copy-on-Write.
median_dist = int(df['DistrictId'].median())
df['DistrictId'] = df['DistrictId'].mask(df['DistrictId'] == 0, median_dist)
# The column is already integral, so it is displayed as-is; the former
# `.round()` here only produced a throw-away copy.
df['DistrictId']

0        44
1        62
2        27
3        23
4        74
       ... 
4995     29
4996     38
4997    101
4998     10
4999     21
Name: DistrictId, Length: 5000, dtype: int64

In [18]:
# Clean Rooms: values below 1 become 2, values above 5 become the median of
# the raw column; the result is then rounded and stored as int64.
# Two fixes compared with the previous version:
#   * `df['Rooms'].round()` discarded its result, so the integer cast
#     truncated instead of rounding - the rounding is now actually applied;
#   * the chained where(..., inplace=True) calls are replaced by assigning the
#     result back, because they do not update `df` under pandas Copy-on-Write.
median_rooms = df['Rooms'].median()
rooms = df['Rooms'].mask(df['Rooms'] < 1, 2)
rooms = rooms.mask(rooms > 5, median_rooms)
df['Rooms'] = rooms.round().astype('int64')
df['Rooms'].unique()

array([1, 2, 3, 4, 5], dtype=int64)

In [19]:
# Replace implausible Square values (below 18 or above 150) with the median
# of the raw column. Both bounds share the same replacement value, so a single
# combined mask is equivalent to the two consecutive passes used before.
# The result is assigned back rather than using a chained
# where(..., inplace=True), which does not update `df` under pandas
# Copy-on-Write.
median_sq = df['Square'].median()
square_out_of_range = (df['Square'] < 18) | (df['Square'] > 150)
df['Square'] = df['Square'].mask(square_out_of_range, median_sq)
df['Square'].max(), df['Square'].min()

(148.78349161593997, 18.14926697395837)

In [20]:
# Replace implausible LifeSquare values - more than 80% of the flat's Square,
# or below 11 - with the column median. Both conditions share the same
# replacement value, so one combined mask is equivalent to the two consecutive
# passes used before.
# NOTE(review): the median can itself exceed 0.8 * Square for small flats, so
# the first constraint is not guaranteed to hold afterwards - confirm this is
# acceptable.
# The result is assigned back rather than using a chained
# where(..., inplace=True), which does not update `df` under pandas
# Copy-on-Write.
median_ls = df['LifeSquare'].median()
ls_implausible = (df['LifeSquare'] > df['Square'] * 0.8) | (df['LifeSquare'] < 11)
df['LifeSquare'] = df['LifeSquare'].mask(ls_implausible, median_ls)
df['LifeSquare'].max(), df['LifeSquare'].min()

(92.4840984101504, 11.070513256260249)

In [21]:
# Replace KitchenSquare values below 5 or above 20 with the median of the raw
# column (one combined mask; both bounds use the same replacement).
# The result is assigned back rather than using a chained
# where(..., inplace=True), which does not update `df` under pandas
# Copy-on-Write.
median_ks = df['KitchenSquare'].median()
kitchen_out_of_range = (df['KitchenSquare'] < 5) | (df['KitchenSquare'] > 20)
df['KitchenSquare'] = df['KitchenSquare'].mask(kitchen_out_of_range, median_ks)

In [22]:
# Replace HouseFloor values above 35 with the column median.
# The median is cast to int so the column keeps its int64 dtype (int()
# truncates a fractional median). The result is assigned back rather than
# using a chained where(..., inplace=True), which does not update `df` under
# pandas Copy-on-Write.
median_hf = int(df['HouseFloor'].median())
df['HouseFloor'] = df['HouseFloor'].mask(df['HouseFloor'] > 35, median_hf)

In [23]:
# Where a flat's Floor is higher than its building's HouseFloor, replace Floor
# with the column median.
# NOTE(review): the median floor can still exceed HouseFloor for low
# buildings, so Floor <= HouseFloor is not guaranteed afterwards - confirm
# this is acceptable.
# The median is cast to int so the column keeps its int64 dtype (int()
# truncates a fractional median). The result is assigned back rather than
# using a chained where(..., inplace=True), which does not update `df` under
# pandas Copy-on-Write.
median_f = int(df['Floor'].median())
df['Floor'] = df['Floor'].mask(df['Floor'] > df['HouseFloor'], median_f)

In [24]:
# Replace HouseYear values after 2020 or before 1950 with the column median
# (one combined mask; both bounds use the same replacement).
# The median is cast to int so the column keeps its int64 dtype (int()
# truncates a fractional median). The result is assigned back rather than
# using a chained where(..., inplace=True), which does not update `df` under
# pandas Copy-on-Write.
median_hy = int(df['HouseYear'].median())
year_out_of_range = (df['HouseYear'] > 2020) | (df['HouseYear'] < 1950)
df['HouseYear'] = df['HouseYear'].mask(year_out_of_range, median_hy)

In [25]:
# Treat Ecology_1 == 0 as missing and replace it with the column median.
# The result is assigned back rather than using a chained
# where(..., inplace=True), which does not update `df` under pandas
# Copy-on-Write.
median_eco_1 = df['Ecology_1'].median()
df['Ecology_1'] = df['Ecology_1'].mask(df['Ecology_1'] == 0, median_eco_1)

In [26]:
# Treat Social_1 == 0 as missing and replace it with the column median.
# The median is cast to int so the column keeps its int64 dtype (int()
# truncates a fractional median). The result is assigned back rather than
# using a chained where(..., inplace=True), which does not update `df` under
# pandas Copy-on-Write.
median_soc_1 = int(df['Social_1'].median())
df['Social_1'] = df['Social_1'].mask(df['Social_1'] == 0, median_soc_1)

In [27]:
# Treat Social_2 == 0 as missing and replace it with the column median.
# The median is cast to int so the column keeps its int64 dtype (int()
# truncates a fractional median). The result is assigned back rather than
# using a chained where(..., inplace=True), which does not update `df` under
# pandas Copy-on-Write.
median_soc_2 = int(df['Social_2'].median())
df['Social_2'] = df['Social_2'].mask(df['Social_2'] == 0, median_soc_2)

In [28]:
# Treat Social_3 == 0 as missing and replace it with the column median.
# NOTE(review): 0 may be a legitimate value for this feature - confirm the
# replacement matches the preprocessing used for training.
# The median is cast to int so the column keeps its int64 dtype (int()
# truncates a fractional median). The result is assigned back rather than
# using a chained where(..., inplace=True), which does not update `df` under
# pandas Copy-on-Write.
median_soc_3 = int(df['Social_3'].median())
df['Social_3'] = df['Social_3'].mask(df['Social_3'] == 0, median_soc_3)

In [29]:
# Treat Healthcare_1 == 0 as missing and replace it with the column median.
# The result is assigned back rather than using a chained
# where(..., inplace=True), which does not update `df` under pandas
# Copy-on-Write.
median_hc_1 = df['Healthcare_1'].median()
df['Healthcare_1'] = df['Healthcare_1'].mask(df['Healthcare_1'] == 0, median_hc_1)

In [30]:
# Treat Helthcare_2 == 0 as missing and replace it with the column median.
# ("Helthcare_2" is the column's actual spelling in the dataset.)
# NOTE(review): 0 may be a legitimate value for this feature - confirm the
# replacement matches the preprocessing used for training.
# The median is cast to int so the column keeps its int64 dtype (int()
# truncates a fractional median). The result is assigned back rather than
# using a chained where(..., inplace=True), which does not update `df` under
# pandas Copy-on-Write.
median_hc_2 = int(df['Helthcare_2'].median())
df['Helthcare_2'] = df['Helthcare_2'].mask(df['Helthcare_2'] == 0, median_hc_2)

In [31]:
# Re-check column dtypes after the cleaning steps above.
df.dtypes

Id                object
DistrictId         int64
Rooms              int64
Square           float64
LifeSquare       float64
KitchenSquare    float64
Floor              int64
HouseFloor         int64
HouseYear          int64
Ecology_1        float64
Ecology_2          int32
Ecology_3          int32
Social_1           int64
Social_2           int64
Social_3           int64
Healthcare_1     float64
Helthcare_2        int64
Shops_1            int64
Shops_2            int32
dtype: object

In [32]:
# Persist the cleaned test set to the configured location. The previous call
# passed the string "PREPARED_DATASET_PATH.csv", which wrote a file with that
# literal name into the working directory instead of using the constant.
df.to_csv(PREPARED_DATASET_PATH, index=False, encoding='utf-8', sep=',')

In [33]:
# Restore the trained regressor from disk and display it.
# SECURITY: pickle.load can execute arbitrary code, so MODEL_FILE_PATH must
# point to a file from a trusted source.
# NOTE(review): a pickled estimator should be loaded with a scikit-learn
# version compatible with the one that saved it - confirm the environment.
with open(MODEL_FILE_PATH, mode='rb') as model_file:
    gb_model = pickle.load(model_file)
gb_model

GradientBoostingRegressor(criterion='mse', learning_rate=0.025, loss='huber',
                          max_depth=6, max_features='sqrt', min_samples_leaf=50,
                          n_estimators=2250, random_state=42)

In [34]:
# Load the sample submission, which provides the Id/Price layout to fill in.
# NOTE(review): this path is hard-coded here rather than defined alongside the
# other *_PATH constants at the top of the notebook.
submit = pd.read_csv('C:/Users/ASUS/Desktop/Учеба/Библиотеки Python для Data Science/Estate_project_training/sample_submission.csv')
submit.head()

Unnamed: 0,Id,Price
0,4567,200000.0
1,5925,200000.0
2,960,200000.0
3,3848,200000.0
4,746,200000.0


In [35]:
# Columns passed to the model, in this exact order.
# NOTE(review): 'Id' is part of the feature list - presumably the model was
# trained with it as well; verify against the training notebook.
f_names = [
    'Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
    'Floor', 'HouseFloor', 'HouseYear',
    'Ecology_1', 'Ecology_2', 'Ecology_3',
    'Social_1', 'Social_2', 'Social_3',
    'Healthcare_1', 'Helthcare_2',
    'Shops_1', 'Shops_2',
]
df = df.loc[:, f_names]
df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,4567,44,1,36.84763,19.094182,5.0,5,9,1970,0.036122,1,1,24,4378,2,1036.0,1,1,1
1,5925,62,1,42.493907,33.337229,10.0,7,17,2017,0.072158,1,1,2,629,1,900.0,1,0,0
2,960,27,2,59.463678,35.678207,9.0,19,19,1977,0.211401,1,1,9,1892,2,900.0,1,1,1
3,3848,23,3,49.64603,33.893825,6.0,2,2,1965,0.014073,1,1,2,475,2,900.0,1,0,1
4,746,74,1,53.837056,32.302233,6.0,8,17,1977,0.309479,1,1,35,7715,4,990.0,1,6,1


In [36]:
# Predict a price for every row of the prepared test set.
# NOTE(review): SCALER_FILE_PATH is defined at the top of the notebook but no
# scaler is loaded or applied before this call - confirm the model was trained
# on unscaled features.
predictions = gb_model.predict(df)
predictions

array([168069.94230574, 114253.11445644, 135220.68911647, ...,
       149073.45838915, 190019.11456751, 289942.40640652])

In [37]:
# Write the predictions into the submission frame.
# `predictions` is a plain array, so the assignment is purely positional:
# row i of `submit` receives the prediction for row i of `df`. Verify that the
# two frames list the same Ids in the same order before filling in prices;
# otherwise prices would silently be attached to the wrong flats.
# Ids are compared as strings because df['Id'] was cast to str earlier while
# submit['Id'] comes straight from read_csv.
submit_ids = submit['Id'].astype(str).to_numpy()
feature_ids = df['Id'].astype(str).to_numpy()
if len(submit_ids) != len(feature_ids) or (submit_ids != feature_ids).any():
    raise ValueError('sample_submission Ids do not match the prepared test set row-for-row')
submit['Price'] = predictions
submit.head()

Unnamed: 0,Id,Price
0,4567,168069.942306
1,5925,114253.114456
2,960,135220.689116
3,3848,128799.649966
4,746,188657.128094


In [38]:
# Write the submission to 'gb_submit.csv' (a relative path, so it is resolved
# against the current working directory), without the index column.
submit.to_csv('gb_submit.csv', index=False)