In [70]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib as plt
import seaborn as sns
import warnings
from pylab import rcParams
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score as r2
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

warnings.filterwarnings('ignore')

pd.options.display.max_columns = 100
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [22]:
# Load the full labelled dataset.
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run
# on another machine without editing this line.
data = pd.read_csv(r'C:\Users\user\Desktop\Kurs_project_task\train.csv')

In [23]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train, valid = train_test_split(data, test_size=0.2, random_state=42)
# Make both parts independent frames. The cleaning helpers further down write
# into them in place with .loc, which pandas may flag (SettingWithCopyWarning)
# when the target frame was derived from another one by indexing.
train, valid = train.copy(), valid.copy()

In [24]:
# (rows, columns) of the full dataset before splitting.
data.shape

(10000, 20)

In [25]:
# (rows, columns) of the training part of the split.
train.shape

(8000, 20)

In [26]:
# (rows, columns) of the validation part of the split.
valid.shape

(2000, 20)

In [27]:
# Peek at the first rows of the training frame.
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
9254,12473,58,1.0,65.271225,62.798045,1.0,3,1.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,210849.693542
1561,14050,146,2.0,45.091598,43.816601,1.0,5,18.0,2017,0.236108,B,B,16,3893,27,,3,10,B,201549.998077
1670,3779,5,1.0,34.463114,20.561823,5.0,3,5.0,1960,0.150818,B,B,16,3433,4,2643.0,4,5,B,158433.168775
6087,3762,90,3.0,61.931107,46.126389,5.0,4,5.0,1966,0.265089,B,B,37,5288,0,1937.0,3,2,B,220194.81843
6669,358,1,1.0,42.67084,,1.0,8,17.0,1977,0.007122,B,B,1,264,0,,0,1,B,125834.270291


In [28]:
# Column dtypes and non-null counts; shows which columns have missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 9254 to 7270
Data columns (total 20 columns):
Id               8000 non-null int64
DistrictId       8000 non-null int64
Rooms            8000 non-null float64
Square           8000 non-null float64
LifeSquare       6299 non-null float64
KitchenSquare    8000 non-null float64
Floor            8000 non-null int64
HouseFloor       8000 non-null float64
HouseYear        8000 non-null int64
Ecology_1        8000 non-null float64
Ecology_2        8000 non-null object
Ecology_3        8000 non-null object
Social_1         8000 non-null int64
Social_2         8000 non-null int64
Social_3         8000 non-null int64
Healthcare_1     4148 non-null float64
Helthcare_2      8000 non-null int64
Shops_1          8000 non-null int64
Shops_2          8000 non-null object
Price            8000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.3+ MB


In [29]:
# Install pandas-profiling by running pip through the interpreter that executes
# this kernel (sys.executable), so the package is installed for that interpreter.
# NOTE(review): no version is pinned here.
import sys
!{sys.executable} -m pip install pandas-profiling



In [31]:
# Build a pandas-profiling report for the training frame; being the last
# expression in the cell, the report object is what the cell displays.
import pandas_profiling
pandas_profiling.ProfileReport(train)



In [32]:
# Mean HouseYear computed only over training rows with HouseYear <= 2020,
# rounded to a whole number. It is passed to the cleaning helpers below as the
# replacement value for years above that cutoff.
mean_year = np.round(train.loc[train['HouseYear'] <= 2020, 'HouseYear'].mean())
mean_year

1985.0

In [36]:
def clean_year(df, mean_year, max_year=2020):
    """Overwrite implausibly late ``HouseYear`` values, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``HouseYear`` column.
    mean_year : number
        Replacement written into every row whose ``HouseYear`` is greater
        than ``max_year``.
    max_year : int, default 2020
        Largest year that is kept as-is. The default reproduces the cutoff
        that used to be hard-coded in this function.

    Returns
    -------
    None
        The frame is updated in place.
    """
    too_late = df['HouseYear'] > max_year
    df.loc[too_late, 'HouseYear'] = mean_year

In [43]:
def clean_objects(df):
    """Encode the two-letter categorical columns as integers, in place.

    ``Shops_2``, ``Ecology_2`` and ``Ecology_3`` are mapped with
    ``"A" -> 0`` and ``"B" -> 1``. Any other value becomes NaN, as produced by
    ``Series.map`` for keys missing from the lookup.

    The function is safe to call more than once on the same frame: a column
    that is already numeric is left untouched. Without that guard a second
    call would push the 0/1 codes through the letter lookup and replace every
    value with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns listed above.

    Returns
    -------
    None
        The frame is updated in place.
    """
    object_map = {"A": 0, "B": 1}
    for column in ("Shops_2", "Ecology_2", "Ecology_3"):
        if pd.api.types.is_numeric_dtype(df[column]):
            # Already encoded by an earlier call; re-mapping would destroy it.
            continue
        df[column] = df[column].map(object_map)

In [44]:
def clean_life_square(df, min_life_square=5, life_share=0.85):
    """Fill missing or too-small ``LifeSquare`` values from ``Square``, in place.

    A row is repaired when its ``LifeSquare`` is null or strictly below
    ``min_life_square``; the new value is ``Square * life_share`` taken from
    the same row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``LifeSquare`` and ``Square`` columns.
    min_life_square : number, default 5
        Values strictly below this threshold are replaced.
    life_share : float, default 0.85
        Fraction of ``Square`` used as the replacement.

    Both defaults reproduce the constants that used to be hard-coded here.

    Returns
    -------
    None
        The frame is updated in place.
    """
    needs_fill = df['LifeSquare'].isnull() | (df['LifeSquare'] < min_life_square)
    # The right-hand side is a full-length Series; .loc aligns it on the index,
    # so each repaired row receives the value derived from its own Square.
    df.loc[needs_fill, 'LifeSquare'] = df['Square'] * life_share

In [45]:
def prepare_data(df, mean_year=mean_year):
    """Apply all cleaning steps to ``df`` in place.

    Steps, in order: fix out-of-range ``HouseYear`` values, repair
    ``LifeSquare``, then encode the letter-valued categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place and nothing is returned.
    mean_year : number, optional
        Replacement year handed to ``clean_year``. The default is whatever the
        global ``mean_year`` held when this ``def`` was executed (Python binds
        default arguments at definition time).
    """
    clean_year(df, mean_year)
    for cleaner in (clean_life_square, clean_objects):
        cleaner(df)

In [46]:
# Clean both splits in place. Neither call passes mean_year, so the validation
# frame is cleaned with the same default replacement year as the training frame.
prepare_data(train)
prepare_data(valid)

In [154]:
# Columns passed to the model for fitting and prediction.
feats = ['DistrictId', 'Ecology_1', 'Floor', 'Square', 'LifeSquare', 'Rooms', 'HouseYear']

In [47]:
#scaler = StandardScaler()

In [48]:
#train_scaled = scaler.fit_transform(train)

In [55]:
#valid_scaled = scaler.transform(valid)

In [221]:
# Random forest regressor: 50 trees, tree depth capped at 12, fixed seed so
# repeated fits give the same model.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=12,
    random_state=42,
)

In [222]:
# Fit on the selected feature columns of the training split, with Price as target.
model.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [223]:
# Predictions for the same rows the model was fitted on (in-sample).
pred_train = model.predict(train.loc[:, feats])

In [224]:
# R² on the training split (r2 is sklearn's r2_score). The model has seen these
# rows, so treat this score as an optimistic upper bound.
r2(train['Price'], pred_train)

0.8859705650186895

In [225]:
# Predictions for the held-out validation rows.
pred_valid = model.predict(valid.loc[:, feats])

In [226]:
# R² on the held-out validation split; compare with the training score to
# gauge overfitting.
r2(valid['Price'], pred_valid)

0.6657199472884365

Prediction with RandomForestRegressor

In [None]:
# Load the test set that predictions are generated for.
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run
# on another machine without editing this line.
test = pd.read_csv(r'C:\Users\user\Desktop\Kurs_project_task\test.csv')

In [None]:
# (rows, columns) of the test set.
test.shape

In [None]:
# Apply the same in-place cleaning to the test frame; mean_year is not passed,
# so prepare_data's default replacement year is used.
prepare_data(test)

In [None]:
# Predict with the fitted model and store the result in the 'Price' column.
test['Price'] = model.predict(test.loc[:, feats])

In [None]:
# Peek at the first test rows, now including the predicted Price.
test.head()

In [None]:
# Column dtypes and non-null counts of the test frame after cleaning and prediction.
test.info()

In [5]:
# Write only the Id and Price columns to CSV in the working directory, without
# the DataFrame index.
test.loc[:, ['Id', 'Price']].to_csv('DLetyuk_predictions.csv', index=False)