## Курсовой проект для курса "Python для Data Science"

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as LR, Ridge
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.metrics import r2_score as r2, mean_squared_error as mse
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw training and test tables from CSV files in the working directory.
data_base = pd.read_csv('train.csv')
t_data_base = pd.read_csv('test.csv')

In [3]:
# Work on a copy so the raw training table in data_base stays untouched.
data = data_base.copy()

In [4]:
def prepare_lifesquare(df, df_source):
    """Clean the ``LifeSquare`` column of ``df`` in place and return ``df``.

    Values above 1000 are divided by 100; afterwards missing values are
    filled with the median of ``df_source['LifeSquare']``.  When
    ``df_source`` is the same object as ``df``, that median is computed
    after the rescaling step has already been applied.
    """
    too_large = df['LifeSquare'] > 1000
    # The right-hand side is aligned on the index, so only masked rows change.
    df.loc[too_large, 'LifeSquare'] = df['LifeSquare'] / 100

    fill_value = df_source['LifeSquare'].median()
    df['LifeSquare'] = df['LifeSquare'].fillna(fill_value)
    return df

In [5]:
def prepare_square(df):
    """Repair implausible ``Square`` values of ``df`` in place and return ``df``.

    Three corrections are applied one after another:
      1. where ``LifeSquare`` exceeds ``Square``, ``Square`` is set to
         ``LifeSquare``;
      2. ``Square`` is capped at ``Rooms * 40 + KitchenSquare``;
      3. ``Square`` is raised to at least ``Rooms * 15``.
    """
    life_exceeds_total = df['LifeSquare'] > df['Square']
    df.loc[life_exceeds_total, 'Square'] = df['LifeSquare']

    # The masks are recomputed after each step because Square has changed.
    upper_bound = df['Rooms'] * 40 + df['KitchenSquare']
    above_upper = df['Square'] > upper_bound
    df.loc[above_upper, 'Square'] = upper_bound

    lower_bound = df['Rooms'] * 15
    below_lower = df['Square'] < lower_bound
    df.loc[below_lower, 'Square'] = lower_bound
    return df

In [6]:
# Mean price for every (district, number of rooms) combination.
data_stat1 = (
    data.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price'})
)

In [7]:
def join_stats1(df, data_stat1, df_source, mode):
    """Attach the ``mean_price`` column from ``data_stat1`` to ``df``.

    The statistics are left-joined on ``DistrictId`` and ``Rooms`` and a new
    frame is returned.  Only when ``mode == 'test'`` are the rows without a
    match filled with the mean of ``df_source['mean_price']``; in any other
    mode unmatched rows keep NaN and ``df_source`` is not read.
    """
    merged = df.merge(data_stat1, how='left', on=['DistrictId', 'Rooms'])
    if mode != 'test':
        return merged

    fallback = df_source['mean_price'].mean()
    merged['mean_price'] = merged['mean_price'].fillna(fallback)
    return merged

In [8]:
# Price of one square metre for every row, kept both as a standalone
# one-column frame and as a column of `data`.
price_per_meter = (data['Price'] / data['Square']).to_frame()
data['price_per_meter'] = price_per_meter
# Mean price per square metre within each district.
data_stat2 = (
    data.groupby(['DistrictId'], as_index=False)[['price_per_meter']]
    .mean()
    .rename(columns={'price_per_meter': 'price_per_meter_by_district'})
)

In [9]:
def join_stats2(df, data_stat2, df_source, mode):
    """Attach ``price_per_meter_by_district`` from ``data_stat2`` to ``df``.

    The statistics are left-joined on ``DistrictId`` and a new frame is
    returned.  Only when ``mode == 'test'`` are the rows without a match
    filled with the mean of ``df_source['price_per_meter_by_district']``;
    in any other mode unmatched rows keep NaN and ``df_source`` is not read.
    """
    column = 'price_per_meter_by_district'
    merged = df.merge(data_stat2, how='left', on=['DistrictId'])
    if mode != 'test':
        return merged

    fallback = df_source[column].mean()
    merged[column] = merged[column].fillna(fallback)
    return merged


In [10]:
def prepare_df(df, df_source, data_stat1, data_stat2, mode='train'):
    """Clean a raw frame and attach the district statistics used as features.

    The outlier fixes and the LifeSquare/Square repairs modify the passed
    ``df`` in place; the two joins then produce new frames, and the final
    joined frame is returned.

    Parameters:
        df: frame to prepare.
        df_source: frame supplying the ``LifeSquare`` median and, when
            ``mode == 'test'``, the means of ``mean_price`` and
            ``price_per_meter_by_district`` used to fill unmatched rows
            (so in that mode it must already contain both columns).
        data_stat1: per (DistrictId, Rooms) statistics with ``mean_price``.
        data_stat2: per DistrictId statistics with
            ``price_per_meter_by_district``.
        mode: ``'test'`` enables filling of unmatched statistics; any other
            value leaves them as NaN.
    """
    # Hypothesis 1: fix the outliers in HouseYear.
    for wrong_year, fixed_year in ((20052011, 2008), (4968, 1968)):
        df.loc[df['HouseYear'] == wrong_year, 'HouseYear'] = fixed_year

    # Hypothesis 2: fix the outliers in Rooms.
    for wrong_rooms, fixed_rooms in ((19, 1), (10, 2), (6, 2), (0, 1)):
        df.loc[df['Rooms'] == wrong_rooms, 'Rooms'] = fixed_rooms

    # Hypothesis 3: divide LifeSquare values above 1000 by 100 and fill the
    # missing values with the median.
    df = prepare_lifesquare(df, df_source)

    # Hypothesis 5: raise Square to LifeSquare where it is smaller, then
    # clamp the remaining Square outliers.
    df = prepare_square(df)

    # Hypothesis 6: add the mean price per district and number of rooms.
    df = join_stats1(df, data_stat1, df_source, mode)

    # Hypothesis 7: where Floor exceeds HouseFloor, set HouseFloor to Floor.
    floor_above_house = df['Floor'] > df['HouseFloor']
    df.loc[floor_above_house, 'HouseFloor'] = df.loc[floor_above_house, 'Floor']

    # Hypothesis 8: add the mean price per square metre of the district.
    df = join_stats2(df, data_stat2, df_source, mode)

    return df

In [11]:
# Prepare the training copy; the frame also serves as its own df_source
# (default mode='train', so unmatched statistics are not filled).
data = prepare_df(data, data, data_stat1, data_stat2)

In [12]:
# Feature columns fed to the models.
features = data[['Rooms', 'Square', 'LifeSquare', 'HouseYear', 'mean_price', 
                 'HouseFloor', 'price_per_meter_by_district']]
# Target kept as a one-column DataFrame (double brackets).
target= data[['Price']]

In [13]:
def test_hypothesis(features, target, test_size=0.3, random_state=42):
    """Fit three regressors on a train/test split and print their scores.

    Linear regression, ridge regression and a random forest are trained on
    the same split; for each one a line with the test-set mean squared
    error followed by the R^2 score is printed, in that order of models.

    Parameters:
        features: feature table passed to ``train_test_split``.
        target: target values; a one-column table is flattened to 1-D
            before fitting.
        test_size: share of rows held out for scoring.
        random_state: seed for both the split and the random forest, so
            repeated runs print the same numbers.

    Returns:
        None.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # A single-column 2-D target is flattened so the estimators receive the
    # 1-D shape they expect; genuinely multi-output targets are left as is.
    y_fit = np.asarray(y_train)
    if y_fit.ndim == 2 and y_fit.shape[1] == 1:
        y_fit = y_fit.ravel()

    # Seeding the forest makes its printed metrics reproducible.
    models = (LR(), Ridge(), RF(random_state=random_state))
    for model in models:
        model.fit(X_train, y_fit)
        y_pred = model.predict(X_test)
        print(mse(y_test, y_pred), r2(y_test, y_pred))

In [14]:
# Compare the three regressors on the engineered features.
test_hypothesis(features, target)

2675245666.28519 0.6889883906084198
2675222190.276896 0.6889911198198764
2455063833.6298866 0.7145857056497718


## Проверка модели

In [15]:
# Start over from a fresh copy of the raw training table.
data = data_base.copy()

In [16]:
# Names of the feature columns used for fitting and prediction below.
feats=['Rooms', 'Square', 'LifeSquare', 'HouseYear', 'mean_price', 
       'HouseFloor', 'price_per_meter_by_district']

### Определение оптимальных параметров

In [17]:
# Hyperparameter grid for the random forest search (4 * 4 * 3 combinations).
params = {'min_samples_leaf': [3, 5, 7, 10], 
          'n_estimators': [25, 50, 75, 100],
          'max_depth': [3, 5, 7]
         }

In [18]:
# Exhaustive search over `params` with 2-fold cross-validation.
regr = GridSearchCV(RF(), param_grid=params, cv=2)

In [19]:
# Prepare the fresh copy of the training data for the grid search.
data = prepare_df(data, data, data_stat1, data_stat2)

In [20]:
# The grid search itself is left disabled; uncomment the lines below to
# re-run it and print the best parameters and score.
# regr.fit(data.loc[:,feats], data['Price'])
# print(regr.best_params_)
# print(regr.best_score_)

### Проверка на валидационной выборке

In [21]:
# Random forest with fixed hyperparameters and a fixed seed.
# NOTE(review): presumably these values came from the grid search above — confirm.
RFR = RF(max_depth=7, min_samples_leaf=5, n_estimators=75, random_state=42)

In [22]:
# Fresh copy of the raw training table for the hold-out check.
data = data_base.copy()

In [23]:
# Hold out 30% of the rows for validation; the seed keeps the split fixed.
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [24]:
# Prepare the training part, using it as its own df_source (mode='train').
train = prepare_df(train, train, data_stat1, data_stat2)

In [25]:
# Fit the forest on the prepared training part only.
RFR.fit(train.loc[:,feats], train.Price)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=5, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=75,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [26]:
# Prepare the hold-out part; gaps in the statistics are filled from `train`.
# NOTE(review): data_stat1/data_stat2 were computed from the full training
# table, which includes these hold-out rows and their prices, so the
# validation score may be optimistic — confirm whether that is intended.
valid = prepare_df(valid, train, data_stat1, data_stat2, mode='test')

In [27]:
# Predict prices for the hold-out rows and print MSE followed by R^2.
y_pred = RFR.predict(valid.loc[:, feats])
    
print(mse(valid.Price, y_pred), r2(valid.Price, y_pred))

2350361512.838564 0.7267579101342894


## Обучение модели

In [29]:
# Start again from the raw table and prepare the complete training set.
data = data_base.copy()
data = prepare_df(data, data, data_stat1, data_stat2)

In [30]:
# Fit the forest on all prepared training rows for the final predictions.
RFR.fit(data.loc[:,feats], data.Price)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=5, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=75,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

## Расчёт цены для тестовой выборки

In [31]:
# Reuse the test table already loaded into t_data_base instead of reading
# test.csv a second time; the copy keeps the raw table untouched.
test = t_data_base.copy()

In [32]:
# Prepare the test table; gaps in the statistics are filled from the
# prepared training frame `data`.
test = prepare_df(test, data, data_stat1, data_stat2, mode='test')

In [33]:
# Store the predicted prices in a new Price column of the test table.
test['Price'] = RFR.predict(test.loc[:, feats])

In [34]:
# Write the Id/Price pairs to the submission file without the index column.
test[['Id', 'Price']].to_csv('AKonstantinov_predictions.csv', index=False)