In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
from utils import top_40_cols

df = pd.read_csv('../Dataset/train.csv/train.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month

df.drop(['timestamp'], axis=1, inplace=True)
target = df['price_doc']

columns_to_drop = [column for column in df.columns if column not in top_40_cols]
df.drop(columns=columns_to_drop, axis=1, inplace=True)
missing_column = [column for column in top_40_cols if column not in df.columns]
print(missing_column)
df['price_doc'] = target


df

['product_type_Investment']


Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,product_type,sub_area,...,cafe_count_3000_price_1500,cafe_count_3000_price_2500,sport_count_3000,office_sqm_5000,cafe_count_5000_na_price,cafe_count_5000_price_2500,cafe_count_5000_price_high,year,month,price_doc
0,43,27.0,4.0,,,,,,Investment,Bibirevo,...,16,3,21,807385,12,9,0,2011,8,5850000
1,34,19.0,3.0,,,,,,Investment,Nagatinskij Zaton,...,4,2,19,2690465,9,15,0,2011,8,6000000
2,43,29.0,2.0,,,,,,Investment,Tekstil'shhiki,...,9,3,20,1478160,10,10,0,2011,8,5700000
3,89,50.0,9.0,,,,,,Investment,Mitino,...,10,3,18,244166,4,11,1,2011,9,13100000
4,77,77.0,4.0,,,,,,Investment,Basmannoe,...,262,149,77,8404624,143,319,17,2011,9,16331452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30466,44,27.0,7.0,9.0,1975.0,2.0,6.0,3.0,Investment,Otradnoe,...,15,5,29,838601,18,15,0,2015,6,7400000
30467,86,59.0,3.0,9.0,1935.0,4.0,10.0,3.0,Investment,Tverskoe,...,230,155,80,9949843,136,313,24,2015,6,25000000
30468,45,,10.0,20.0,,1.0,1.0,1.0,OwnerOccupier,Poselenie Vnukovskoe,...,2,1,6,117300,1,1,0,2015,6,6970959
30469,64,32.0,5.0,15.0,2003.0,2.0,11.0,2.0,Investment,Obruchevskoe,...,26,13,33,1225712,11,22,1,2015,6,13500000


In [3]:
print("Columns with NAN: ", df.columns[df.isna().any()].tolist())
# columns with NAN are ['life_sq', 'floor', 'max_floor', 'build_year', 'num_room', 'kitch_sq', 'state', 'metro_min_walk']
# first step is to fill in NaN columns with some values.
# we take different approaches for different columns

# For life_sq, I think it is acceptable that we replace NaN values with the full_sq values of those rows
df['life_sq'].fillna(df['full_sq'], inplace=True)
print("Columns with NAN: ", df.columns[df.isna().any()].tolist())



Columns with NAN:  ['life_sq', 'floor', 'max_floor', 'build_year', 'num_room', 'kitch_sq', 'state', 'metro_min_walk']
Columns with NAN:  ['floor', 'max_floor', 'build_year', 'num_room', 'kitch_sq', 'state', 'metro_min_walk']


In [4]:
from utils import preprocess_train

df_no_outliers = preprocess_train(df)


# # drop rows where life_sq and kitch_sq higher than full_sq
# df = df.drop(df[(df['full_sq'] <= df['life_sq'])].index)
# df = df.drop(df[(df['full_sq'] <= df['kitch_sq'])].index)

# # for max_floor, we could fill NaN with the median max_floor of properties in the same sub_area
# sub_area_medians = df.groupby('sub_area')['max_floor'].median().reset_index()
# # sub_area_medians['max_floor'] = np.ceil(sub_area_medians['max_floor'])
# df = df.merge(sub_area_medians, on='sub_area', suffixes=('', '_median'), how='left')
# df['max_floor'].fillna(df['max_floor_median'], inplace=True)
# df.drop(columns='max_floor_median', inplace=True)

# # and then for floor, we just fill NaN with the max_floor
# df['floor'].fillna(df['max_floor'], inplace=True)

# # finally we replace the max_floor with the floor, if there are any value of floor greater than max_floor(e.g row 63)
# df['max_floor'] = df.apply(lambda row: row['floor'] if row['floor'] > row['max_floor'] else row['max_floor'], axis=1)

# # we do the same for build_year, fill NaN with the median build_year of properties in the same sub_area
# sub_area_medians = df.groupby('sub_area')['build_year'].median().reset_index()
# # sub_area_medians['build_year'] = np.ceil(sub_area_medians['build_year'])
# df = df.merge(sub_area_medians, on='sub_area', suffixes=('', '_median'), how='left')
# df['build_year'].fillna(df['build_year_median'], inplace=True)
# df.drop(columns='build_year_median', inplace=True)

# # for num_room, we shall split the data into different ranges of full_sq value and calculate the average num_room for each range
# # then, we replace the num_room NaN values depending on which range the row's full_sq belongs to
# # Define the ranges for 'full_sq' bins
# bins = [0, 30, 52, 80, float('inf')]  # these values are eyeballed
# print(bins)

# # Use pd.cut to create bins for 'full_sq'
# df['full_sq_bins'] = pd.cut(df['full_sq'], bins=bins)

# # Calculate the average 'num_room' for each 'full_sq' range
# # num_room_averages = df.groupby('full_sq_bins')['num_room'].transform('mean')
# num_room_averages = df.groupby('full_sq_bins')['num_room'].transform(lambda x: np.ceil(x.mean()))
# df['num_room'].fillna(num_room_averages, inplace=True)
# df.drop(columns='full_sq_bins', inplace=True)

# # for kitch_sq, we shall group by sub_area and calculate the average kitch_sq/life_sq proportion, then replace NaN values with the proporiton multiplied by life_sq
# # Calculate the 'kitch_sq/life_sq' for each row
# df['kitch_sq_per_life_sq'] = df['kitch_sq'] / df['life_sq']

# # Calculate the average 'kitch_sq/life_sq' for each 'sub_area'
# sub_area_avg = df.groupby('sub_area')['kitch_sq_per_life_sq'].mean()

# # Define a function to fill NaN values in 'kitch_sq' based on 'sub_area'
# def fill_kitch_sq(row):
#     sub_area = row['sub_area']
#     if pd.notna(row['kitch_sq']):
#         return row['kitch_sq']
#     if sub_area in sub_area_avg:
#         return np.ceil(row['life_sq'] * sub_area_avg[sub_area])
#     return row['kitch_sq']

# # Apply the function to fill NaN values in 'kitch_sq'
# df['kitch_sq'] = df.apply(fill_kitch_sq, axis=1)

# # Drop the 'kitch_sq_per_life_sq' column if you no longer need it
# df.drop(columns='kitch_sq_per_life_sq', inplace=True)

# # we do the same for state, fill NaN with the median state of properties in the same sub_area
# sub_area_medians = df.groupby('sub_area')['state'].median().reset_index()
# # sub_area_medians['build_year'] = np.ceil(sub_area_medians['build_year'])
# df = df.merge(sub_area_medians, on='sub_area', suffixes=('', '_median'), how='left')
# df['state'].fillna(df['state_median'], inplace=True)
# df.drop(columns='state_median', inplace=True)

categorical_cols = ['floor', 'max_floor', 'state', 'product_type', 'sub_area', 'num_room', 'year', 'month']

# def remove_outliers_iqr(df):
#     Q1 = df.quantile(0.25)
#     Q3 = df.quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     return df[~((df < lower_bound) | (df > upper_bound)).any(axis=1)]

# df_drop_categorical = df.drop(columns=categorical_cols, axis=1)
# # Remove rows with outliers
# df_no_outliers = remove_outliers_iqr(df_drop_categorical)
# # scaler = StandardScaler()
# # df_no_outliers_scaled = scaler.fit_transform(df_no_outliers)
# display(df_no_outliers)

# for c in categorical_cols:
#     df_no_outliers[c] = df[c]

# print("Columns with NAN: ", df_no_outliers.columns[df_no_outliers.isna().any()].tolist())
# print("Num NANs: ", df_no_outliers.isnull().sum().sum())

# for c in categorical_cols:
#     df_no_outliers[c] = df_no_outliers[c].astype('category')

# df_no_outliers['price_doc'] = df["price_doc"] * .969 + 10


[0, 30, 52, 80, inf]
Index(['Ajeroport', 'Akademicheskoe', 'Alekseevskoe', 'Altuf'evskoe', 'Arbat',
       'Babushkinskoe', 'Basmannoe', 'Begovoe', 'Beskudnikovskoe', 'Bibirevo',
       ...
       'Vnukovo', 'Vojkovskoe', 'Vostochnoe', 'Vostochnoe Degunino',
       'Vostochnoe Izmajlovo', 'Vyhino-Zhulebino', 'Zamoskvorech'e',
       'Zapadnoe Degunino', 'Zjablikovo', 'Zjuzino'],
      dtype='object', name='sub_area', length=146)


  num_room_averages = df.groupby('full_sq_bins')['num_room'].transform(lambda x: np.ceil(x.mean()))
  return np.ceil(row['life_sq'] * sub_area_avg[sub_area])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_outliers[c] = df[c]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_outliers[c] = df[c]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

In [5]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# display(df[df['sub_area'] == 'Novogireevo'])
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

In [6]:


# # Define a function to remove rows with outliers based on the IQR
# def remove_outliers_iqr(df):
#     Q1 = df.quantile(0.25)
#     Q3 = df.quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     return df[~((df < lower_bound) | (df > upper_bound)).any(axis=1)]

# df_drop_categorical = df.drop(columns=categorical_cols, axis=1)
# # Remove rows with outliers
# df_no_outliers = remove_outliers_iqr(df_drop_categorical)
# # scaler = StandardScaler()
# # df_no_outliers_scaled = scaler.fit_transform(df_no_outliers)
# display(df_no_outliers)

# for c in categorical_cols:
#     df_no_outliers[c] = df[c]

In [7]:
display(df_no_outliers)
# df_no_outliers.to_csv('processed_train.csv', index=False)

Unnamed: 0,full_sq,life_sq,build_year,kitch_sq,metro_min_avto,metro_km_avto,metro_min_walk,kindergarten_km,green_zone_km,industrial_km,...,cafe_count_5000_price_high,price_doc,floor,max_floor,state,product_type,sub_area,num_room,year,month
0,43,27.0,1976.0,8.0,2.590241,1.131260,13.575119,0.145700,0.600973,1.080934,...,0,5668660.00,4.0,12.0,2.0,Investment,Bibirevo,2.0,2011,8
1,34,19.0,1982.0,6.0,0.936700,0.647337,7.620630,0.147754,0.065321,0.966479,...,0,5814010.00,3.0,16.0,2.0,Investment,Nagatinskij Zaton,2.0,2011,8
2,43,29.0,1969.0,9.0,2.120999,1.637996,17.351515,0.049102,0.453172,0.939275,...,0,5523310.00,2.0,9.0,2.0,Investment,Tekstil'shhiki,2.0,2011,8
6,42,27.0,1967.5,8.0,2.175431,1.338707,14.742289,0.112905,0.148957,0.670432,...,1,5135710.00,5.0,9.0,2.0,Investment,Koptevo,2.0,2011,9
7,36,21.0,1980.0,7.0,1.383373,0.396275,4.755297,0.309673,0.191191,2.024971,...,0,1938010.00,9.0,12.0,2.0,Investment,Kuncevo,2.0,2011,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21265,52,30.0,1992.0,8.0,1.207656,0.695893,8.350721,0.621263,0.079039,0.547333,...,1,969010.00,8.0,14.0,2.0,Investment,Mitino,2.0,2015,6
21271,56,29.0,2001.0,11.0,2.622565,1.580238,8.510351,0.225720,0.349807,1.646417,...,0,11628010.00,13.0,14.0,3.0,Investment,Severnoe Tushino,2.0,2015,6
21272,56,51.0,2017.0,1.0,0.815305,0.652244,7.447930,0.414927,0.143626,0.000000,...,0,9943897.69,19.0,19.0,1.0,OwnerOccupier,Sviblovo,2.0,2015,6
21273,44,27.0,1975.0,6.0,1.384021,0.659002,8.158093,0.132645,0.139814,0.702853,...,0,7170610.00,7.0,9.0,3.0,Investment,Otradnoe,2.0,2015,6


In [8]:
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
import xgboost
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, r2_score, mean_squared_error

y = df_no_outliers['price_doc']
X = df_no_outliers.drop(columns='price_doc', axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
one_hot_encoder.fit(X_train[categorical_cols])

feature_names = one_hot_encoder.get_feature_names_out(input_features=categorical_cols)

X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
X_test[feature_names] = one_hot_encoder.transform(X_test[categorical_cols])

X_train = X_train.drop(categorical_cols, axis=1)
X_test = X_test.drop(categorical_cols, axis=1)


X_train.to_csv('X_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)


# te_encoder = TargetEncoder(cols=categorical_cols, min_samples_leaf=5, smoothing=8)

# # Fit the encoder on the encoding split.
# te_encoder.fit(X_train, y_train)

# for c in categorical_cols:
#     X_train[c] = te_encoder.transform(X_train)[c]
#     X_test[c] = te_encoder.transform(X_test)[c]

train_data = xgboost.DMatrix(X_train, label=y_train)
test_data = xgboost.DMatrix(X_test, label=y_test)

params = {
    'objective': 'reg:squarederror',
    'tree_method': 'auto',
    'max_depth': 5,
    'eta': 0.05
}

n = 100
model = xgboost.train(
    params=params,
    dtrain=train_data,
    num_boost_round=n,
    )

pred = model.predict(test_data)

rmse = mean_squared_error(y_test, pred, squared=False)
r2 = r2_score(y_test, pred)

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encoder.transform(X_train[categorical_cols])
  X_train[feature_names] = one_hot_encod

RMSE: 2170894.9642047808
R2: 0.36502090298518786


In [9]:
# TESTING ON ACTUAL TEST DATA FROM HERE
from utils import preprocess_test
test_df = pd.read_csv('../Dataset/test.csv/test.csv')
df = test_df.copy()
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month

df.drop(['timestamp'], axis=1, inplace=True)

columns_to_drop = [column for column in df.columns if column not in top_40_cols]
df.drop(columns=columns_to_drop, axis=1, inplace=True)
# missing_column = [column for column in top_40_cols if column not in df.columns]
# print(missing_column)
# drop rows where life_sq and kitch_sq higher than full_sq
# df = df.drop(df[(df['full_sq'] <= df['life_sq'])].index)
# df = df.drop(df[(df['full_sq'] <= df['kitch_sq'])].index)

df = preprocess_test(df)

df

[0, 30, 52, 80, inf]


  num_room_averages = df.groupby('full_sq_bins')['num_room'].transform(lambda x: np.ceil(x.mean()))


Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,product_type,sub_area,...,cafe_count_3000,cafe_count_3000_price_1500,cafe_count_3000_price_2500,sport_count_3000,office_sqm_5000,cafe_count_5000_na_price,cafe_count_5000_price_2500,cafe_count_5000_price_high,year,month
0,39.00,20.70,2,9,1998.0,1,8.9,3.0,Investment,Juzhnoe Butovo,...,12,7,0,7,37550,2,0,0,2015,7
1,79.20,79.20,8,17,0.0,3,1.0,1.0,OwnerOccupier,Poselenie Vnukovskoe,...,10,2,1,7,177300,2,1,0,2015,7
2,40.50,25.10,3,5,1960.0,2,4.8,2.0,Investment,Perovo,...,37,7,2,22,427889,5,11,0,2015,7
3,62.80,36.00,17,17,2016.0,2,62.8,3.0,OwnerOccupier,Poselenie Voskresenskoe,...,1,0,0,0,0,0,1,0,2015,7
4,40.00,40.00,17,17,0.0,1,1.0,1.0,OwnerOccupier,Poselenie Vnukovskoe,...,9,2,1,6,117300,1,1,0,2015,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7657,52.20,31.80,10,12,1973.0,2,9.1,2.0,Investment,Kon'kovo,...,91,25,7,20,1742694,15,28,1,2016,5
7658,54.09,54.09,14,14,2016.0,2,0.0,1.0,OwnerOccupier,Poselenie Desjonovskoe,...,2,1,0,0,0,0,2,0,2016,5
7659,41.08,1.00,12,12,1.0,1,1.0,1.0,OwnerOccupier,Tverskoe,...,1815,446,255,88,9997846,170,371,26,2016,5
7660,34.80,19.80,8,9,1977.0,1,6.4,2.0,Investment,Orehovo-Borisovo Juzhnoe,...,50,9,4,16,54500,10,7,0,2016,5


In [10]:
# # For life_sq, I think it is acceptable that we replace NaN values with the full_sq values of those rows
# df['life_sq'].fillna(df['full_sq'], inplace=True)

# # for max_floor, we could fill NaN with the median max_floor of properties in the same sub_area
# sub_area_medians = df.groupby('sub_area')['max_floor'].median().reset_index()
# # sub_area_medians['max_floor'] = np.ceil(sub_area_medians['max_floor'])
# df = df.merge(sub_area_medians, on='sub_area', suffixes=('', '_median'), how='left')
# df['max_floor'].fillna(df['max_floor_median'], inplace=True)
# df.drop(columns='max_floor_median', inplace=True)

# # and then for floor, we just fill NaN with the max_floor
# df['floor'].fillna(df['max_floor'], inplace=True)

# # finally we replace the max_floor with the floor, if there are any value of floor greater than max_floor(e.g row 63)
# df['max_floor'] = df.apply(lambda row: row['floor'] if row['floor'] > row['max_floor'] else row['max_floor'], axis=1)

# # we do the same for build_year, fill NaN with the median build_year of properties in the same sub_area
# sub_area_medians = df.groupby('sub_area')['build_year'].median().reset_index()
# # sub_area_medians['build_year'] = np.ceil(sub_area_medians['build_year'])
# df = df.merge(sub_area_medians, on='sub_area', suffixes=('', '_median'), how='left')
# df['build_year'].fillna(df['build_year_median'], inplace=True)
# df.drop(columns='build_year_median', inplace=True)

# # for num_room, we shall split the data into different ranges of full_sq value and calculate the average num_room for each range
# # then, we replace the num_room NaN values depending on which range the row's full_sq belongs to
# # Define the ranges for 'full_sq' bins
# bins = [0, 30, 52, 80, float('inf')]  # these values are eyeballed
# print(bins)

# # Use pd.cut to create bins for 'full_sq'
# df['full_sq_bins'] = pd.cut(df['full_sq'], bins=bins)

# # Calculate the average 'num_room' for each 'full_sq' range
# # num_room_averages = df.groupby('full_sq_bins')['num_room'].transform('mean')
# num_room_averages = df.groupby('full_sq_bins')['num_room'].transform(lambda x: np.ceil(x.mean()))
# df['num_room'].fillna(num_room_averages, inplace=True)
# df.drop(columns='full_sq_bins', inplace=True)

# # for kitch_sq, we shall group by sub_area and calculate the average kitch_sq/life_sq proportion, then replace NaN values with the proporiton multiplied by life_sq
# # Calculate the 'kitch_sq/life_sq' for each row
# df['kitch_sq_per_life_sq'] = df['kitch_sq'] / df['life_sq']

# # Calculate the average 'kitch_sq/life_sq' for each 'sub_area'
# sub_area_avg = df.groupby('sub_area')['kitch_sq_per_life_sq'].mean()

# # Apply the function to fill NaN values in 'kitch_sq'
# df['kitch_sq'] = df.apply(fill_kitch_sq, axis=1)

# # Drop the 'kitch_sq_per_life_sq' column if you no longer need it
# df.drop(columns='kitch_sq_per_life_sq', inplace=True)

# # we do the same for state, fill NaN with the median state of properties in the same sub_area
# sub_area_medians = df.groupby('sub_area')['state'].median().reset_index()
# # sub_area_medians['build_year'] = np.ceil(sub_area_medians['build_year'])
# df = df.merge(sub_area_medians, on='sub_area', suffixes=('', '_median'), how='left')
# df['state'].fillna(df['state_median'], inplace=True)
# df.drop(columns='state_median', inplace=True)


In [11]:
# df_drop_categorical = df.drop(columns=categorical_cols, axis=1)
# # Remove rows with outliers
# # df_no_outliers = remove_outliers_iqr(df_drop_categorical)

# for c in categorical_cols:
#     df_no_outliers[c] = df[c]

for c in categorical_cols:
    df[c] = df[c].astype('category')

df_drop_categorical = df.drop(columns=categorical_cols, axis=1)

for c in categorical_cols:
    df_drop_categorical[c] = df[c]

display(df_drop_categorical)

# df = pd.get_dummies(df, columns=categorical_cols, dtype='int64')

Unnamed: 0,full_sq,life_sq,build_year,kitch_sq,metro_min_avto,metro_km_avto,metro_min_walk,kindergarten_km,green_zone_km,industrial_km,...,cafe_count_5000_price_2500,cafe_count_5000_price_high,floor,max_floor,state,product_type,sub_area,num_room,year,month
0,39.00,20.70,1998.0,8.9,1.258957,0.735908,8.830901,0.078502,0.061485,1.205404,...,0,0,2,9,3.0,Investment,Juzhnoe Butovo,1,2015,7
1,79.20,79.20,0.0,1.0,4.230425,3.444625,41.335498,1.192193,0.000000,0.742377,...,1,0,8,17,1.0,OwnerOccupier,Poselenie Vnukovskoe,3,2015,7
2,40.50,25.10,1960.0,4.8,1.585306,1.122214,13.466563,0.065324,0.580638,0.900408,...,11,0,3,5,2.0,Investment,Perovo,2,2015,7
3,62.80,36.00,2016.0,62.8,7.931398,6.038848,68.559794,3.189083,0.025446,0.466738,...,1,0,17,17,3.0,OwnerOccupier,Poselenie Voskresenskoe,2,2015,7
4,40.00,40.00,0.0,1.0,2.152792,1.722233,20.666800,0.897889,0.427248,0.353642,...,1,0,17,17,1.0,OwnerOccupier,Poselenie Vnukovskoe,1,2015,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7657,52.20,31.80,1973.0,9.1,1.789207,1.179312,13.968183,0.334653,0.199909,0.624466,...,28,1,10,12,2.0,Investment,Kon'kovo,2,2016,5
7658,54.09,54.09,2016.0,0.0,15.839451,12.718568,144.236338,4.415205,0.991824,1.351922,...,2,0,14,14,1.0,OwnerOccupier,Poselenie Desjonovskoe,2,2016,5
7659,41.08,1.00,1.0,1.0,1.482746,1.036568,13.459068,1.048962,0.189089,2.640803,...,371,26,12,12,1.0,OwnerOccupier,Tverskoe,1,2016,5
7660,34.80,19.80,1977.0,6.4,1.469263,0.930198,11.162378,0.073023,0.766444,0.521349,...,7,0,8,9,2.0,Investment,Orehovo-Borisovo Juzhnoe,1,2016,5


In [12]:
# for c in categorical_cols:
#     df_drop_categorical[c] = te_encoder.transform(df_drop_categorical)[c]

df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])

df_drop_categorical = df_drop_categorical.drop(categorical_cols, axis=1)

df_drop_categorical.to_csv('processed_test.csv', index=False)


test_data = xgboost.DMatrix(data=df_drop_categorical, enable_categorical=True)
pred = model.predict(test_data)

prediction_df = pd.DataFrame({
    'id': test_df['id'],
    'price_doc': pred
}).to_csv('xgboost_pred.csv', index=False)

  df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])


  df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])
  df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])
  df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])
  df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])
  df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])
  df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])
  df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])
  df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])
  df_drop_categorical[feature_names] = one_hot_encoder.transform(df_drop_categorical[categorical_cols])
  df_drop_categorical[feature_names] = one_hot_encoder.transform