Data Exploration:
- Read the data
    - Note down data's relevancy, suitability, quality and cleanliness
    - Plotting out graphs showing suitable trends
        - Note down suitable features that we should use
        - Currently, in the dataset, they provided information about the house and macro information about the house’s surroundings and environment

Model Consideration
- Find examples of models that can predict time series data well
    - Consider their strengths and their weaknesses
        -Example weakness: Does not work well in high dimension input data
    -Discuss how the raw data should be transformed to yield the best outcome for each model


# General Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import datasets

Data Files
- train.csv, test.csv: information about individual transactions. The rows are indexed by the "id" field, which refers to individual transactions (particular properties might appear more than once, in separate transactions). These files also include supplementary information about the local area of each property.
- macro.csv: data on Russia's macroeconomy and financial sector (could be joined to the train and test sets on the "timestamp" column)
- sample_submission.csv: an example submission file in the correct format
- data_dictionary.txt: explanations of the fields available in the other data files

In [2]:
train_df = pd.read_csv('Dataset/train.csv/train.csv')
train_df

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30466,30469,2015-06-30,44,27.0,7.0,9.0,1.0,1975.0,2.0,6.0,...,15,5,0,15,26,1,2,84,6,7400000
30467,30470,2015-06-30,86,59.0,3.0,9.0,2.0,1935.0,4.0,10.0,...,313,128,24,98,182,1,82,171,15,25000000
30468,30471,2015-06-30,45,,10.0,20.0,1.0,,1.0,1.0,...,1,1,0,2,12,0,1,11,1,6970959
30469,30472,2015-06-30,64,32.0,5.0,15.0,1.0,2003.0,2.0,11.0,...,22,1,1,6,31,1,4,65,7,13500000


In [3]:
pd.set_option('display.max_rows', None)
print(train_df.isnull().sum())
pd.set_option('display.max_rows', 10)

id                                           0
timestamp                                    0
full_sq                                      0
life_sq                                   6383
floor                                      167
max_floor                                 9572
material                                  9572
build_year                               13605
num_room                                  9572
kitch_sq                                  9572
state                                    13559
product_type                                 0
sub_area                                     0
area_m                                       0
raion_popul                                  0
green_zone_part                              0
indust_part                                  0
children_preschool                           0
preschool_quota                           6688
preschool_education_centers_raion            0
children_school                              0
school_quota 

In [4]:
new_df = train_df.dropna(axis=0,how='any')
new_df

# Shows that there are (30471 - 6042) NaN rows dropped

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
8056,8059,2013-05-21,11,11.0,2.0,5.0,2.0,1907.0,1.0,12.0,...,339,135,26,133,207,1,89,161,10,2750000
8154,8157,2013-05-27,45,27.0,6.0,9.0,1.0,1970.0,2.0,6.0,...,2,1,0,3,8,1,0,19,3,7100000
8287,8290,2013-06-03,77,50.0,3.0,5.0,2.0,1957.0,3.0,8.0,...,214,85,21,48,89,1,54,146,12,11700000
8387,8390,2013-06-07,56,29.0,5.0,16.0,5.0,1987.0,2.0,10.0,...,74,32,8,19,34,0,10,121,7,10400000
8391,8394,2013-06-07,31,21.0,5.0,9.0,5.0,1962.0,1.0,5.0,...,99,43,8,17,41,1,18,93,5,6200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30462,30465,2015-06-30,47,30.0,23.0,25.0,4.0,2016.0,1.0,10.0,...,99,57,12,23,42,1,13,123,7,10544070
30463,30466,2015-06-30,56,29.0,13.0,14.0,1.0,2001.0,2.0,11.0,...,14,2,0,13,12,0,1,53,6,12000000
30467,30470,2015-06-30,86,59.0,3.0,9.0,2.0,1935.0,4.0,10.0,...,313,128,24,98,182,1,82,171,15,25000000
30469,30472,2015-06-30,64,32.0,5.0,15.0,1.0,2003.0,2.0,11.0,...,22,1,1,6,31,1,4,65,7,13500000


In [5]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6042 entries, 8056 to 30470
Columns: 292 entries, id to price_doc
dtypes: float64(119), int64(157), object(16)
memory usage: 13.5+ MB


In [6]:
macro_df = pd.read_csv('Dataset/macro.csv/macro.csv')
macro_df

Unnamed: 0,timestamp,oil_urals,gdp_quart,gdp_quart_growth,cpi,ppi,gdp_deflator,balance_trade,balance_trade_growth,usdrub,...,provision_retail_space_modern_sqm,turnover_catering_per_cap,theaters_viewers_per_1000_cap,seats_theather_rfmin_per_100000_cap,museum_visitis_per_100_cap,bandwidth_sports,population_reg_sports_share,students_reg_sports_share,apartment_build,apartment_fund_sqm
0,2010-01-01,76.1000,,,,,,,,,...,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
1,2010-01-02,76.1000,,,,,,,,,...,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2,2010-01-03,76.1000,,,,,,,,,...,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
3,2010-01-04,76.1000,,,,,,,,29.9050,...,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
4,2010-01-05,76.1000,,,,,,,,29.8360,...,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,2016-10-15,44.3677,19979.4,-0.6,531.0,601.9,133.16,5.823,2.6,62.9573,...,,,,,,,,,,
2480,2016-10-16,44.3677,19979.4,-0.6,531.0,601.9,133.16,5.823,2.6,62.9573,...,,,,,,,,,,
2481,2016-10-17,44.3677,19979.4,-0.6,531.0,601.9,133.16,5.823,2.6,63.0856,...,,,,,,,,,,
2482,2016-10-18,44.3677,19979.4,-0.6,531.0,601.9,133.16,5.823,2.6,62.9512,...,,,,,,,,,,


In [7]:
merge_df = train_df.merge(macro_df, on='timestamp', how='outer')
merge_df

# Correct number of columns after merging macro_df with train_df

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,provision_retail_space_modern_sqm,turnover_catering_per_cap,theaters_viewers_per_1000_cap,seats_theather_rfmin_per_100000_cap,museum_visitis_per_100_cap,bandwidth_sports,population_reg_sports_share,students_reg_sports_share,apartment_build,apartment_fund_sqm
0,1.0,2011-08-20,43.0,27.0,4.0,,,,,,...,271.0,6943.0,565.0,0.45356,1240.0,269768.0,22.37,64.12,23587.0,230310.0
1,2.0,2011-08-23,34.0,19.0,3.0,,,,,,...,271.0,6943.0,565.0,0.45356,1240.0,269768.0,22.37,64.12,23587.0,230310.0
2,3.0,2011-08-27,43.0,29.0,2.0,,,,,,...,271.0,6943.0,565.0,0.45356,1240.0,269768.0,22.37,64.12,23587.0,230310.0
3,4.0,2011-09-01,89.0,50.0,9.0,,,,,,...,271.0,6943.0,565.0,0.45356,1240.0,269768.0,22.37,64.12,23587.0,230310.0
4,5.0,2011-09-05,77.0,77.0,4.0,,,,,,...,271.0,6943.0,565.0,0.45356,1240.0,269768.0,22.37,64.12,23587.0,230310.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31789,,2016-10-15,,,,,,,,,...,,,,,,,,,,
31790,,2016-10-16,,,,,,,,,...,,,,,,,,,,
31791,,2016-10-17,,,,,,,,,...,,,,,,,,,,
31792,,2016-10-18,,,,,,,,,...,,,,,,,,,,


In [8]:
pd.set_option('display.max_rows', None)
merge_df.isnull().sum()
pd.set_option('display.max_rows', 10)

In [9]:
newMerge_df = merge_df.dropna(axis=0,how='any')
newMerge_df

# Problem with merging on timestamp with macro
# There isn't a single row left

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,provision_retail_space_modern_sqm,turnover_catering_per_cap,theaters_viewers_per_1000_cap,seats_theather_rfmin_per_100000_cap,museum_visitis_per_100_cap,bandwidth_sports,population_reg_sports_share,students_reg_sports_share,apartment_build,apartment_fund_sqm


# Tackling Feature Choice
- Trivial Case
    - Drop all NaN rows and train with the remaining rows
        - Problem: There are only 6042 rows left

In [10]:
train_df

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30466,30469,2015-06-30,44,27.0,7.0,9.0,1.0,1975.0,2.0,6.0,...,15,5,0,15,26,1,2,84,6,7400000
30467,30470,2015-06-30,86,59.0,3.0,9.0,2.0,1935.0,4.0,10.0,...,313,128,24,98,182,1,82,171,15,25000000
30468,30471,2015-06-30,45,,10.0,20.0,1.0,,1.0,1.0,...,1,1,0,2,12,0,1,11,1,6970959
30469,30472,2015-06-30,64,32.0,5.0,15.0,1.0,2003.0,2.0,11.0,...,22,1,1,6,31,1,4,65,7,13500000


In [11]:
train_noNA_df = train_df.dropna(axis=0,how='any')
train_noNA_df

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
8056,8059,2013-05-21,11,11.0,2.0,5.0,2.0,1907.0,1.0,12.0,...,339,135,26,133,207,1,89,161,10,2750000
8154,8157,2013-05-27,45,27.0,6.0,9.0,1.0,1970.0,2.0,6.0,...,2,1,0,3,8,1,0,19,3,7100000
8287,8290,2013-06-03,77,50.0,3.0,5.0,2.0,1957.0,3.0,8.0,...,214,85,21,48,89,1,54,146,12,11700000
8387,8390,2013-06-07,56,29.0,5.0,16.0,5.0,1987.0,2.0,10.0,...,74,32,8,19,34,0,10,121,7,10400000
8391,8394,2013-06-07,31,21.0,5.0,9.0,5.0,1962.0,1.0,5.0,...,99,43,8,17,41,1,18,93,5,6200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30462,30465,2015-06-30,47,30.0,23.0,25.0,4.0,2016.0,1.0,10.0,...,99,57,12,23,42,1,13,123,7,10544070
30463,30466,2015-06-30,56,29.0,13.0,14.0,1.0,2001.0,2.0,11.0,...,14,2,0,13,12,0,1,53,6,12000000
30467,30470,2015-06-30,86,59.0,3.0,9.0,2.0,1935.0,4.0,10.0,...,313,128,24,98,182,1,82,171,15,25000000
30469,30472,2015-06-30,64,32.0,5.0,15.0,1.0,2003.0,2.0,11.0,...,22,1,1,6,31,1,4,65,7,13500000


In [12]:
print(train_noNA_df.info())
print(train_noNA_df.dtypes)
train_noNA_df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6042 entries, 8056 to 30470
Columns: 292 entries, id to price_doc
dtypes: float64(119), int64(157), object(16)
memory usage: 13.5+ MB
None
id                      int64
timestamp              object
full_sq                 int64
life_sq               float64
floor                 float64
                       ...   
mosque_count_5000       int64
leisure_count_5000      int64
sport_count_5000        int64
market_count_5000       int64
price_doc               int64
Length: 292, dtype: object


Unnamed: 0,id,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
count,6042.0,6042.0,6042.0,6042.0,6042.0,6042.0,6042.0,6042.0,6042.0,6042.0,...,6042.0,6042.0,6042.0,6042.0,6042.0,6042.0,6042.0,6042.0,6042.0,6042.0
mean,19849.99007,53.374545,33.198941,6.564548,11.586064,2.049156,5271.443,1.997021,7.391096,2.403012,...,62.326879,21.463588,3.597153,27.559914,51.001821,0.632075,16.797418,84.975339,9.038894,8750423.0
std,6081.011373,24.011941,20.190005,4.960885,6.216155,1.495809,257944.0,0.889523,3.806198,0.790574,...,95.232817,37.640043,7.236304,37.620903,60.351762,0.680212,26.734757,43.921642,4.178001,6277290.0
min,8059.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,500000.0
25%,14576.25,38.0,20.0,3.0,7.0,1.0,1964.0,1.0,5.0,2.0,...,9.0,2.0,0.0,8.0,17.0,0.0,2.0,55.0,5.0,5800000.0
50%,19874.5,46.0,29.0,5.0,10.0,1.0,1972.0,2.0,7.0,2.0,...,21.0,4.0,0.0,11.0,27.0,1.0,6.0,75.0,10.0,7400000.0
75%,25150.75,63.0,42.0,9.0,16.0,2.0,1990.0,3.0,9.0,3.0,...,55.0,14.0,2.0,23.0,46.0,1.0,12.0,106.0,12.0,10500000.0
max,30473.0,637.0,637.0,77.0,48.0,6.0,20052010.0,17.0,96.0,33.0,...,376.0,146.0,29.0,151.0,250.0,2.0,106.0,215.0,20.0,95122500.0


### Output is continuous variable, there are object values in some columns
    - Another trivial case: Drop all object column and do simple regression model

In [13]:
print(train_noNA_df.select_dtypes(include='object').columns)
train_noNA_df_numeric = train_noNA_df.select_dtypes(include='number')
train_noNA_df_numeric

Index(['timestamp', 'product_type', 'sub_area', 'culture_objects_top_25',
       'thermal_power_plant_raion', 'incineration_raion',
       'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion',
       'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion',
       'water_1line', 'big_road1_1line', 'railroad_1line', 'ecology'],
      dtype='object')


Unnamed: 0,id,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
8056,8059,11,11.0,2.0,5.0,2.0,1907.0,1.0,12.0,3.0,...,339,135,26,133,207,1,89,161,10,2750000
8154,8157,45,27.0,6.0,9.0,1.0,1970.0,2.0,6.0,3.0,...,2,1,0,3,8,1,0,19,3,7100000
8287,8290,77,50.0,3.0,5.0,2.0,1957.0,3.0,8.0,2.0,...,214,85,21,48,89,1,54,146,12,11700000
8387,8390,56,29.0,5.0,16.0,5.0,1987.0,2.0,10.0,2.0,...,74,32,8,19,34,0,10,121,7,10400000
8391,8394,31,21.0,5.0,9.0,5.0,1962.0,1.0,5.0,3.0,...,99,43,8,17,41,1,18,93,5,6200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30462,30465,47,30.0,23.0,25.0,4.0,2016.0,1.0,10.0,1.0,...,99,57,12,23,42,1,13,123,7,10544070
30463,30466,56,29.0,13.0,14.0,1.0,2001.0,2.0,11.0,3.0,...,14,2,0,13,12,0,1,53,6,12000000
30467,30470,86,59.0,3.0,9.0,2.0,1935.0,4.0,10.0,3.0,...,313,128,24,98,182,1,82,171,15,25000000
30469,30472,64,32.0,5.0,15.0,1.0,2003.0,2.0,11.0,2.0,...,22,1,1,6,31,1,4,65,7,13500000


In [14]:
# Drop id column
train_noNA_df_numeric = train_noNA_df_numeric.drop('id', axis='columns')
train_noNA_df_numeric

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,area_m,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
8056,11,11.0,2.0,5.0,2.0,1907.0,1.0,12.0,3.0,1.007156e+07,...,339,135,26,133,207,1,89,161,10,2750000
8154,45,27.0,6.0,9.0,1.0,1970.0,2.0,6.0,3.0,8.889467e+06,...,2,1,0,3,8,1,0,19,3,7100000
8287,77,50.0,3.0,5.0,2.0,1957.0,3.0,8.0,2.0,4.662813e+06,...,214,85,21,48,89,1,54,146,12,11700000
8387,56,29.0,5.0,16.0,5.0,1987.0,2.0,10.0,2.0,1.803644e+07,...,74,32,8,19,34,0,10,121,7,10400000
8391,31,21.0,5.0,9.0,5.0,1962.0,1.0,5.0,3.0,1.880000e+07,...,99,43,8,17,41,1,18,93,5,6200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30462,47,30.0,23.0,25.0,4.0,2016.0,1.0,10.0,1.0,9.629358e+06,...,99,57,12,23,42,1,13,123,7,10544070
30463,56,29.0,13.0,14.0,1.0,2001.0,2.0,11.0,3.0,9.249237e+06,...,14,2,0,13,12,0,1,53,6,12000000
30467,86,59.0,3.0,9.0,2.0,1935.0,4.0,10.0,3.0,7.307411e+06,...,313,128,24,98,182,1,82,171,15,25000000
30469,64,32.0,5.0,15.0,1.0,2003.0,2.0,11.0,2.0,6.050065e+06,...,22,1,1,6,31,1,4,65,7,13500000


In [15]:
train_noNA_df_numeric.corr()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,area_m,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
full_sq,1.000000,0.856345,0.200456,0.283062,0.064619,-0.008767,0.677960,0.402315,-0.057500,0.007485,...,0.182624,0.180142,0.183898,0.157010,0.163534,0.123307,0.165024,0.148878,-0.010628,0.675455
life_sq,0.856345,1.000000,0.124117,0.140861,0.035447,-0.011010,0.583203,0.201836,-0.114944,0.000990,...,0.209645,0.211150,0.215235,0.181320,0.185768,0.105711,0.193871,0.178399,0.012895,0.524602
floor,0.200456,0.124117,1.000000,0.596279,0.039001,0.003762,-0.006258,0.232916,-0.009736,0.018937,...,-0.016917,-0.012158,-0.015336,-0.022505,-0.024340,0.037772,-0.026999,-0.052949,-0.089184,0.172476
max_floor,0.283062,0.140861,0.596279,1.000000,0.105891,0.000956,-0.003007,0.442625,0.000469,0.035859,...,-0.028679,-0.025446,-0.029840,-0.036699,-0.035531,0.067526,-0.043717,-0.082754,-0.125881,0.219462
material,0.064619,0.035447,0.039001,0.105891,1.000000,-0.008991,-0.045673,0.107458,-0.057490,-0.038074,...,0.069499,0.066189,0.067239,0.058449,0.063897,0.037952,0.065590,0.111396,0.081250,0.079882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mosque_count_5000,0.123307,0.105711,0.037772,0.067526,0.037952,0.025793,0.068225,0.024130,-0.000032,-0.213852,...,0.529042,0.480391,0.483215,0.487447,0.539950,1.000000,0.496904,0.518903,0.126366,0.194333
leisure_count_5000,0.165024,0.193871,-0.026999,-0.043717,0.065590,-0.004875,0.088383,-0.061446,-0.117423,-0.143155,...,0.985003,0.970969,0.935684,0.957637,0.970192,0.496904,1.000000,0.876830,0.403856,0.232902
sport_count_5000,0.148878,0.178399,-0.052949,-0.082754,0.111396,0.001637,0.084602,-0.066005,-0.112994,-0.219801,...,0.896317,0.858435,0.836994,0.864578,0.895124,0.518903,0.876830,1.000000,0.524981,0.248535
market_count_5000,-0.010628,0.012895,-0.089184,-0.125881,0.081250,0.005983,0.008182,-0.073730,-0.046944,-0.317035,...,0.359862,0.314291,0.286443,0.415789,0.447759,0.126366,0.403856,0.524981,1.000000,0.043842


In [16]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

###  Feature selection, finding best features for dataset

In [17]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [18]:
X= train_noNA_df_numeric.iloc[:,:274]   #all features
print(X)
Y= train_noNA_df_numeric.iloc[:,-1]   #target output (price_doc)
print(Y)

       full_sq  life_sq  floor  max_floor  material  build_year  num_room  \
8056        11     11.0    2.0        5.0       2.0      1907.0       1.0   
8154        45     27.0    6.0        9.0       1.0      1970.0       2.0   
8287        77     50.0    3.0        5.0       2.0      1957.0       3.0   
8387        56     29.0    5.0       16.0       5.0      1987.0       2.0   
8391        31     21.0    5.0        9.0       5.0      1962.0       1.0   
...        ...      ...    ...        ...       ...         ...       ...   
30462       47     30.0   23.0       25.0       4.0      2016.0       1.0   
30463       56     29.0   13.0       14.0       1.0      2001.0       2.0   
30467       86     59.0    3.0        9.0       2.0      1935.0       4.0   
30469       64     32.0    5.0       15.0       1.0      2003.0       2.0   
30470       43     28.0    1.0        9.0       1.0      1968.0       2.0   

       kitch_sq  state        area_m  ...  cafe_count_5000_price_1500  \
80

In [19]:
best_features= SelectKBest(score_func=chi2, k=95)
fit= best_features.fit(X,Y)
fit

In [20]:
df_scores= pd.DataFrame(fit.scores_)
df_columns= pd.DataFrame(X.columns)

In [21]:
features_scores= pd.concat([df_columns, df_scores], axis=1)
features_scores.columns= ['Features', 'Score']
features_score_sorted = features_scores.sort_values(by = 'Score')

In [22]:
# Top k features in terms of Score
top_k_features = features_score_sorted[-5:]
top_k_features

Unnamed: 0,Features,Score
5,build_year,1280136000.0
208,office_sqm_2000,2276114000.0
231,office_sqm_3000,4393525000.0
9,area_m,4524159000.0
254,office_sqm_5000,7329143000.0


In [23]:
feature_arr = []
for i in top_k_features['Features']:
    feature_arr.append(i)
print(feature_arr)
top_k_features_df = train_noNA_df_numeric[feature_arr]
print(top_k_features_df)
X = pd.DataFrame(top_k_features_df)
X

['build_year', 'office_sqm_2000', 'office_sqm_3000', 'area_m', 'office_sqm_5000']
       build_year  office_sqm_2000  office_sqm_3000        area_m  \
8056       1907.0          1394447          3025460  1.007156e+07   
8154       1970.0                0            15000  8.889467e+06   
8287       1957.0          1849285          3353936  4.662813e+06   
8387       1987.0           165480           465504  1.803644e+07   
8391       1962.0            56032           863668  1.880000e+07   
...           ...              ...              ...           ...   
30462      2016.0          1996056          3207952  9.629358e+06   
30463      2001.0             9881            85311  9.249237e+06   
30467      1935.0          2186592          4208928  7.307411e+06   
30469      2003.0           107800           473168  6.050065e+06   
30470      1968.0                0           155237  4.395333e+06   

       office_sqm_5000  
8056          10742760  
8154             81947  
8287          

Unnamed: 0,build_year,office_sqm_2000,office_sqm_3000,area_m,office_sqm_5000
8056,1907.0,1394447,3025460,1.007156e+07,10742760
8154,1970.0,0,15000,8.889467e+06,81947
8287,1957.0,1849285,3353936,4.662813e+06,6956461
8387,1987.0,165480,465504,1.803644e+07,5179946
8391,1962.0,56032,863668,1.880000e+07,4245601
...,...,...,...,...,...
30462,2016.0,1996056,3207952,9.629358e+06,4967675
30463,2001.0,9881,85311,9.249237e+06,712296
30467,1935.0,2186592,4208928,7.307411e+06,9949843
30469,2003.0,107800,473168,6.050065e+06,1225712


In [24]:
Y= train_noNA_df_numeric[['price_doc']]  #the target output
Y

Unnamed: 0,price_doc
8056,2750000
8154,7100000
8287,11700000
8387,10400000
8391,6200000
...,...
30462,10544070
30463,12000000
30467,25000000
30469,13500000


In [25]:
# Split into train and test
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=100)

In [26]:
# Create logistic Regression Body
logreg= LogisticRegression()
logreg.fit(X_train,np.ravel(y_train))

#pip install scipy==1.5.3
# Attribute error that requires lower version of scipy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
y_pred=logreg.predict(X_test)
print(X_test) #test dataset
print(y_pred) #predicted values
print(pd.DataFrame(y_pred).describe())

       build_year  office_sqm_2000  office_sqm_3000        area_m  \
28830      1960.0          2008347          4050318  4.280227e+06   
22483      1962.0            58053           341580  8.841267e+06   
21935      2005.0           307275           709434  7.811375e+06   
21144      1968.0           214860          1068980  4.787424e+06   
27026      1965.0           266740           779354  1.686153e+07   
...           ...              ...              ...           ...   
10826      1965.0           170712           875593  4.787424e+06   
21755      1961.0           590204          1465570  4.662813e+06   
11424      1992.0            15000            39464  8.889467e+06   
23913      1972.0           397411           925664  5.586343e+06   
21410      1968.0           155237           216937  4.395333e+06   

       office_sqm_5000  
28830          9094340  
22483          1453587  
21935          1439263  
21144          2396582  
27026           868063  
...                ..

In [28]:
print(pd.DataFrame(y_pred))
print(pd.DataFrame(y_test))

             0
0     23700000
1      2000000
2      2000000
3      2000000
4      2000000
...        ...
1204   2000000
1205   2000000
1206   2000000
1207   2000000
1208   1000000

[1209 rows x 1 columns]
       price_doc
28830    9600000
22483    5700000
21935   32500000
21144    9000000
27026    6300000
...          ...
10826    4630000
21755    9250000
11424    5800000
23913    1000000
21410    7800000

[1209 rows x 1 columns]


In [29]:
from sklearn import metrics
from sklearn.metrics import classification_report
print("Accuracy: ",metrics.accuracy_score(y_test, y_pred))

Accuracy:  0.043010752688172046
