# Housing Prices: Advanced Regression


In [12]:
import random
import os
import pandas as pd
import tensorflow as tf

In [19]:
def encode_features(df_train, df_test):
    '''
    Replaces every string (object dtype) column with integer codes.

    The columns to encode are the object-dtype columns of ``df_train``;
    the same columns are then encoded in ``df_test``. For each column one
    shared category -> code table is built from the train values followed
    by the test values, so a given category gets the same code in both
    frames. Codes start at 1 and follow order of first appearance.

    Both frames are modified in place and also returned as the tuple
    ``(df_train, df_test)``.

    NOTE(review): the codes are arbitrary ordinals, so a regression model
    will read an ordering into them that the categories do not have.
    '''
    object_columns = df_train.select_dtypes(include=['object']).columns.tolist()
    # Stack train on top of test so the code table covers categories
    # that occur in only one of the two frames.
    stacked = pd.concat([df_train[object_columns], df_test[object_columns]])

    for name in object_columns:
        codes = {
            value: position
            for position, value in enumerate(stacked[name].unique(), start=1)
        }
        df_train[name] = df_train[name].map(codes)
        df_test[name] = df_test[name].map(codes)

    return df_train, df_test


def cleanup(df):
    '''
    Cleans data
        1. Drops unwanted features
        2. Fills missing values in each column with that column's own mode

    Returns a new DataFrame; the frame passed in is not modified.
    Raises KeyError if any of the columns to drop is absent.

    A column with no non-missing values has no mode and is left as is.
    '''
    to_drop = ['MiscFeature', 'MiscVal', 'GarageArea', 'GarageYrBlt']
    df = df.drop(to_drop, axis=1)

    # Collect one fill value per column. Calling df.fillna(x) with a
    # scalar inside the loop would fill the gaps of *every* column with
    # the mode of whichever column was visited first.
    fill_values = {}
    for column in df.columns:
        # value_counts() ignores NaN by default and sorts by frequency,
        # so the first index entry is the most frequent value.
        counts = df[column].value_counts()
        if counts.empty:
            continue
        fill_values[column] = counts.index[0]

    # A dict passed to fillna is applied column by column; columns that
    # are not in the dict are left untouched.
    return df.fillna(value=fill_values)

In [15]:
data_dir = 'data'

# Read the raw train and test CSV files from the data directory.
train_dataset, test_dataset = (
    pd.read_csv(os.path.join(data_dir, file_name))
    for file_name in ('train.csv', 'test.csv')
)

# Clean each frame separately, then encode both together so that the
# string columns share one category -> code table.
train_dataset, test_dataset = encode_features(
    cleanup(train_dataset),
    cleanup(test_dataset),
)

0       1
1       2
2       1
3       1
4       1
5       3
6       2
7       1
8       3
9       4
10      2
11      1
12      2
13      2
14      2
15      4
16      2
17      2
18      2
19      2
20      1
21      4
22      2
23      2
24      2
25      2
26      2
27      2
28      2
29      2
       ..
1430    1
1431    2
1432    2
1433    1
1434    2
1435    2
1436    2
1437    2
1438    2
1439    6
1440    8
1441    2
1442    1
1443    4
1444    2
1445    5
1446    2
1447    1
1448    1
1449    5
1450    1
1451    2
1452    6
1453    2
1454    2
1455    1
1456    2
1457    1
1458    2
1459    2
Name: HouseStyle, Length: 1460, dtype: int64
