# Feature Engineering Overview

## Dataset

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('Melbourne_housing_FULL.csv')

# Display all rows of DataFrame
df


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,1480000.0,PI,Jas,24/02/2018,6.3,3013.0,...,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
34853,Yarraville,29A Murray St,2,h,888000.0,SP,Sweeney,24/02/2018,6.3,3013.0,...,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
34854,Yarraville,147A Severn St,2,t,705000.0,S,Jas,24/02/2018,6.3,3013.0,...,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
34855,Yarraville,12/37 Stephen St,3,h,1140000.0,SP,hockingstuart,24/02/2018,6.3,3013.0,...,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


In [4]:
import pandas as pd
import numpy as np

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('Melbourne_housing_FULL.csv')

# Display all rows of DataFrame
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values
df = df.dropna()

# Convert categorical features to numerical
df = pd.get_dummies(df, columns=['Suburb', 'Type', 'Method', 'CouncilArea', 'Regionname'])

# Drop irrelevant features
df = df.drop(['Address', 'SellerG', 'Date'], axis=1)

# Separate the target variable (Price) from the features
X = df.drop('Price', axis=1)
y = df['Price']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

       Suburb             Address  Rooms Type      Price Method SellerG  \
0  Abbotsford       68 Studley St      2    h        NaN     SS  Jellis   
1  Abbotsford        85 Turner St      2    h  1480000.0      S  Biggin   
2  Abbotsford     25 Bloomburg St      2    h  1035000.0      S  Biggin   
3  Abbotsford  18/659 Victoria St      3    u        NaN     VB  Rounds   
4  Abbotsford        5 Charles St      3    h  1465000.0     SP  Biggin   

        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  BuildingArea  \
0  3/09/2016       2.5    3067.0  ...       1.0  1.0     126.0           NaN   
1  3/12/2016       2.5    3067.0  ...       1.0  1.0     202.0           NaN   
2  4/02/2016       2.5    3067.0  ...       1.0  0.0     156.0          79.0   
3  4/02/2016       2.5    3067.0  ...       2.0  1.0       0.0           NaN   
4  4/03/2017       2.5    3067.0  ...       2.0  0.0     134.0         150.0   

   YearBuilt         CouncilArea Lattitude  Longtitude             R

In [7]:
import pandas as pd

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('Melbourne_housing_FULL.csv')

# Check the column names in the dataframe
print(df.columns)

# Drop unnecessary columns
df = df.drop(['Address', 'SellerG', 'Date', 'Propertycount'], axis=1)

# Fill missing values
df['Price'].fillna(df['Price'].median(), inplace=True)
df['Distance'].fillna(df['Distance'].mean(), inplace=True)
df['Bedroom2'].fillna(df['Rooms'], inplace=True)
df['Bathroom'].fillna(df['Bathroom'].median(), inplace=True)
df['Car'].fillna(df['Car'].median(), inplace=True)
df['Landsize'].fillna(df['Landsize'].median(), inplace=True)
df['BuildingArea'].fillna(df['BuildingArea'].median(), inplace=True)
df['YearBuilt'].fillna(df['YearBuilt'].mode()[0], inplace=True)

# Convert categorical columns to numerical using one-hot encoding
cat_cols = ['Type', 'Method', 'CouncilArea', 'Regionname']
for col in cat_cols:
    if col in df.columns:
        df = pd.get_dummies(df, columns=[col])

print(df.head())


Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')
       Suburb  Rooms      Price  Distance  Postcode  Bedroom2  Bathroom  Car  \
0  Abbotsford      2   870000.0       2.5    3067.0       2.0       1.0  1.0   
1  Abbotsford      2  1480000.0       2.5    3067.0       2.0       1.0  1.0   
2  Abbotsford      2  1035000.0       2.5    3067.0       2.0       1.0  0.0   
3  Abbotsford      3   870000.0       2.5    3067.0       3.0       2.0  1.0   
4  Abbotsford      3  1465000.0       2.5    3067.0       3.0       2.0  0.0   

   Landsize  BuildingArea  ...  CouncilArea_Yarra City Council  \
0     126.0         136.0  ...                               1   
1     202.0         136.0  ...                               1   
2     156.0          79.0

In [26]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X = melb_data.drop(['Price', 'Suburb', 'Address'], axis=1)
y = melb_data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Identify categorical columns for one-hot encoding
categorical_columns = ['Type', 'Method', 'Regionname']

# Check column names in training set
print(X_train.columns)

# One-hot encode categorical columns
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = ohe.fit_transform(X_train[categorical_columns])
X_test_encoded = ohe.transform(X_test[categorical_columns])

# Concatenate encoded categorical features with numerical features
import scipy.sparse as sp
X_train_final = sp.hstack([X_train_encoded, X_train.drop(categorical_columns, axis=1)])
X_test_final = sp.hstack([X_test_encoded, X_test.drop(categorical_columns, axis=1)])

# Train the linear regression model
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train_final, y_train)

# Evaluate the model on the testing set
from sklearn.metrics import mean_squared_error, r2_score
y_pred = lr_model.predict(X_test_final)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE: ", mse)
print("R2 score: ", r2)

KeyError: "['Suburb', 'Address'] not found in axis"