## Importing Libraries  

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Let pandas print up to 100 rows before truncating displayed frames/series.
pd.set_option('display.max_rows', 100)


# modeling
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.preprocessing import PolynomialFeatures

In [3]:
# Read the train split, the test split, and the `test_train_cleaned.csv` frame
# (the one the rest of this notebook refines), in that order.
train, test, data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/train.csv',
        '../datasets/test.csv',
        '../datasets/test_train_cleaned.csv',
    )
)

# Refining the Data

Now we will go back to working with the DataFrame that contains all the data.

#### 1. Garage columns

- Garage Area and Garage Cars

Garage Area and Garage Cars are two columns that carry almost the same information: the bigger the garage, the more cars it fits. To avoid a collinear relationship, I will use feature engineering to combine them into a single column and then drop the two original columns.

In [4]:
# Interaction feature: element-wise product of garage area and garage car count.
# NOTE(review): the accompanying text says the two source columns are dropped
# afterwards, but no such drop is visible in this part of the notebook - confirm.
data['Garage Area * Garage Cars'] = data['Garage Area'].mul(data['Garage Cars'])

- Garage Condition, Garage Quality and Garage Finish:

In [5]:
# Frequency of each distinct value in the garage-condition column.
data.loc[:, 'Garage Cond'].value_counts()

TA      2665
None     159
Fa        74
Gd        15
Po        14
Ex         3
Name: Garage Cond, dtype: int64

In [6]:
# Frequency of each distinct value in the garage-quality column.
data.loc[:, 'Garage Qual'].value_counts()

TA      2615
None     159
Fa       124
Gd        24
Po         5
Ex         3
Name: Garage Qual, dtype: int64

In [7]:
# Frequency of each distinct value in the garage-finish column.
data.loc[:, 'Garage Finish'].value_counts()

Unf     1231
RFn      812
Fin      728
None     159
Name: Garage Finish, dtype: int64

Garage Condition and Garage Quality columns have very similar values with only a few differences in their value count. I will drop the garage condition column.


In [8]:
# Remove the garage-condition column from `data` in place.
# Running this cell a second time raises KeyError because the column is already gone.
data.drop(columns=['Garage Cond'], inplace=True)

#### 2. Total House size

There are 3 columns that represent the house size: '1st Flr SF', '2nd Flr SF' and 'Total Bsmt SF'. I will combine them into a single column giving the total house size in square feet.

In [9]:
# Total size = first floor + second floor + basement square footage.
# A missing value in any of the three columns leaves the total missing as well.
data['HouseSizeSF'] = (
    data['1st Flr SF']
    .add(data['2nd Flr SF'])
    .add(data['Total Bsmt SF'])
)

In [10]:
# Preview the first five rows, including the two engineered columns at the end.
data.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Garage Area * Garage Cars,HouseSizeSF
0,109,533352170,60,RL,69.0552,13517,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,950.0,2204.0
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,,,,0,4,2009,WD,1118.0,3035.0
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,,,,0,1,2010,WD,246.0,2114.0
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,800.0,1828.0
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,968.0,2121.0


#### 3. Neighborhood

In [11]:
# Disabled exploration (kept for reference): mean SalePrice per neighborhood in
# the training split, ordered by that mean.
#train[['Neighborhood','SalePrice']].groupby('Neighborhood').mean().sort_values(by='SalePrice')

In [12]:
# Disabled experiment (kept for reference): replace every neighborhood name with
# one of five coarse labels ('Ex', 'Gr', 'Gd', 'Av', 'Bd'). The notebook text
# states that this transformation is no longer applied.
# data['Neighborhood'] = data['Neighborhood'].map({'StoneBr': 'Ex',
#                           'NridgHt': 'Ex', 
#                           'NoRidge': 'Ex', 
#                           'GrnHill': 'Gr',
#                           'Veenker': 'Gr',
#                           'Timber': 'Gr',
#                           'Somerst': 'Gd',
#                           'ClearCr': 'Gd',
#                           'Crawfor': 'Gd',
#                           'CollgCr': 'Gd',
#                           'Blmngtn': 'Gd',
#                           'NWAmes': 'Av',
#                           'Gilbert': 'Av',
#                           'Greens': 'Av',
#                           'SawyerW': 'Av',
#                           'Mitchel': 'Av',
#                           'NAmes': 'Bd',
#                           'Blueste': 'Bd',
#                           'NPkVill': 'Bd',
#                           'Sawyer': 'Bd',
#                           'Landmrk': 'Bd',
#                           'SWISU': 'Bd',
#                           'Edwards': 'Bd',
#                           'BrkSide': 'Bd',
#                           'OldTown': 'Bd',
#                           'BrDale': 'Bd',
#                           'IDOTRR': 'Bd',
#                           'MeadowV': 'Bd',
#                          })





In [13]:
# Disabled experiment (kept for reference): replace each MS SubClass code with a
# coarse label. The notebook text states that this transformation is no longer applied.
# NOTE(review): the dictionary keys are strings; `Series.map` returns NaN for any
# value that is not a key, so if `MS SubClass` is stored as integers the keys
# would need to be integers too - confirm the dtype before re-enabling.
# NOTE(review): the label 'Hr' (for '30') appears nowhere else in this notebook -
# possibly a typo; verify.
# data['MS SubClass'] = data['MS SubClass'].map({ '20': 'Gr', '60': 'Ex', '50': 'Av', '120': 'Ex',
#                                                '30': 'Hr', '160': 'Av', '70': 'Gd', '80': 'Gr',
#                                                '90': 'Av', '190': 'Bd', '85': 'Gd', '75': 'Ex',
#                                                '45': 'Bd', '180': 'Bd', '40': 'Bd', '150': 'Gd' }) 


Because the value counts for `Neighborhood` and `MS SubClass` are not well distributed, I will not apply the transformations above.

#### 4. Dropping Id and PID


The Id and PID columns shouldn't have any effect on the house price, so I will drop them.

In [14]:
# Remove the two identifier columns from `data` in place.
# Running this cell a second time raises KeyError because the columns are already gone.
data.drop(columns=['Id', 'PID'], inplace=True)

In [15]:
# Zero-row slice: displays only the column headers, to confirm Id and PID are gone.
data.iloc[:0]

Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Garage Area * Garage Cars,HouseSizeSF


### Polynomial Features

In [16]:
# Degree 3 gave better score results than degree 2 or 4.
# The constant (bias) column is left out of the expansion.
poly = PolynomialFeatures(
    degree=3,
    include_bias=False,
)

In [17]:
# Features that have a high positive correlation with the price; these are the
# inputs to the polynomial expansion.
good_features = [
    'Overall Qual',
    'Garage Area * Garage Cars',
    'HouseSizeSF',
    'Year Built',
]

In [18]:
# Fit the polynomial transformer on the selected columns and expand them in one step.
X_poly = poly.fit_transform(data.loc[:, good_features])


In [154]:
# Wrap the polynomial feature matrix in a DataFrame with readable column names.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out` and only fall back on older installations.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(good_features)
else:
    poly_feature_names = poly.get_feature_names(good_features)

# Reuse `data`'s index so the index-based merge that follows pairs each expanded
# row with the row it was computed from, even if `data` is not 0..n-1 indexed.
poly_df = pd.DataFrame(X_poly, columns=poly_feature_names, index=data.index)

### Encoding Qualitative Features 

In [155]:
# Attach the polynomial features to the main frame, matching rows on the index.
# NOTE(review): the first-degree columns of `poly_df` are named after the entries
# of `good_features`, which `data` already contains, so pandas keeps both copies
# with `_x` / `_y` suffixes (duplicated features) - confirm this is intended.
polyed_data = pd.merge(data, poly_df, left_index=True, right_index=True)

In [156]:
# One-hot encode the non-numeric columns, dropping the first level of each
# encoded column.
polyed_data = pd.get_dummies(data=polyed_data, drop_first=True)

In [157]:
# (rows, columns) of the frame after merging and dummy encoding.
polyed_data.shape

(2930, 301)

In [158]:
# Repeat of the shape check in the cell above; nothing modifies `polyed_data`
# between the two cells, so the result is the same.
polyed_data.shape

(2930, 301)

## Saving Polyed Data

In [159]:
# Write the engineered frame to disk without the index column.
# The file name spelling ("ployed") is kept as is; presumably other notebooks
# load the file by this exact name - verify before renaming.
polyed_data.to_csv(path_or_buf='../datasets/test_train_ployed.csv', index=False)