In [4]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

#this file path can be changed depending on where you want to work.
#for me, this folder is the root folder for all my ml_learning projects
ROOT_DIR = os.path.join('C:\\', 'users', 'sebas', 'onedrive', 'python', 'machine_learning', 'ml_learning')

#this data path is specific for the iowa housing project, and its where the datasets are stored on my local machine
DATA_PATH = os.path.join(ROOT_DIR, 'datasets', 'iowa_housing')

def load_data(filename, data_path=DATA_PATH):
    '''This function will load the data as a pandas dataframe. it takes the filename 
    as an argument which should be the name that the csv file is saved as in your directory.'''
    
    csv_path = os.path.join(data_path, filename)
    return pd.read_csv(csv_path).fillna(0) #we import the data with the null values replaced with 0s right off the bat. this avoids us having to do it later

training_data = load_data('train.csv')

#set id as our index column
training_data.set_index('Id', inplace=True)
#convert object categories to strings
training_data = training_data.apply(lambda x: x.astype('|S') if x.dtype == 'object' else x, axis=0)
training_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,b'RL',65.0,8450,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,2,2008,b'WD',b'Normal',208500
2,20,b'RL',80.0,9600,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'FR2',...,0,b'0',b'0',b'0',0,5,2007,b'WD',b'Normal',181500
3,60,b'RL',68.0,11250,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,9,2008,b'WD',b'Normal',223500
4,70,b'RL',60.0,9550,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'Corner',...,0,b'0',b'0',b'0',0,2,2006,b'WD',b'Abnorml',140000
5,60,b'RL',84.0,14260,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'FR2',...,0,b'0',b'0',b'0',0,12,2008,b'WD',b'Normal',250000


In [7]:
#lets split this dataset into a seperate testing and training set so that we can evaluate our model, even though there is already a testing set provided.
from sklearn.model_selection import train_test_split
train_set, test_set, y_train, y_test = train_test_split(training_data, training_data.SalePrice, test_size=0.2, random_state=42)

train_set_cats = train_set.select_dtypes(include=['object'].copy()) #this line will show you all the columns which have a categorical value, if you needed to know that

X_train = train_set.drop('SalePrice', axis=1)
y_train = train_set['SalePrice'].copy()

In [25]:
relevant_columns = ['OverallQual','GrLivArea','GarageCars','GarageArea','YearBuilt','BsmtFinSF1','FullBath','GarageYrBlt']
Xc_train = X_train[relevant_columns]
X_train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [26]:
#engineer some new columns
Xc_train['TotalHouseSFA'] = X_train['1stFlrSF'] + X_train['2ndFlrSF'] + X_train['TotalBsmtSF']
Xc_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,OverallQual,GrLivArea,GarageCars,GarageArea,YearBuilt,BsmtFinSF1,FullBath,GarageYrBlt,TotalHouseSFA
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
255,5,1314,1,294,1957,922,1,1957.0,2628
1067,6,1571,2,380,1993,0,2,1993.0,2370
639,5,796,0,0,1910,0,1,0.0,1592
800,5,1768,1,240,1937,569,1,1939.0,2499
381,5,1691,1,308,1924,218,2,1924.0,2717
...,...,...,...,...,...,...,...,...,...
1096,6,1314,2,440,2006,24,2,2006.0,2628
1131,4,1981,2,576,1928,622,2,1981.0,3103
1295,5,864,2,572,1955,167,1,1957.0,1728
861,7,1426,1,216,1918,0,1,1925.0,2338


In [30]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

'''Here you must find out how to make a custom pipeline that selects the desired columns, and combines the other columns to give you the columns you want
that can then be passed to the linear regression model.'''

pipeline = Pipeline([
    ('scaler', StandardScaler())
])

X_train_prepared = pipeline.fit_transform(Xc_train, y_train)

In [31]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [33]:
some_data = train_set.iloc[:5]
some_labels = y_train.iloc[:5]

some_data_prepared = pipeline.transform(some_data)
print('Predictions: ', lin_reg.predict(some_data_prepared))
print('Labels: ', list(some_labels))

ValueError: could not convert string to float: b'RL'

In [None]:
#this is a test