# Starting Your ML Project
**Run the equivalent commands (to read the data and print the summary) in the code cell below.**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Read the Iowa training data and print its summary statistics.
main_file_path = '../input/train.csv'
data = pd.read_csv(main_file_path)
data_summary = data.describe()
print(data_summary)

# Selecting and Filtering in Pandas
* Print a list of the columns
* From the list of columns, find the name of the column with the sales prices of the homes. Use the dot notation to extract this to a variable (as you saw above to create melbourne_price_data.)
* Use the head command to print out the top few lines of the variable you just created.
* Pick any two variables and store them to a new DataFrame (as you saw above to create two_columns_of_data.)
* Use the describe command with the DataFrame you just created to see summaries of those variables.

In [None]:
# Show every column label in the training data.
column_labels = data.columns
print(column_labels)

In [None]:
# Extract the sale price column with dot notation, then preview its first rows.
data_price = data.SalePrice
price_preview = data_price.head()
print(price_preview)

In [None]:
# Keep just two columns of the training data in a new DataFrame.
columns_of_interest = ['LotArea', 'YearBuilt']
two_columns_of_data = data.loc[:, columns_of_interest]

In [None]:
# Summarise the two selected columns; the bare name on the last line renders the table.
two_column_summary = two_columns_of_data.describe()
two_column_summary

# Your First Scikit-Learn Model
* Select the target variable you want to predict. You can go back to the list of columns from your earlier commands to recall what it's called (hint: you've already worked with this variable). Save this to a new variable called y.
* Create a list of the names of the predictors we will use in the initial model. Use just the following columns in the list (you can copy and paste the whole list to save some typing, though you'll still need to add quotes):
    * LotArea
    * YearBuilt
    * 1stFlrSF
    * 2ndFlrSF
    * FullBath
    * BedroomAbvGr
    * TotRmsAbvGrd
* Using the list of variable names you just created, select a new DataFrame of the predictors data. Save this with the variable name X.

* Create a DecisionTreeRegressor model and save it to a variable (with a name like my_model or iowa_model). Ensure you've done the relevant import so you can run this command.
* Fit the model you have created using the data in X and the target data you saved above.
* Make a few predictions with the model's predict command and print out the predictions.

In [None]:
# Prediction target: the SalePrice column of the training data.
Y = data['SalePrice']

In [None]:
# Predictor columns used by the first model, one per line for easy editing.
data_predictors = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = data.loc[:, data_predictors]

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Build a decision tree with default settings and train it on every row.
# fit() returns the estimator itself, so the chained call binds the fitted model.
iowa_model = DecisionTreeRegressor().fit(X, Y)

# Bare name keeps the fitted model's repr as the cell output.
iowa_model

In [None]:
# Compare the model's predictions for the first five houses with their actual prices.
first_houses = X.head()
first_predictions = iowa_model.predict(first_houses)
first_prices = Y.head()

print("predictions for the following 5 houses:")
print(first_houses)
print("the predictions are:")
print(first_predictions)
print("the real prices is:")
print(first_prices)

# Model Validation
* Use the train_test_split command to split up your data.
* Fit the model with the training data
* Make predictions with the validation predictors
* Calculate the mean absolute error between your predictions and the actual target values for the validation data.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
# Hold out a validation set; the fixed seed makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, Y, random_state=0)

# Train a fresh tree on the training portion only.
iowa_model = DecisionTreeRegressor()
iowa_model.fit(train_X, train_y)

# Score the held-out rows with mean absolute error.
val_predictions = iowa_model.predict(val_X)
validation_mae = mean_absolute_error(val_y, val_predictions)
print(validation_mae)

# Underfitting, Overfitting and Model Optimization
* Use a for loop that tries different values of max_leaf_nodes and calls the get_mae function on each to find the ideal number of leaves for your Iowa data.

In [None]:
def get_mae(max_leaf_nodes,predictors_train,predictors_val,targ_train,targ_val):
    """Return the validation mean absolute error of a size-limited decision tree.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (seeded with
    ``random_state=0``) is fitted on ``predictors_train`` / ``targ_train`` and
    scored against ``targ_val`` using its predictions for ``predictors_val``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

# Candidate tree sizes: 10..100 in steps of 10, then a few much larger values.
leaf_node_candidates = list(range(10, 101, 10)) + [200, 300, 400, 500, 1000]

for max_leaf_nodes in leaf_node_candidates:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    report_values = (max_leaf_nodes, my_mae)
    print("max leaf nodes: %d \t\t mean absolute error: %d" % report_values)

# Random Forests
* Run the RandomForestRegressor on your data. You should see a big improvement over your best Decision Tree models.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split and score it on the validation split.
forest_model = RandomForestRegressor()
forest_model.fit(train_X, train_y)

iowa_preds = forest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, iowa_preds)
print(forest_mae)

## Submitting "Random Forests Model" From A Kernel


In [None]:
import numpy as np

# Load the competition training and test files.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# The same four predictor columns are taken from both files.
predictors_cols = ['LotArea', 'OverallQual', 'YearBuilt', 'TotRmsAbvGrd']
train_y = train['SalePrice']
train_x = train[predictors_cols]
test_x = test[predictors_cols]

# Train on every training row, then predict a price for each test row.
my_model = RandomForestRegressor()
my_model.fit(train_x, train_y)
predicted_prices = my_model.predict(test_x)

print(predicted_prices)

In [None]:
# Build the submission file: one row per test house with its predicted price.
# The identifier column is named 'Id' to match the test file's own column
# (read here as test.Id) and the competition's expected "Id,SalePrice" header.
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})

# index=False keeps the DataFrame's row index out of the CSV.
my_submission.to_csv('submission1.csv', index=False)

# Handling Missing Values
* Find some columns with missing values in your dataset.
* Use the Imputer class so you can impute missing values
* Add columns with missing values to your predictors.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Reload the training data so this section starts from the raw file.
data = pd.read_csv('../input/train.csv')

# Separate the target from the candidate predictors.
iowa_target = data['SalePrice']
iowa_predictors = data.drop('SalePrice', axis=1)

# Keep only the non-object (numeric) predictor columns.
iowa_numeric_predictors = iowa_predictors.select_dtypes(exclude=['object'])

# Seeded split into training and held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    iowa_numeric_predictors,
    iowa_target,
    random_state=0,
)

In [None]:
from  sklearn.preprocessing import Imputer

# Columns of the training split that contain at least one missing value.
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Learn the fill values from the training split only, then apply those same
# values to the held-out split. Re-fitting on X_test would fill it with its own
# statistics, so the two splits would be imputed inconsistently and information
# from the evaluation data would leak into preprocessing.
my_imputer = Imputer()
imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Train a random forest on the imputed training rows and score the imputed held-out rows.
my_model = RandomForestRegressor()
my_model.fit(imputed_X_train, y_train)

preds = my_model.predict(imputed_X_test)
imputed_mae = mean_absolute_error(y_test, preds)
print(imputed_mae)

## Submitting  Random Forests Model after Handling Missing Values

In [None]:
import numpy as np
from  sklearn.preprocessing import Imputer

# Load the competition files and keep only their non-object (numeric) columns.
iowa_train = pd.read_csv('../input/train.csv')
iowa_test = pd.read_csv('../input/test.csv')

train = iowa_train.select_dtypes(exclude=['object'])
test = iowa_test.select_dtypes(exclude=['object'])

y_train = train.SalePrice
X_train = train.drop(['SalePrice'], axis=1)

# Fit the imputer on the training predictors only and reuse its learned fill
# values for the test data, so both sets are filled with the same statistics
# the model was trained on (previously the imputer was re-fitted on the test set).
my_imputer = Imputer()
imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(test)

my_model = RandomForestRegressor()
my_model.fit(imputed_X_train, y_train)

predicted_prices = my_model.predict(imputed_X_test)
print(predicted_prices)

In [None]:
# Build the submission file: one row per test house with its predicted price.
# The identifier column is named 'Id' to match the test file's own column
# (read here as test.Id) and the competition's expected "Id,SalePrice" header.
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})

# index=False keeps the DataFrame's row index out of the CSV.
my_submission.to_csv('submission2.csv', index=False)

# Using Categorical Data with One Hot Encoding
* Use one-hot encoding to allow categoricals in your course project. Then add some categorical columns to your X data. If you choose the right variables, your model will improve quite a bit.

In [None]:
# Target column.
target = data.SalePrice

# Names of all columns that have at least one missing value.
cols_with_missing = data.columns[data.isnull().any()].tolist()

# Predictors: everything except the identifier, the target and incomplete columns.
columns_to_drop = ['Id', 'SalePrice'] + cols_with_missing
predictors = data.drop(columns_to_drop, axis=1)

X_train, X_test, y_train, y_test = train_test_split(predictors, target, random_state=0)

In [None]:
# One-hot encode each split on its own, then keep only the columns both
# encoded frames share (inner join on the column axis).
encoded_X_train = pd.get_dummies(X_train)
encoded_X_test = pd.get_dummies(X_test)
final_X_train, final_X_test = encoded_X_train.align(
    encoded_X_test, join='inner', axis=1
)

# Fit on the aligned training columns and score the aligned held-out columns.
my_model = RandomForestRegressor()
my_model.fit(final_X_train, y_train)

preds = my_model.predict(final_X_test)
one_hot_mae = mean_absolute_error(y_test, preds)
print(one_hot_mae)

## Submitting Random Forests Model after Using One-Hot Encoding

In [None]:
import numpy as np
from  sklearn.preprocessing import Imputer

iowa_train = pd.read_csv('../input/train.csv')
iowa_test = pd.read_csv('../input/test.csv')

y_train = iowa_train.SalePrice

# Training predictors: drop the target, every column with missing values, and
# the row identifier 'Id'. The validation cell for this model also excludes
# 'Id', so the submitted model now uses the same kind of features.
cols_with_missing_train = [col for col in iowa_train.columns
                           if iowa_train[col].isnull().any()]
X_train = iowa_train.drop(['Id', 'SalePrice'] + cols_with_missing_train, axis=1)

# Test predictors: drop the test file's own incomplete columns. 'Id' stays in
# X_test so the submission cell can still read X_test.Id; the inner alignment
# below removes it from the model's inputs because X_train no longer has it.
cols_with_missing_test = [col for col in iowa_test.columns
                          if iowa_test[col].isnull().any()]
X_test = iowa_test.drop(cols_with_missing_test, axis=1)

# One-hot encode each frame, then keep only the columns present in both.
encoded_X_train = pd.get_dummies(X_train)
encoded_X_test = pd.get_dummies(X_test)
final_X_train, final_X_test = encoded_X_train.align(encoded_X_test, join='inner', axis=1)

my_model = RandomForestRegressor()
my_model.fit(final_X_train, y_train)

predicted_prices = my_model.predict(final_X_test)
print(predicted_prices)

In [None]:
# Build the submission file: one row per test house with its predicted price.
# The identifier column is named 'Id' to match the test file's own column
# (read here as X_test.Id) and the competition's expected "Id,SalePrice" header.
my_submission = pd.DataFrame({'Id': X_test.Id, 'SalePrice': predicted_prices})

# index=False keeps the DataFrame's row index out of the CSV.
my_submission.to_csv('submission3.csv', index=False)

# Use XGBoost
* Convert your model to use XGBoost.
* Use early stopping to find a good value for n_estimators. Then re-estimate the model with all of your training data, and that value of n_estimators.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

# Reload the training data and discard any row without a sale price.
# Rebinding the result (rather than inplace=True) keeps this cell re-runnable.
data = pd.read_csv('../input/train.csv')
data = data.dropna(axis=0, subset=['SalePrice'])

y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

# `.values` yields the same NumPy arrays as the deprecated `.as_matrix()`,
# which newer pandas releases no longer provide.
train_X, test_X, train_y, test_y = train_test_split(X.values, y.values, test_size=0.25)

# Fit the imputer on the training rows, then apply the same fill values to the held-out rows.
my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

In [None]:
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error

# Baseline XGBoost regressor with default settings.
my_model = XGBRegressor()
my_model.fit(train_X, train_y, verbose=False)

# Score the held-out rows.
predictions = my_model.predict(test_X)
baseline_mae = mean_absolute_error(predictions, test_y)
print("Mean absolute error: " + str(baseline_mae))

In [None]:
from sklearn.metrics import mean_absolute_error

# Allow up to 1000 boosting rounds, with early stopping (5 rounds) evaluated
# on the held-out rows passed as eval_set.
evaluation_sets = [(test_X, test_y)]
my_model = XGBRegressor(n_estimators=1000)
my_model.fit(
    train_X,
    train_y,
    early_stopping_rounds=5,
    eval_set=evaluation_sets,
    verbose=False,
)

predictions = my_model.predict(test_X)
early_stop_mae = mean_absolute_error(predictions, test_y)
print("Mean absolute error: " + str(early_stop_mae))

## Submitting the XGBoost Model

In [None]:
import numpy as np
from xgboost import XGBRegressor
from  sklearn.preprocessing import Imputer
from sklearn.pipeline import make_pipeline

# Load the competition files.
iowa_train = pd.read_csv('../input/train.csv')
iowa_test = pd.read_csv('../input/test.csv')

# Restrict both frames to their non-object (numeric) columns.
train = iowa_train.select_dtypes(exclude=['object'])
test = iowa_test.select_dtypes(exclude=['object'])

# Split the training frame into target and predictors.
y_train = train['SalePrice']
X_train = train.drop('SalePrice', axis=1)

# Imputation and the XGBoost regressor run as a single pipeline.
my_pipeline = make_pipeline(Imputer(), XGBRegressor())
my_pipeline.fit(X_train, y_train)

predictions = my_pipeline.predict(test)
print(predictions)

In [None]:
# Build the submission file: one row per test house with its predicted price.
# The identifier column is named 'Id' to match the test file's own column
# (read here as test.Id) and the competition's expected "Id,SalePrice" header.
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions})

# index=False keeps the DataFrame's row index out of the CSV.
my_submission.to_csv('submission4.csv', index=False)

# Partial Dependence Plots
* Pick three predictors in your project. Formulate a hypothesis about what the partial dependence plot will look like. Create the plots, and check the results against your hypothesis.


In [None]:
# Three predictors whose partial dependence will be plotted.
data_predictors = ['LotArea', 'YearBuilt', 'BedroomAbvGr']
y = data.SalePrice

# Impute the selected columns; X ends up as the imputer's transformed output.
my_imputer = Imputer()
X = my_imputer.fit_transform(data[data_predictors])

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.ensemble.partial_dependence import plot_partial_dependence

# Fit the gradient boosting model whose partial dependence is inspected.
my_model = GradientBoostingRegressor()
my_model.fit(X, y)

# Draw one partial dependence plot per predictor, addressed by column position.
plotted_feature_names = ['LotArea', 'YearBuilt', 'BedroomAbvGr']
for feature_position in range(len(plotted_feature_names)):
    my_plot = plot_partial_dependence(
        my_model,
        features=[feature_position],
        X=X,
        feature_names=plotted_feature_names,
        grid_resolution=15,
    )

# Pipelines
* Take your modeling code and convert it to use pipelines. For now, you'll need to do one-hot encoding of categorical variables outside of the pipeline (i.e. before putting the data in the pipeline).

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.pipeline import make_pipeline

y = data.SalePrice
X = data.drop(['SalePrice'], axis=1)

train_X, test_X, train_y, test_y = train_test_split(X, y)

# One-hot encode each split outside the pipeline, then keep only the dummy
# columns that exist in both splits.
en_train_X = pd.get_dummies(train_X)
en_test_X = pd.get_dummies(test_X)
final_train_X, final_test_X = en_train_X.align(en_test_X, join='inner', axis=1)

# XGBRegressor's parameter is spelled `n_estimators`; the original
# `n_estimates` was a typo. Early stopping is not requested here because it
# needs a validation set supplied through `eval_set`, which this plain
# pipeline fit does not provide.
my_pipeline = make_pipeline(Imputer(), XGBRegressor(n_estimators=1000))
my_pipeline.fit(final_train_X, train_y)

predictions = my_pipeline.predict(final_test_X)
# Arguments in (y_true, y_pred) order; the MAE value is the same either way.
print(mean_absolute_error(test_y, predictions))

# Cross-Validation
* Convert the code for your on-going project over from train-test split to cross-validation. Make sure to remove all code that divides your dataset into training and testing datasets. Leaving code you don't need any more would be sloppy.
* Add or remove a predictor from your models. See the cross-validation score using both sets of predictors, and see how you can compare the scores.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

data_predictors = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath',
                   'BedroomAbvGr', 'TotRmsAbvGrd']
X = data[data_predictors]
y = data.SalePrice

# No separate fit on the full data: cross_val_score fits its own copy of the
# pipeline for each fold, so a prior fit would be wasted work.
my_pipeline = make_pipeline(Imputer(), RandomForestRegressor())

scores = cross_val_score(my_pipeline, X, y, scoring='neg_mean_absolute_error')
print(scores)

# The scorer reports negated MAE, so flip the sign. '%.2f' prints two decimal
# places; the original '%2f' only set a minimum field width of 2.
print('Mean Absolute Error %.2f' % (-1 * scores.mean()))

# Final Submit

In [None]:
import numpy as np
from xgboost import XGBRegressor
from  sklearn.preprocessing import Imputer
from sklearn.pipeline import make_pipeline

iowa_train = pd.read_csv('../input/train.csv')
iowa_test = pd.read_csv('../input/test.csv')

# Keep only the non-object (numeric) columns of both files.
train = iowa_train.select_dtypes(exclude=['object'])
test = iowa_test.select_dtypes(exclude=['object'])

y_train = train.SalePrice
X_train = train.drop(['SalePrice'], axis=1)

# NOTE(review): object columns were excluded above, so get_dummies likely has
# nothing to encode here and the alignment just keeps the shared numeric
# columns -- confirm whether categorical features were meant to be included.
en_X_train = pd.get_dummies(X_train)
en_test = pd.get_dummies(test)
final_X_train, final_test = en_X_train.align(en_test, join='inner', axis=1)

# XGBRegressor's parameter is spelled `n_estimators`; the original
# `n_estimates` was a typo. Early stopping is not requested here because it
# needs a validation set supplied through `eval_set`, which this plain
# pipeline fit does not provide.
my_pipeline = make_pipeline(Imputer(), XGBRegressor(n_estimators=1000))
my_pipeline.fit(final_X_train, y_train)

predictions = my_pipeline.predict(final_test)
print(predictions)


In [None]:
# Build the submission file: one row per test house with its predicted price.
# The identifier column is named 'Id' to match the test data's own column
# (read here as final_test.Id) and the competition's expected "Id,SalePrice" header.
my_submission = pd.DataFrame({'Id': final_test.Id, 'SalePrice': predictions})

# index=False keeps the DataFrame's row index out of the CSV.
my_submission.to_csv('submission5.csv', index=False)