{{ badge }}

<a href="https://colab.research.google.com/github/farishijazi/ai-ml-dl-course/blob/master/1_b_housing_eda_fit.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg"></a>


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from sklearn.datasets import fetch_openml

# Load the house prices data from OpenML (fetch_openml is a convenience loader
# for well-known public datasets); as_frame=True returns pandas objects.
dataset = fetch_openml('house_prices', as_frame=True)

# Work on a copy so the fetched frame stays untouched, and keep the target
# next to the features so the exploratory plots can use a single frame.
data = dataset.data.copy()
data['SalePrice'] = dataset.target

# reduce the number of features (you can uncomment this line)
# data = data[["MSSubClass", "MSZoning", "LotArea", "LotConfig", "BldgType", "OverallCond", "YearBuilt", "YearRemodAdd", "Exterior1st", "BsmtFinSF2", "TotalBsmtSF", "SalePrice", ]]

# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
X = data.drop(columns=['SalePrice']) # the input features (questions)
y = data['SalePrice'] # the output feature (answer)

data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [None]:
# Distribution of the target variable.
# `sns.distplot` has been deprecated since seaborn 0.11; `histplot` with
# kde=True and stat='density' draws the equivalent histogram plus density curve.
plt.figure(figsize=(20, 10))
sns.histplot(data['SalePrice'], color='g', bins=100, alpha=0.4, kde=True, stat='density');

In [None]:
# Split the feature names by storage type: numeric columns are used for the
# correlation matrix and mean imputation, object (string) columns are the ones
# that get one-hot encoded.
num_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
obj_cols = X.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Pairwise correlation between the numeric features, drawn as an annotated heatmap
corr_matrix = data[num_cols].corr()

plt.figure(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', linewidths=2)

In [None]:
# One histogram per numeric column of `data` (float64 / int64 dtypes only)
numeric_dtypes = ['float64', 'int64']
num_data = data.select_dtypes(include=numeric_dtypes)
num_data.hist(bins=50, figsize=(20, 20), xlabelsize=8, ylabelsize=8);

In [None]:
# Scatter every numeric column against SalePrice, five columns per figure row
cols_per_row = 5
numeric_names = num_data.columns

for start in range(0, len(numeric_names), cols_per_row):
    chunk = numeric_names[start:start + cols_per_row]
    sns.pairplot(data=num_data, x_vars=chunk, y_vars=['SalePrice'])

In [None]:
# Count how often each string value occurs, one bar chart per object column,
# laid out on a grid with three charts per row.
data_obj = data.select_dtypes(include = ['object'])
n_grid_cols = 3
# Ceiling division: round() rounds down (e.g. 4 columns / 3 -> 1 row), which
# left the trailing column(s) without an axis. max(..., 1) keeps plt.subplots
# valid even when there are no object columns at all.
n_grid_rows = max(-(-len(data_obj.columns) // n_grid_cols), 1)
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 15))

for i, ax in enumerate(fig.axes):
    if i < len(data_obj.columns):
        sns.countplot(x=data_obj.columns[i], alpha=0.7, data=data_obj, ax=ax)
        # Rotate the labels after plotting; doing it beforehand operated on the
        # still-empty axes instead of the category labels.
        ax.tick_params(axis='x', labelrotation=45)
    else:
        # Hide the leftover cells of the grid that have no column to show.
        ax.set_visible(False)

fig.tight_layout()

In [None]:
## Preprocessing: fill missing values, then one-hot encode the string columns
# - numeric columns: missing values are replaced by the column mean
# - object columns:  missing values are replaced by the most frequent value,
#                    then every category becomes its own 0/1 column
# this part is optional, try skipping it and see what happens
#
# NOTE: the transformers are fitted on all rows, i.e. before the train/test
# split, so test rows influence the imputed values. That is fine for a demo,
# but fit on the training split only when the evaluation has to be strict.

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

num_pipe = Pipeline(steps = [('impute', SimpleImputer(strategy='mean'))])
obj_pipe = Pipeline(steps = [('impute', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder())])
# sparse_threshold=0 makes the transformer always return a dense numpy array.
# With the default threshold the output is sparse only when the result is
# sparse enough, so an unconditional `.toarray()` fails on the dense case
# (for example after reducing the number of features).
preprocess = ColumnTransformer([
    ('number', num_pipe, num_cols),
    ('object', obj_pipe, obj_cols)
], sparse_threshold=0)
# Transform the columns taken from `data` instead of feeding `X` back into
# itself: the cell can then be re-run without failing on an already
# transformed array.
X = preprocess.fit_transform(data[num_cols + obj_cols])

In [None]:
# splitting train and test data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split identical on every run, so results stay comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# (the former `X_train = X_train`-style lines were no-ops and have been removed)
X_train.shape, y_train.shape

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a linear regression model on the training split.
# `fit` returns the estimator itself, so the trailing `reg` produces the same
# cell output as calling `reg.fit(...)` on the last line.
reg = LinearRegression().fit(X_train, y_train)
reg

In [None]:
# Evaluate on the held-out test split; for scikit-learn regressors `score`
# returns the coefficient of determination R^2 (1.0 means a perfect fit).
reg.score(X_test,y_test)

In [None]:
# Compare the linear model's predictions with the actual values of the test split
y_pred = reg.predict(X_test)

pred_fig, pred_ax = plt.subplots()
pred_ax.scatter(y_test, y_pred, alpha=0.4)
pred_ax.set_xlabel('Actual')
pred_ax.set_ylabel('Predicted')
pred_ax.set_title('Actual vs Predicted')
plt.show()

In [None]:
from sklearn import ensemble
# Gradient-boosted regression trees.
# The loss is deliberately left at its default (squared error): the old alias
# 'ls' was deprecated in scikit-learn 1.0 and removed in 1.2, while the new
# name 'squared_error' is not understood by releases older than 1.0.
# random_state pins the estimator's internal randomness for repeatable runs.
clf = ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, random_state = 42)

In [None]:
# Train the gradient boosting regressor on the training split
clf.fit(X_train, y_train)

In [None]:
# R^2 on the held-out test split; compare it with the linear model's score
clf.score(X_test,y_test)

In [None]:
# Predictions of the gradient boosting model for the test split
y_pred = clf.predict(X_test)

# plot predictions vs actual values
pred_fig, pred_ax = plt.subplots()
pred_ax.scatter(y_test, y_pred, alpha=0.4)
pred_ax.set_xlabel('Actual')
pred_ax.set_ylabel('Predicted')
pred_ax.set_title('Actual vs Predicted')
plt.show()

## Going further

We have only scratched the surface of Machine Learning.

There are multiple methods to improve a Machine Learning model:

- [Normalize data](https://inside-machinelearning.com/en/normalize-your-data/)
- [Cross-Validation](https://inside-machinelearning.com/en/cross-validation-tutorial/)
- [Changing the model's hyperparameters](https://inside-machinelearning.com/en/decision-tree-and-hyperparameters/)
- [Data Augmentation](https://inside-machinelearning.com/en/data-augmentation-csv-data/)
- [Ensemble methods](https://inside-machinelearning.com/en/ensemble-methods/)

