In [2]:
# Imports, data loading and preprocessing setup

import pandas as pd 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn import linear_model

# Read the training and test sets, using the "Id" column as the index
df_train = pd.read_csv("train.csv", index_col = "Id")
df_test = pd.read_csv("test.csv", index_col = "Id")

# Separate the target (SalePrice) from the predictors
y = df_train.SalePrice
X_full = df_train.drop(['SalePrice'], axis = 1).copy()

# Categorical features: object-dtype columns with fewer than 10 distinct values

categorical_cols = [fea for fea in X_full.columns if X_full[fea].nunique() < 10 and X_full[fea].dtype == 'object']

# Numerical features: int64 / float64 columns

numerical_cols = [fea for fea in X_full.columns if X_full[fea].dtype in ['int64', 'float64']]

# Feature matrix consumed by the cross-validation and final-fit cells.
# It has to be defined here: those cells read `X`, and without this line they
# raise a NameError on a fresh kernel (Restart & Run All).

X = X_full[categorical_cols + numerical_cols].copy()

# Preprocessing for numerical data: fill missing values with a constant
# (no fill_value is given, so SimpleImputer uses its default, 0 for numeric data)

numerical_transformer = SimpleImputer(strategy = 'constant')

# Preprocessing for categorical data: impute the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at transform time

categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

# Bundle preprocessing for numerical and categorical data; each transformer is
# applied only to the columns listed for it

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [3]:
# Make function for grid search

def get_score(alpha_estim):
    """Return the mean 4-fold cross-validation score of a Lasso pipeline.

    The pipeline chains the notebook-level ``preprocessor`` with a
    ``linear_model.Lasso`` model and is evaluated on the notebook-level
    ``X`` and ``y``. No ``scoring`` argument is passed to
    ``cross_val_score``, so the estimator's default scorer is used
    (R^2 for scikit-learn regressors), meaning higher is better.

    Keyword argument:
    alpha_estim -- regularisation strength passed to Lasso as ``alpha``
    """
    my_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', linear_model.Lasso(alpha = alpha_estim))
    ])

    # One score per fold; the caller only needs their average
    scores = cross_val_score(my_pipeline, X, y, cv = 4)

    return scores.mean()

In [4]:
# Grid searching 

# Candidate regularisation strengths, each scored with get_score
alphas = [0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500]
result = list(map(get_score, alphas))

# Tabulate every alpha next to its mean cross-validation score
grid_df = pd.DataFrame({'alpha': alphas, 'result': result})

In [5]:
# Plot the grid-search scores to find the best alpha

import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected = True )
import plotly.express as px

# Refer to the grid_df columns by name rather than passing the raw lists:
# the frame is then actually used, and the axes and hover labels read
# "alpha" / "result" instead of generic placeholders.
fig = px.scatter(
    grid_df,
    x = 'alpha',
    y = 'result',
    title = 'Lasso grid search: mean CV score per alpha',
)
fig.show()

In [6]:
# Make predictions 

# Final model: the shared preprocessor followed by Lasso with alpha fixed at 200
# (presumably the value read off the grid-search plot; confirm before reuse)
best_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', linear_model.Lasso(alpha = 200)),
])

# Train on the full training data
best_pipeline.fit(X, y)

# Predict on the test set; the preprocessor selects the columns it needs by name
preds_test = best_pipeline.predict(df_test)

# Write the submission file: one row per test Id with its predicted SalePrice
submission_columns = {'Id': df_test.index, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)