## Contents

- [Model selection](#Model-selection)
- [Importing libraries](#Importing-libraries)
- [Data Cleaning](#Data-Cleaning)
- [Data preprocessing](#Data-preprocessing)
- [Models Regressions](#Models-Regressions)
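    - [Baseline Model](#Baseline-Model)
    - [Linear Regression](#Linear-Regression)
    - [Lasso](#Lasso)
    - [Ridge Regression](#Ridge-Regression)
    - [ElasticNet Regression](#ElasticNet-Regression)
    - [Metrics](#Metrics)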
- [Models Classifications](#Models-Classifications)

## Model selection

<img src="https://scikit-learn.org/stable/_static/ml_map.png" alt="scikit-learn flowchart for choosing the right estimator">

## Importing libraries

In [None]:
# basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, train_test_split

# regressors
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor

# classifiers


# metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the raw dataset into a DataFrame.
# NOTE: the path is an empty placeholder -- set it to the actual CSV file before running.
df = pd.read_csv(filepath_or_buffer="")
# Count of missing values per column (shown as the cell output)
df.isna().sum()

In [None]:
# Show the dtype of every column of df (displayed as the cell output)
df.dtypes

## Data Cleaning

In [None]:
# Imputation of medians
from sklearn.impute import SimpleImputer

# `df_features` was not defined anywhere before this cell. The median is only
# defined for numeric data, so the imputer is fitted on the numeric columns of df.
df_features = df.select_dtypes(include="number")

imputer = SimpleImputer(strategy="median")
# fit() only learns the per-column medians; call imputer.transform(df_features)
# to actually fill the missing values.
imputer.fit(df_features)

In [None]:
# Creating polynomial features
# NOTE(review): in the visible notebook, X is first assigned in a later cell
# ("Data preprocessing"), so on a fresh kernel this cell fails unless it is run
# after that one -- confirm the intended cell order.
from sklearn.preprocessing import PolynomialFeatures

# Transformer built with its default arguments (none are passed here)
poly = PolynomialFeatures()
# The transformed array is only displayed as the cell output; it is not stored
poly.fit_transform(X)

## Data preprocessing

In [None]:
# One hot encoding categorical variables
from sklearn.preprocessing import OneHotEncoder

# select_dtypes() must be given an explicit include/exclude (a bare call raises
# ValueError), and df has to be indexed with column *labels*, not with the
# DataFrame that select_dtypes returns -- hence the trailing `.columns`.
categorical_feature = df.select_dtypes(include=["object", "category"]).columns
cat_encoder = OneHotEncoder()
# Result is displayed as the cell output; it is not stored
cat_encoder.fit_transform(df[categorical_feature])

In [None]:
# Build the feature matrix X and target vector y from the training frame, and
# pick the same feature columns from the test frame.
# Every column except 'target' is treated as a feature.
features = list(filter(lambda col: col != 'target', df_train.columns))
X = df_train.loc[:, features]
y = df_train.loc[:, 'target']
X_test = df_test.loc[:, features]

In [None]:
# Standardizing train and test features
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only ...
ss = StandardScaler().fit(X)
# ... then apply that same fitted scaler to both sets, so the test features are
# transformed with the training statistics rather than their own.
X_train_scaled = ss.transform(X)
X_test_scaled = ss.transform(X_test)

In [None]:
# Splitting scaled train dataset for further verification of model
# train_test_split lives in sklearn.model_selection; the previous import from
# sklearn.preprocessing raised ImportError.
from sklearn.model_selection import train_test_split

# NOTE(review): this rebinds X_test, which an earlier cell set to the df_test
# feature columns -- confirm that overwriting it here is intended.
# A fixed random_state keeps the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_scaled, y, random_state=42
)

## Models Regressions

### Baseline Model

In [None]:
# Baseline model for Regression - always predict the mean of the target
from sklearn.metrics import r2_score, mean_squared_error

# Build the constant prediction vector once; the previous list comprehension
# recomputed np.mean(y) for every single row.
yhat = np.full(len(y), np.mean(y))
test_rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=yhat))
test_r2 = r2_score(y_true=y, y_pred=yhat)
print('--- Baseline model scores ---')
print('Root mean squared error RMSE:', test_rmse)
print('R2:', test_r2)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted without an intercept term (fit_intercept=False)
lr = LinearRegression(fit_intercept=False)
# Fit on the same X that is used for prediction and scoring below; the cell
# previously fitted on `x_matrix`, which is not defined in the visible notebook.
lr.fit(X, y)
print("Coefficients", lr.coef_)
predictions = lr.predict(X)
# The previous print had mismatched quotes and a missing ")" (SyntaxError)
print("Score:", lr.score(X, y))


### Lasso

In [None]:
from sklearn.linear_model import Lasso, LassoCV

# Grid of candidate alphas searched by cross-validation
lasso_alpha = np.arange(0.001, 0.15, 0.0025)
# fit() returns the estimator itself, so construction and fitting can be chained
lasso_model = LassoCV(alphas=lasso_alpha, cv=5).fit(X, y)
# Alpha selected by the 5-fold cross-validation
opt_alpha = lasso_model.alpha_
# Plain Lasso refitted with the selected alpha
lasso_optimal_model = Lasso(alpha=opt_alpha).fit(X, y)
predictions = lasso_optimal_model.predict(X_test)
# Coefficients equal to 0 mark columns the model found useless
lasso_optimal_model.coef_

### Ridge Regression

In [None]:
# The initial `Ridge(alpha=10)` model was dropped: it was overwritten by the
# RidgeCV estimator before ever being fitted.
# Candidate regularisation strengths: 200 values log-spaced from 10**0 to 10**5
r_alpha = np.logspace(0, 5, 200)
# RidgeCV selects the best alpha by cross-validation. `store_cv_values=True` was
# removed because the stored values were never read and the argument is not
# accepted by recent scikit-learn releases.
ridge_model = RidgeCV(alphas=r_alpha)
ridge_model.fit(X, y)
# Keep the selected alpha in a variable: previously it was only displayed, and
# the undefined name `ridge_optimal_alpha` was used on the next line.
ridge_optimal_alpha = ridge_model.alpha_
ridge_optimal = Ridge(alpha=ridge_optimal_alpha)
# Mean cross-validated score of the tuned model (default scorer and folds)
print(cross_val_score(ridge_optimal, X, y).mean())
ridge_optimal.fit(X, y)
predictions = ridge_optimal.predict(X_test)
ridge_optimal.coef_

### ElasticNet Regression

In [None]:
from sklearn.linear_model import ElasticNet, ElasticNetCV

# Grid of candidate alphas searched by cross-validation
enet_alpha = np.arange(0.1, 1, 0.05)
ent_lratio = 0.5  # 50% of Lasso, 50% of Ridge
enet_model = ElasticNetCV(alphas=enet_alpha, l1_ratio=ent_lratio, cv=5)
# Fit on X like the other regressors; `X_overfit` is not defined in the visible notebook
enet_model.fit(X, y)
enet_optimal_alpha = enet_model.alpha_
# Refit an ElasticNet with the selected alpha. The cell previously rebuilt the
# Lasso model here (copy-paste), so the tuned ElasticNet alpha was never used.
enet_optimal_model = ElasticNet(alpha=enet_optimal_alpha, l1_ratio=ent_lratio)
enet_optimal_model.fit(X, y)
predictions = enet_optimal_model.predict(X_test)

### Metrics

In [None]:
def metrics_function(y, predictions, n_features=None):
    """Summarise common regression metrics as one formatted string.

    Parameters
    ----------
    y : array-like
        True target values.
    predictions : array-like
        Predicted values, aligned element-wise with ``y``.
    n_features : int, optional
        Number of predictors the model used. Needed for the adjusted R2;
        when omitted, or when there are too few samples for the correction
        to be defined, the adjusted R2 is reported as NaN.

    Returns
    -------
    str
        MAE, RSS, MSE, RMSE, R2 and adjusted R2 in a single string.
    """
    # Mean Absolute Error
    mae = metrics.mean_absolute_error(y, predictions)
    # Residual Sum of Squares
    rss = ((y - predictions) ** 2).sum()
    # Mean Squared Error
    mse = metrics.mean_squared_error(y, predictions)
    # Root Mean Squared Error
    rmse = mse ** 0.5
    # Coefficient of Determination
    r = metrics.r2_score(y, predictions)
    # Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1).
    # Computed inline because the `r2_adj` helper that was called here is not
    # defined in the visible notebook, and the correction needs the predictor
    # count, which y and predictions alone cannot provide.
    n_samples = len(y)
    if n_features is None or n_samples - n_features - 1 <= 0:
        r_adj = float("nan")
    else:
        r_adj = 1 - (1 - r) * (n_samples - 1) / (n_samples - n_features - 1)
    return f'''\
    Mean Absolute Error: {mae},\
    Residual Sum of Squares: {rss},\
    Mean Squared Error: {mse},\
    Root Mean Squared Error: {rmse},\
    Coefficient of Determination R2: {r},\
    Adjusted R2: {r_adj}.\
    '''

## Models Classifications

In [None]:
# RandomForestRegressor was used without ever being imported
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 6 (2×3) combinations of hyperparameters
    # NOTE(review): integer max_features values presumably require X to have at
    # least 10 columns -- adjust the grid to the dataset.
    {'n_estimators': [5, 10], 'max_features': [6, 8, 10]}]

forest_reg = RandomForestRegressor(random_state=42)
# train across 3 folds, that's a total of 6*3=18 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=3,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
# Fit on this notebook's own X / y; `housing_prepared` and `housing_labels` are
# not defined anywhere in the visible notebook.
grid_search.fit(X, y)