<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Init" data-toc-modified-id="Init-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Init</a></span></li><li><span><a href="#Notes" data-toc-modified-id="Notes-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Notes</a></span></li><li><span><a href="#Systematic-go-through-stuff" data-toc-modified-id="Systematic-go-through-stuff-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Systematic go through stuff</a></span><ul class="toc-item"><li><span><a href="#Meta-analysis" data-toc-modified-id="Meta-analysis-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Meta analysis</a></span></li><li><span><a href="#Split-data" data-toc-modified-id="Split-data-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Split data</a></span></li><li><span><a href="#Transformer" data-toc-modified-id="Transformer-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Transformer</a></span></li></ul></li></ul></div>

# Init

In [2]:
# Model libs (you can use any of the models below)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor

# Data
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re

# Text processing
import nltk
nltk.download('stopwords')

# Validation/scoring
from sklearn.metrics import mean_squared_error

# Plotting
from matplotlib import pyplot as plt

# Other
from IPython.display import HTML, display

# Config
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth', 100)
y_col = "points"

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/epedersen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


https://www.kaggle.com/zynicide/wine-reviews


In [4]:
df_src = pd.read_csv("../data/wine.csv", encoding="utf-8", sep=";")


## DS explore
- "meta analysis"
- What pipelines to build, for which coloumns


In [22]:

def ds_explore(df_src,target,cols_rm):


    cols_num = [i for i in df_src._get_numeric_data().columns if i not in cols_rm + target]
    cols_str = [i for i in df_src.columns if i not in cols_num + cols_rm + target]

    unique_count = df_src[cols_str].nunique()
    unique_lim = 50

    cols_str_cat = list(unique_count[unique_count<unique_lim].index)
    cols_str_nlp = [i for i in cols_str if i not in cols_str_cat]

    meta = {
        "cols_str_cat" : cols_str_cat
        ,"cols_str_nlp" : cols_str_nlp
        ,"cols_num" : cols_num
    }
    # we now have split the dataset in 3 categories
    # num_cols = numeric columns
    # str_cat = string columns that can be treated as categorical
    # str_nlp = string columns that can be treated as nlp

    return meta
    
# Auto categorize columns
target = ["points"]
cols_rm = ["id"]

meta = ds_explore(df_src=df_src
           ,target =target
           ,cols_rm = cols_rm)

In [21]:
meta

{'cols_str_cat': ['country',
  'region_2',
  'taster_name',
  'taster_twitter_handle'],
 'cols_str_nlp': ['description',
  'designation',
  'province',
  'region_1',
  'title',
  'variety',
  'winery'],
 'cols_num': ['price']}

## Split data

In [8]:
import scipy
test_size = 0.3
random_state = 1

X_train, X_test, y_train, y_test = train_test_split(
        df_src[cols_num+cols_str],
        df_src[target].values.ravel(),
        test_size = test_size, 
        random_state = random_state
    )



## Transformer

In [9]:
# https://stackoverflow.com/questions/54160370/how-to-use-sklearn-column-transformer
# https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

meta["trans_num"] = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

meta["trans_str_cat"] = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', meta["trans_num"], meta["cols_num"]),
        ('cat', meta["trans_str_cat"], meta["cols_str_cat"])])



X_train_prep = preprocessor.fit_transform(X_train.copy())

In [15]:
meta

{'cols_str_cat': ['country',
  'region_2',
  'taster_name',
  'taster_twitter_handle'],
 'cols_str_nlp': ['description',
  'designation',
  'province',
  'region_1',
  'title',
  'variety',
  'winery'],
 'cols_num': ['price'],
 'trans_num': Pipeline(memory=None,
      steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
        strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]),
 'trans_str_cat': Pipeline(memory=None,
      steps=[('imputer', SimpleImputer(copy=True, fill_value='missing', missing_values=nan,
        strategy='constant', verbose=0)), ('onehot', OneHotEncoder(categorical_features=None, categories=None,
        dtype=<class 'numpy.float64'>, handle_unknown='ignore',
        n_values=None, sparse=True))])}

In [11]:
#GradientBoostingRegressor
model = RandomForestRegressor(n_estimators=10)


model.fit(X_train_prep,y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [19]:
X_test_prep = preprocessor.transform(X_test)

y_test_model = model.predict(X_test_prep)

mse = mean_squared_error(y_test, y_test_model)
mse


5.4652023256787725

5.4652023256787725