<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Init" data-toc-modified-id="Init-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Init</a></span></li><li><span><a href="#Notes" data-toc-modified-id="Notes-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Notes</a></span></li><li><span><a href="#Systematic-go-through-stuff" data-toc-modified-id="Systematic-go-through-stuff-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Systematic go through stuff</a></span><ul class="toc-item"><li><span><a href="#Meta-analysis" data-toc-modified-id="Meta-analysis-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Meta analysis</a></span></li><li><span><a href="#Split-data" data-toc-modified-id="Split-data-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Split data</a></span></li><li><span><a href="#Transformer" data-toc-modified-id="Transformer-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Transformer</a></span></li></ul></li></ul></div>

# Init

In [2]:
# Model libs (you can use any of the models below)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor

# Data
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
import os.path

# Text processing
# import nltk
# nltk.download('stopwords')

# Validation/scoring
from sklearn.metrics import mean_squared_error

# Plotting
from matplotlib import pyplot as plt

# Other
from IPython.display import HTML, display

# Config
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth', 100)
y_col = "points"

https://www.kaggle.com/zynicide/wine-reviews


In [3]:


if os.path.isfile("../data/wine.csv"):
    df_src = pd.read_csv("../data/wine.csv", encoding="utf-8", sep=";")
elif os.path.isfile("../data/winemag-data_first150k.json"): 
    df_src = pd.read_json("../data/winemag-data_first150k.json", encoding="utf-8")

#(""../data/winemag-data_first150k.json")

#https://www.kaggle.com/zynicide/wine-reviews#winemag-data-130k-v2.csv

In [4]:
df_src.head()

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,40136,US,"Thickly concentrated in a rich style, this wine also possesses a balanced backbone of citrus and...",,87,16.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Flint & Steel 2016 Sauvignon Blanc (Napa Valley),Sauvignon Blanc,Flint & Steel
1,25383,US,This very substantive wine carries rich and complex flavors on a firm and lively frame. It has a...,Gloria Estate,94,54.0,California,Russian River Valley,Sonoma,Jim Gordon,@gordone_cellars,Freeman 2012 Gloria Estate Pinot Noir (Russian River Valley),Pinot Noir,Freeman
2,55907,South Africa,"Lemon, green gooseberry and plum lead the nose and mouth of this refreshing Sauvignon Blanc. The...",,87,19.0,Walker Bay,,,Lauren Buzzeo,@laurbuzz,Southern Right 2011 Sauvignon Blanc (Walker Bay),Sauvignon Blanc,Southern Right
3,50353,US,"There is an oaky toastiness to the nose of this bottling, which shows light browned butter, hone...",Reserve,92,46.0,California,Paso Robles Willow Creek District,Central Coast,Matt Kettmann,@mattkettmann,Daou 2015 Reserve Chardonnay (Paso Robles Willow Creek District),Chardonnay,Daou
4,105242,Italy,"Aromas of fleshy black-skinned fruit, kitchen spice and underbrush emerge from this blend of Mer...",Le Volte,90,30.0,Tuscany,Toscana,,Kerin O’Keefe,@kerinokeefe,Ornellaia 2012 Le Volte Red (Toscana),Red Blend,Ornellaia


## DS explore
- "meta analysis"
- What pipelines to build, for which coloumns


In [5]:

def ds_explore(df_src,target,cols_rm,**kwargs):
    unique_lim = kwargs.get("unique_lim",50)
    
    target=[target]
    
    cols_num = [i for i in df_src._get_numeric_data().columns if i not in cols_rm + target]
    cols_str = [i for i in df_src.columns if i not in cols_num + cols_rm + target]

    unique_count = df_src[cols_str].nunique()
    

    cols_str_cat = list(unique_count[unique_count<unique_lim].index)
    cols_str_nlp = [i for i in cols_str if i not in cols_str_cat]

    meta = {
        "cols_str_cat" : cols_str_cat
        ,"cols_str_nlp" : cols_str_nlp
        ,"cols_num" : cols_num
        ,"target" : target
    }
    # we now have split the dataset in 3 categories
    # num_cols = numeric columns
    # str_cat = string columns that can be treated as categorical
    # str_nlp = string columns that can be treated as nlp

    return meta
    
# Auto categorize columns
target = ["points"]
cols_rm = ["id"]

meta = ds_explore(df_src=df_src
           ,target =target
           ,cols_rm = cols_rm)

In [6]:
meta

{'cols_num': [u'price'],
 'cols_str_cat': [u'country',
  u'region_2',
  u'taster_name',
  u'taster_twitter_handle'],
 'cols_str_nlp': [u'description',
  u'designation',
  u'province',
  u'region_1',
  u'title',
  u'variety',
  u'winery']}

In [7]:
df_src.isnull().sum()/len(df_src.index)

id                       0.00000
country                  0.00050
description              0.00000
designation              0.28767
points                   0.00000
price                    0.06982
province                 0.00050
region_1                 0.16412
region_2                 0.61133
taster_name              0.20253
taster_twitter_handle    0.24088
title                    0.00000
variety                  0.00000
winery                   0.00000
dtype: float64

## Split data

In [8]:
import scipy
test_size = 0.3
random_state = 1

X_train, X_test, y_train, y_test = train_test_split(
        df_src[meta["cols_num"] + meta["cols_str_cat"] + meta["cols_str_nlp"]],
        df_src[target].values.ravel(),
        test_size = test_size, 
        random_state = random_state
    )

X_train_clean = X_train.dropna()



0

2

## Transformer

In [12]:
# https://stackoverflow.com/questions/54160370/how-to-use-sklearn-column-transformer
# https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

meta["trans_num"] = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

meta["trans_str_cat"] = Pipeline(steps=[
    #('imputer', SimpleImputer(missing_values=[None,float('nan')],strategy='constant', fill_value='missing')),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', meta["trans_num"], meta["cols_num"]),
        ('cat', meta["trans_str_cat"], meta["cols_str_cat"])
    ])


X_train_prep = preprocessor.fit_transform(X_train.copy())

In [13]:
meta

{'cols_num': [u'price'],
 'cols_str_cat': [u'country',
  u'region_2',
  u'taster_name',
  u'taster_twitter_handle'],
 'cols_str_nlp': [u'description',
  u'designation',
  u'province',
  u'region_1',
  u'title',
  u'variety',
  u'winery'],
 'trans_num': Pipeline(memory=None,
      steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
        strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]),
 'trans_str_cat': Pipeline(memory=None,
      steps=[('imputer', SimpleImputer(copy=True, fill_value='missing', missing_values=nan,
        strategy='constant', verbose=0)), ('onehot', OneHotEncoder(categorical_features=None, categories=None,
        dtype=<type 'numpy.float64'>, handle_unknown='ignore',
        n_values=None, sparse=True))])}

In [14]:
#GradientBoostingRegressor
model = RandomForestRegressor(n_estimators=10)


model.fit(X_train_prep,y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [15]:
X_test_prep = preprocessor.transform(X_test)

y_test_model = model.predict(X_test_prep)

mse = mean_squared_error(y_test, y_test_model)
mse


5.468532583863232

In [18]:
df = pd.DataFrame({"target":[1,0,1,0], "a":[1,2,3,4],"b":["a","a","b","b"]})



In [26]:
df[[]].nunique()

0
1
2
3


In [28]:
if not []:
    print(True)

True
