In [1]:
import os

import dill
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
#working with text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import train_test_split
#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion

In [2]:
# Load the chocolate ratings table; later cells derive everything from this frame.
choc_ratings_df = pd.read_csv('./input/chocolate_ratings.csv')

# The 'Rating' column is the regression target.
target = choc_ratings_df['Rating']

# Preview the first six rows.
choc_ratings_df.head(n=6)

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.5
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C","chewy, off, rubbery",3.0
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C","fatty, earthy, moss, nutty,chalky",3.0
5,2546,5150,U.S.A.,2021,Uganda,"Semuliki Forest, batch 1",80%,"3- B,S,C","mildly bitter, basic cocoa, fatty",3.25


In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
choc_ratings_df.shape

(2530, 10)

In [4]:
# Print per-column dtypes and non-null counts for the loaded frame.
choc_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2530 entries, 0 to 2529
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   REF                               2530 non-null   int64  
 1   Company (Manufacturer)            2530 non-null   object 
 2   Company Location                  2530 non-null   object 
 3   Review Date                       2530 non-null   int64  
 4   Country of Bean Origin            2530 non-null   object 
 5   Specific Bean Origin or Bar Name  2530 non-null   object 
 6   Cocoa Percent                     2530 non-null   object 
 7   Ingredients                       2443 non-null   object 
 8   Most Memorable Characteristics    2530 non-null   object 
 9   Rating                            2530 non-null   float64
dtypes: float64(1), int64(2), object(7)
memory usage: 197.8+ KB


In [5]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): choc_ratings_df still contains the 'Rating' column, so the saved
# X_train / X_test files carry the target as well. The transformers defined below pick
# their feature columns explicitly, so the target is not fed to the model.
X_train, X_test, y_train, y_test = train_test_split(
    choc_ratings_df, target, test_size=0.33, random_state=6
)

# to_csv does not create missing directories, so make sure the output folder exists.
os.makedirs("./output", exist_ok=True)

# save test (index=False: the row index is not written to the files)
X_test.to_csv("./output/X_test.csv", index=False)
y_test.to_csv("./output/y_test.csv", index=False)
# save train
X_train.to_csv("./output/X_train.csv", index=False)
y_train.to_csv("./output/y_train.csv", index=False)

In [6]:
cat_columns = ['Company (Manufacturer)', 'Company Location', 'Country of Bean Origin',
               'Specific Bean Origin or Bar Name']

# to_csv does not create missing directories, so make sure the lookup folder exists.
os.makedirs("./cat_index", exist_ok=True)

# Convert categorical features to numeric ones: for every categorical column write a
# lookup table that maps each distinct value to an integer index.
# NOTE(review): the lookup is built from the full frame (choc_ratings_df), i.e. it also
# covers values that only occur in the held-out rows.
for cat_col in cat_columns:
    indexing_data = choc_ratings_df[cat_col].unique()
    # The single quotes are part of the file name; FeatureGenetator.categorizer reads
    # the very same path, so the two must stay in sync.
    file_path = f"./cat_index/'{cat_col}'.csv"
    indexing_df = pd.DataFrame(indexing_data, columns=[cat_col])
    # The positional index of each distinct value becomes its numeric code.
    indexing_df[cat_col + '_idx'] = indexing_df.index
    indexing_df.to_csv(file_path, index=False)

In [7]:
class DataPreprocessing(BaseEstimator, TransformerMixin):
    """Stateless cleaning step: fill gaps in text columns and parse 'Cocoa Percent'.

    ``fit`` learns nothing; ``transform`` returns a cleaned copy of its input and
    leaves the caller's frame untouched.
    """

    def __init__(self):
        self.cat_columns = ['Company (Manufacturer)', 'Company Location', 'Country of Bean Origin',
                            'Specific Bean Origin or Bar Name']
        self.num_columns = ['Cocoa Percent', 'Review Date']

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the step can be used inside a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with text gaps filled and 'Cocoa Percent' numeric.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the categorical columns listed in ``self.cat_columns``
            and a 'Cocoa Percent' column.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` where missing values in the text columns are replaced by
            ``''`` and 'Cocoa Percent' is converted to a number.

        Raises
        ------
        ValueError
            If a 'Cocoa Percent' value cannot be parsed as a number after the
            trailing '%' is removed (raised by ``pd.to_numeric``).
        """
        X_ = X.copy()
        # Fill missing values in the text columns.
        text_columns = self.cat_columns + ['Cocoa Percent']
        X_[text_columns] = X_[text_columns].fillna('')
        # Convert the column to a number by dropping the trailing '%' sign.
        # astype(str) keeps this working when a value is already numeric (a plain
        # x.rstrip(...) would raise AttributeError on non-string cells).
        cocoa_text = X_['Cocoa Percent'].astype(str).str.rstrip('%')
        X_['Cocoa Percent'] = pd.to_numeric(cocoa_text)

        return X_

In [8]:
class FeatureGenetator(BaseEstimator, TransformerMixin):
    """Build the numeric feature frame: binary flags, category indexes, numeric columns.

    ``fit`` learns nothing. ``transform`` reads the per-column lookup tables from
    ``./cat_index/'<column>'.csv`` on every call, so those files must exist before
    the transformer is used.
    """

    def __init__(self):
        self.cat_columns = ['Company (Manufacturer)', 'Company Location', 'Country of Bean Origin',
                            'Specific Bean Origin or Bar Name']
        self.num_columns = ['Cocoa Percent', 'Review Date']
        # Name of each binary feature -> boolean row mask (filled in by binarizer).
        self.ptn = {'Bean Origin Unknown': None,
                    'Bean Bar Unknown': None}
        # Names of the columns returned by transform, rebuilt on every call.
        self.all_columns = []

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the step can be used inside a Pipeline."""
        return self

    # Create the binary features.
    def binarizer(self, X):
        """Add one 0.0/1.0 column per entry of ``self.ptn`` to ``X`` (modified in place).

        'Bean Origin Unknown' is set where the bean origin contains 'Blend' or is an
        empty string; 'Bean Bar Unknown' is set where the bar name equals the bean
        origin. The column names are appended to ``self.all_columns``.
        """
        bean_origin = X['Country of Bean Origin']
        # .str.len must be *called*: comparing the bound method itself with 0 is
        # always False, which silently disabled the empty-string check.
        # na=False keeps the mask strictly boolean if a missing value slips through.
        self.ptn['Bean Origin Unknown'] = (bean_origin.str.contains('Blend', na=False)) | \
                                          (bean_origin.str.len() == 0)
        self.ptn['Bean Bar Unknown'] = X['Specific Bean Origin or Bar Name'] == bean_origin
        for bin_col, pattern in self.ptn.items():
            self.all_columns.append(bin_col)
            X[bin_col] = 0.0
            X.loc[pattern, bin_col] = 1.0

    # Create the categorical features (the lookup files need to be refreshed from time to time).
    def categorizer(self, X):
        """Return ``X`` with a ``<column>_idx`` code column for every categorical column.

        Each lookup table is left-joined on the categorical column; values missing
        from the lookup get the code ``-1``. Because the join is done on columns,
        the returned frame does not keep the index of ``X``.
        """
        X_ = X
        for cat_col in self.cat_columns:
            cat_file_path = f"./cat_index/'{cat_col}'.csv"
            cat_df = pd.read_csv(cat_file_path)
            X_ = X_.merge(cat_df, how='left', on=cat_col)
            idx_col = cat_col + '_idx'
            self.all_columns.append(idx_col)
            # Assign the result back instead of calling fillna(inplace=True) on the
            # selected column: the chained in-place form is deprecated and has no
            # effect under pandas Copy-on-Write.
            X_[idx_col] = X_[idx_col].fillna(-1)
        return X_

    def transform(self, X):
        """Return a new frame with the binary, category-index and numeric columns.

        The input ``X`` is copied first and is not modified.
        """
        X_ = X.copy()
        self.all_columns = []

        self.binarizer(X_)
        new_df = self.categorizer(X_)
        self.all_columns.extend(self.num_columns)

        return new_df[self.all_columns]

In [9]:
# Helper classes for the TF-IDF features.
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pick a single column out of the incoming frame.

    Used ahead of a vectorizer so that it receives only the selected column.
    """

    def __init__(self, column):
        # Key used to index the input in transform.
        self.column = column

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted and ignored."""
        selected = X[self.column]
        return selected
    
class TextImputer(BaseEstimator, TransformerMixin):
    """Replace missing values of one column with a constant.

    Parameters
    ----------
    key : str
        Name of the column to impute.
    value : object
        Replacement used for missing entries of that column.
    """

    def __init__(self, key, value):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``key`` column has its gaps filled with ``value``.

        The work is done on a copy so the caller's frame is left untouched. Writing
        into ``X`` directly would change the data passed to ``fit``/``predict`` and
        the frame shared by the other FeatureUnion branches.
        """
        X_ = X.copy()
        X_[self.key] = X_[self.key].fillna(self.value)
        return X_

In [10]:
# Three feature branches that FeatureUnion evaluates on the same input and
# concatenates column-wise:
#   common_feats     - cleaned numeric / binary / category-index columns
#   memory_feat      - TF-IDF of 'Most Memorable Characteristics'
#   ingredients_feat - TF-IDF of 'Ingredients'
pipes = [
    ('common_feats',
     Pipeline([('preproc', DataPreprocessing()),
               ('features_gen', FeatureGenetator())])),
    ('memory_feat',
     Pipeline([('imputer', TextImputer('Most Memorable Characteristics', '')),
               ('memory_selector', FeatureSelector(column='Most Memorable Characteristics')),
               ('memory_tfidf', TfidfVectorizer())])),
    ('ingredients_feat',
     Pipeline([('imputer', TextImputer('Ingredients', '')),
               ('ingredients_selector', FeatureSelector(column='Ingredients')),
               ('ingredients_tfidf', TfidfVectorizer())])),
]

feats = FeatureUnion(pipes)

In [11]:
# Removed a leftover `?GradientBoostingRegressor` help lookup: the class is not imported
# in the cells above, so the lookup only reported that the object was not found.


In [12]:
# Random-forest regressor used as the final pipeline step; random_state pins the seed.
model = RandomForestRegressor(
    n_estimators=600,
    max_depth=41,
    min_samples_split=20,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.005,
    random_state=23,
)


In [13]:
# End-to-end estimator: feature union first, then the regressor.
pipeline_steps = [('features_gen', feats), ('regressor', model)]
pipeline = Pipeline(pipeline_steps)

In [14]:
# Fit the feature branches and the regressor on the training split.
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('features_gen',
                 FeatureUnion(transformer_list=[('common_feats',
                                                 Pipeline(steps=[('preproc',
                                                                  DataPreprocessing()),
                                                                 ('features_gen',
                                                                  FeatureGenetator())])),
                                                ('memory_feat',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='Most '
                                                                                  'Memorable '
                                                                                  'Characteristics',
                                                                              value='')),
                                                

In [15]:
# Serialize the fitted pipeline with dill so it can be loaded again later.
with open("./output/choco_pipeline.dill", "wb") as pipeline_file:
    dill.dump(pipeline, pipeline_file)

In [16]:
# Score the fitted pipeline on the held-out split. The metric is computed from
# X_test / y_test, so it is reported as a test score (the label used to say "Train").
# r2 is imported in the notebook's import cell.
predictions = pipeline.predict(X_test)
print(f"Test R2:\t {r2(y_test, predictions):.3}\n")

Train R2:	 0.315

