In [1]:
from rampwf.workflows import FeatureExtractor
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from process_data import prep_medium
import numpy as np


def medium_extraction(X):
    """Rewrite the free-text ``Medium`` column as a canonical keyword string.

    A ``CountVectorizer`` (English stop words removed, vocabulary capped at
    24 terms) is fitted on the column. Each row is then replaced by the
    alphabetically sorted, comma-joined list of vocabulary terms it contains.
    Rows that contain none of the retained terms end up as ``pd.NA``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding a text ``'Medium'`` column. The column is overwritten
        in place with the normalised values; no other column is touched.

    Returns
    -------
    pandas.Series
        The rewritten ``X['Medium']`` column.
    """
    val_medium = X['Medium'].values
    sep = ","

    vectorizer = CountVectorizer(stop_words="english", max_features=24)
    # fit_transform avoids tokenising the whole corpus a second time.
    vectorized_input = vectorizer.fit_transform(val_medium)
    # One array of retained vocabulary terms per input row.
    inv_transform = vectorizer.inverse_transform(vectorized_input)

    # Sorting makes the keyword string independent of the term order in the text.
    X['Medium'] = [sep.join(sorted(terms)) for terms in inv_transform]

    # Only the Medium cell is blanked. A bare boolean-mask assignment
    # (``X[mask] = pd.NA``) would set *every* column of those rows to NA.
    X.loc[X['Medium'] == "", 'Medium'] = pd.NA

    return X['Medium']

# Ordinal code assigned to each historical-period label; shared by all encoders.
_PERIOD_TO_NUM = {"Contemporary Era": 1., "Modern Times": 2., "Middle Ages": 3., "Antiquity": 4.}


def _mean_target_encode(df, column, target=None):
    """Replace every category of ``df[column]`` by the mean period code of its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str
        Name of the categorical column to encode.
    target : array-like of str, optional
        Period label for each row of ``df``, in row order. When omitted, the
        notebook-level global ``y`` is used, which must be defined before
        the first call.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), 1)``. Rows whose category is missing
        are encoded as ``NaN``.

    Raises
    ------
    KeyError
        If ``target`` contains a label that is not in ``_PERIOD_TO_NUM``.
    ValueError
        If ``target`` and ``df`` do not have the same number of rows.

    Notes
    -----
    NOTE(review): the target is read at transform time, so the encoding can
    only be computed for rows that already have a label.
    """
    if target is None:
        # Backward-compatible default: read the target from the notebook namespace.
        target = y
    num_period = np.array([_PERIOD_TO_NUM[period] for period in np.array(target)], dtype=float)

    # Work on a private frame built from positional values, so the caller's
    # frame is left untouched and the caller's index plays no role.
    frame = pd.DataFrame({"category": df[column].to_numpy(), "num_period": num_period})
    # Only the per-category mean is needed, not the full describe() table.
    category_means = frame.groupby("category")["num_period"].mean()
    # Missing categories are absent from the group means and therefore map to NaN.
    encoded = frame["category"].map(category_means)
    return encoded.to_numpy(dtype=float).reshape(-1, 1)


def mean_target_encoding_classif(df, target=None):
    """Mean-target-encode the ``'Classification'`` column; see ``_mean_target_encode``."""
    return _mean_target_encode(df, "Classification", target)


def mean_target_encoding_medium(df, target=None):
    """Mean-target-encode the ``'Medium'`` column; see ``_mean_target_encode``."""
    return _mean_target_encode(df, "Medium", target)


def mean_target_encoding_culture(df, target=None):
    """Mean-target-encode the ``'Culture'`` column; see ``_mean_target_encode``."""
    return _mean_target_encode(df, "Culture", target)

# Wrap the plain feature functions so they can be used as scikit-learn transformers.
medium_extraction_transformer = FunctionTransformer(func=medium_extraction, validate=False)
MTE_classif = FunctionTransformer(func=mean_target_encoding_classif, validate=False)
MTE_medium = FunctionTransformer(func=mean_target_encoding_medium, validate=False)
MTE_culture = FunctionTransformer(func=mean_target_encoding_culture, validate=False)

# Medium: target-encode first, then replace any missing encoding with -1.
_medium_pipeline = make_pipeline(
    MTE_medium,
    SimpleImputer(strategy='constant', fill_value=-1),
)

# One output column per categorical feature, in this order:
# Medium, Classification, Culture.
preprocessor = ColumnTransformer(transformers=[
    ('medium_extrac', _medium_pipeline, ['Medium']),
    ('mte_classif', MTE_classif, ['Classification']),
    ('mte_culture', MTE_culture, ['Culture']),
])


In [23]:
# Load the full museum dataset; features and target are still in one frame here.
X = pd.read_csv("data/museum_data.csv")

In [24]:
# Inspect the available column names.
X.columns

Index(['Unnamed: 0', 'Medium', 'Classification', 'Culture',
       'Object Begin Date', 'Historical Period', 'height', 'diam', 'width',
       'depth'],
      dtype='object')

In [25]:
# Target labels. The mean_target_encoding_* functions read this global `y`
# directly, so it must exist before the preprocessor is fitted or applied.
y = X['Historical Period']

In [26]:
# Keep only the feature columns. The .copy() makes X an independent frame, so
# the column assignments in later cells do not write into a slice of the raw data.
X = X[['Classification', 'Culture', 'height', 'diam', 'width', 'depth']].copy()

In [27]:
# Sanity check: (n_rows, n_columns) after the column selection.
X.shape

(106699, 6)

In [28]:
# Read the same CSV a second time; this frame is used to recover the 'Medium' column.
rr = pd.read_csv("data/museum_data.csv")

In [29]:
# Drop the rows of rr whose 'Medium' value is missing.
rr = rr.dropna(axis=0, subset=['Medium'])

In [30]:
# Values are assigned by position, not by index label. They only line up with
# the rows of X if the dropna above removed nothing; if rows were removed, the
# array is shorter than X and pandas raises a length-mismatch ValueError.
X['Medium'] = rr['Medium'].values[:len(X)]

In [31]:
# Check dtypes and non-null counts after adding 'Medium'.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106699 entries, 0 to 106698
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Classification  106699 non-null  object 
 1   Culture         106699 non-null  object 
 2   height          106699 non-null  float64
 3   diam            106699 non-null  float64
 4   width           106699 non-null  float64
 5   depth           106699 non-null  float64
 6   Medium          106699 non-null  object 
dtypes: float64(4), object(3)
memory usage: 5.7+ MB


In [32]:
# Normalise the raw 'Medium' text; medium_extraction also writes into X itself.
X['Medium'] = medium_extraction(X)
# Fit the column-wise preprocessing; the encoding functions read the
# notebook-level `y` directly.
preprocessor.fit(X, y)

ColumnTransformer(transformers=[('medium_extrac',
                                 Pipeline(steps=[('functiontransformer',
                                                  FunctionTransformer(func=<function mean_target_encoding_medium at 0x000001F6017DCAF0>)),
                                                 ('simpleimputer',
                                                  SimpleImputer(fill_value=-1,
                                                                strategy='constant'))]),
                                 ['Medium']),
                                ('mte_classif',
                                 FunctionTransformer(func=<function mean_target_encoding_classif at 0x000001F6017DCB80>),
                                 ['Classification']),
                                ('mte_culture',
                                 FunctionTransformer(func=<function mean_target_encoding_culture at 0x000001F6017DCC10>),
                                 ['Culture'])])

In [33]:
# Fit again and transform. Result columns follow the transformer order
# declared in `preprocessor`: Medium, Classification, Culture.
prep_class_cult = preprocessor.fit_transform(X)

In [34]:
# Column 0 of the transformed array is the encoded Medium (first transformer).
med = prep_class_cult[:,0]
X['Medium'] = med

In [35]:
# Check dtype and non-null count of 'Medium' after it was overwritten with its encoding.
X[['Medium']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106699 entries, 0 to 106698
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Medium  106699 non-null  float64
dtypes: float64(1)
memory usage: 833.7 KB


In [36]:
# Overwrite the three categorical columns with their encoded values; the column
# list matches the order of the columns in prep_class_cult.
X[['Medium','Classification', 'Culture']]= prep_class_cult 

In [37]:
# Number of distinct values per column after the encoded values were written back.
X[['Classification', 'Culture']].nunique()

Classification    107
Culture           232
dtype: int64

In [38]:
# Display the frame after encoding.
X

Unnamed: 0,Classification,Culture,height,diam,width,depth,Medium
0,1.966078,1.656506,296.5,-1.0,261.6,-1.0,1.854215
1,2.295775,2.000000,12.07,-1.0,11.43,-1.0,2.927883
2,1.981467,1.871935,-1.0,-1.0,-1.0,105.5,1.666667
3,2.295775,1.886682,-1.0,-1.0,-1.0,14.5,1.914692
4,1.848797,1.871935,-1.0,8.3,-1.0,-1.0,2.166667
...,...,...,...,...,...,...,...
106694,1.000000,1.871935,22.8,-1.0,30.5,39.4,1.702391
106695,3.999804,3.997108,6.7,-1.0,3.8,9.5,3.875496
106696,2.392888,1.843485,-1.0,68.5,-1.0,-1.0,3.046062
106697,3.999804,3.997108,11.4,-1.0,7.0,15.9,3.875496


In [39]:
# Display the target labels.
y

0             Modern Times
1             Modern Times
2         Contemporary Era
3             Modern Times
4             Modern Times
                ...       
106694    Contemporary Era
106695           Antiquity
106696    Contemporary Era
106697           Antiquity
106698           Antiquity
Name: Historical Period, Length: 106699, dtype: object

In [40]:
# Underlying NumPy array of the encoded frame.
X.values

array([[1.9660783101003854, 1.656506447831184, 296.5, ..., 261.6, -1.0,
        1.854214876033058],
       [2.295774647887324, 2.0, 12.07, ..., 11.43, -1.0,
        2.9278826838087584],
       [1.9814665592264302, 1.8719351188230857, -1.0, ..., -1.0, 105.5,
        1.6666666666666667],
       ...,
       [2.3928881179531656, 1.8434845472618833, -1.0, ..., -1.0, -1.0,
        3.046061515378845],
       [3.9998039600078417, 3.99710843373494, 11.4, ..., 7.0, 15.9,
        3.8754961630060865],
       [3.9998039600078417, 4.0, 19.1, ..., 14.0, -1.0,
        3.8754961630060865]], dtype=object)

In [41]:
pip install ramp-workflow




In [2]:
# Load a second CSV; presumably the training split, judging by the file name (TODO confirm).
a = pd.read_csv("data/museum_data_train.csv")

In [3]:
# Check dtype and non-null count of 'Medium' in this file.
a[['Medium']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40180 entries, 0 to 40179
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Medium  40180 non-null  object
dtypes: object(1)
memory usage: 314.0+ KB
