# Prediction

In [8]:
import pandas as pd
import joblib

# Suppress warnings (e.g. library upgrade/deprecation notices).
# NOTE: the "ignore" filter below has no category or module restriction, so it
# silences every warning raised in the cells that follow.
import warnings
warnings.filterwarnings("ignore")

In [17]:
# Configuration constants, listed in the order the notebook uses them.

# CSV file read into the working DataFrame.
DATA_PATH = "data/df_test_no_label.csv"

# Columns dropped from the DataFrame immediately after it is read.
COLS_TO_REMOVE = ["Unnamed: 0"]

# joblib artifact holding the data-preparation pipeline.
PREPARATION_PATH = "outputs/data_pipeline.sav"

# joblib artifact holding the classifier.
MODEL_PATH = "outputs/model.sav"

# Destination CSV for the predicted class probabilities.
OUTPUT_PATH = "data/predictions.csv"

In [6]:
# Read the CSV at DATA_PATH, discard the attributes that add no information to
# the model, and index the rows by their `id` column.
df = (
    pd.read_csv(DATA_PATH)
    .drop(columns=COLS_TO_REMOVE)
    .set_index('id')
)

# Show the first rows transposed, so each attribute is printed on its own line.
df.head().T

id,oib8mib64c,6ooe791roo,vuc45jn3n6,5hlqns1q9f,u3ypbvdw3z
date_account_created,2014-04-10,2013-10-29,2013-05-05,2013-11-20,2014-04-18
timestamp_first_active,20140410030115,20131029052221,20130505201711,20131120064542,20140418194219
gender,-unknown-,-unknown-,-unknown-,FEMALE,-unknown-
age,,,,,
signup_method,basic,basic,basic,basic,basic
signup_flow,25,23,0,0,0
language,en,en,en,en,en
affiliate_channel,direct,direct,sem-non-brand,direct,direct
affiliate_provider,direct,direct,google,direct,direct
first_affiliate_tracked,untracked,untracked,omg,untracked,linked


# Preprocessing

In [20]:
# Load the persisted data-preparation pipeline and apply it to the raw frame.
# Only `transform` is called here, so the pipeline is presumably already
# fitted -- TODO confirm against the notebook that produced the artifact.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# artifacts that come from a trusted source.
data_pipeline = joblib.load(PREPARATION_PATH)
X = data_pipeline.transform(df)
# Preview the transformed features.
X.head()

Unnamed: 0,timestamp_first_active,first_active_on_creation_date,register_year,register_month,register_day,register_weekday,age,signup_flow,gender,signup_method,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,20140410000000.0,1.0,2014.0,4.0,10.0,4.0,0.391355,1.0,0.0,0.0,4.0,2.0,3.0,7.0,3.0,8.0,0.0
1,20131030000000.0,1.0,2013.0,10.0,29.0,2.0,0.391355,0.92,0.0,0.0,4.0,2.0,3.0,7.0,0.0,4.0,0.0
2,20130510000000.0,1.0,2013.0,5.0,5.0,0.0,0.391355,0.0,0.0,0.0,4.0,6.0,7.0,4.0,2.0,6.0,7.0
3,20131120000000.0,1.0,2013.0,11.0,20.0,3.0,0.391355,0.0,1.0,0.0,4.0,2.0,3.0,7.0,2.0,3.0,14.0
4,20140420000000.0,1.0,2014.0,4.0,18.0,5.0,0.391355,0.0,0.0,0.0,4.0,2.0,3.0,1.0,2.0,6.0,17.0


# Prediction

In [10]:
# Load the persisted classifier from MODEL_PATH.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# artifacts that come from a trusted source.
clf = joblib.load(MODEL_PATH)
# Bare expression: displays the model's repr (its hyper-parameters).
clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.3,
              eval_metric='mlogloss', gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=2021, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
# Predicted class label for every row of X.
y = clf.predict(X)
# Per-class probabilities for every row of X; this is what gets saved later.
y_proba = clf.predict_proba(X)
# Display the predicted labels.
y

array(['NDF', 'NDF', 'NDF', ..., 'NDF', 'NDF', 'NDF'], dtype=object)

In [12]:
# Display the probability matrix (one row per sample, one column per class).
y_proba

array([[3.6030894e-03, 1.8765760e-03, 9.4804549e-01, 4.6474881e-02],
       [1.8549454e-03, 6.6965085e-04, 9.6155745e-01, 3.5917897e-02],
       [1.2191723e-02, 5.1443330e-03, 9.3402827e-01, 4.8635606e-02],
       ...,
       [1.1864457e-02, 7.1651698e-03, 8.8746524e-01, 9.3505107e-02],
       [1.4621754e-02, 6.3226549e-03, 5.4058647e-01, 4.3846911e-01],
       [1.8085745e-03, 6.4910908e-04, 8.1655669e-01, 1.8098564e-01]],
      dtype=float32)

In [13]:
# Class labels known to the model; they are used below as the column names
# for `y_proba`, so their order matters.
clf.classes_

array(['FR', 'IT', 'NDF', 'US'], dtype=object)

# Reorganizing the data

The submission must list the class probabilities in the order defined by the dictionary below, so we reorder the columns accordingly before saving the final CSV.

{'NDF': 0, 'US': 1, 'FR': 2, 'IT': 3}

In [16]:
# Wrap the class probabilities in a frame whose columns are labelled with the
# model's class names (`clf.classes_`).
csv = pd.DataFrame(y_proba, columns=clf.classes_)

# Column order required by the submission: {'NDF': 0, 'US': 1, 'FR': 2, 'IT': 3}.
column_names = ['NDF', 'US', 'FR', 'IT']

# Select by label rather than `reindex`: `reindex` silently creates an all-NaN
# column for any label missing from `clf.classes_`, which would then be written
# to the submission file. Label selection raises a KeyError naming the missing
# class instead, and yields the same frame when every label is present.
csv = csv[column_names]

# Preview the reordered probabilities.
csv.head()

Unnamed: 0,NDF,US,FR,IT
0,0.948045,0.046475,0.003603,0.001877
1,0.961557,0.035918,0.001855,0.00067
2,0.934028,0.048636,0.012192,0.005144
3,0.757481,0.222916,0.016449,0.003154
4,0.880875,0.099814,0.011365,0.007946


In [19]:
# Write the probabilities to OUTPUT_PATH as bare values: no index column and
# no header row, so the column order alone identifies each class.
csv.to_csv(OUTPUT_PATH, index=False, header=False)