# Predicting The Missing Loyalty Scores

import packages

In [146]:
import pandas as pd
import pickle

import customers for scoring

In [147]:
# Load the customers that still need a loyalty score.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle as soon as the object is read.
with open('Saved_files/Groc_regression_scoring.p', 'rb') as scoring_file:
    to_be_scored = pickle.load(scoring_file)
to_be_scored.head()

Unnamed: 0,customer_id,distance_from_store,gender,credit_score,total_sales,total_items,transaction_count,product_area_count,avg_basket_value
6,1,4.78,F,0.66,3980.49,424,51,5,78.048824
7,120,3.49,F,0.38,2887.2,253,45,5,64.16
8,52,14.91,F,0.68,3342.75,335,47,5,71.12234
10,435,0.25,M,0.62,2326.71,267,48,5,48.473125
12,679,4.74,F,0.58,3448.59,370,49,5,70.379388


import model and model objects

In [148]:
# Load the saved regression model.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle as soon as the object is read.
with open('Saved_files/Groc_forest_reg_model.p', 'rb') as model_file:
    regressor = pickle.load(model_file)
print(regressor)

RandomForestRegressor(random_state=42)


In [149]:
# Load the saved one-hot encoder that accompanies the model.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle as soon as the object is read.
with open('Saved_files/Groc_forest_reg_ohe.p', 'rb') as encoder_file:
    one_hot_encoder = pickle.load(encoder_file)
print(one_hot_encoder)

OneHotEncoder(drop='first', sparse=False, sparse_output=False)


drop missing values

In [150]:
# Count the missing values in each column
to_be_scored.isna().sum()

customer_id            0
distance_from_store    3
gender                 2
credit_score           6
total_sales            0
total_items            0
transaction_count      0
product_area_count     0
avg_basket_value       0
dtype: int64

In [151]:
# Remove every row that has at least one missing value.
# Reassigning the result (rather than inplace=True) keeps the step explicit
# and chainable.
to_be_scored = to_be_scored.dropna(how='any')
to_be_scored.head()

Unnamed: 0,customer_id,distance_from_store,gender,credit_score,total_sales,total_items,transaction_count,product_area_count,avg_basket_value
6,1,4.78,F,0.66,3980.49,424,51,5,78.048824
7,120,3.49,F,0.38,2887.2,253,45,5,64.16
8,52,14.91,F,0.68,3342.75,335,47,5,71.12234
10,435,0.25,M,0.62,2326.71,267,48,5,48.473125
12,679,4.74,F,0.58,3448.59,370,49,5,70.379388


drop unused columns 

In [152]:
# Set the ids aside before dropping the column: they are not passed to the
# model, but they are re-attached to the predictions at the end of the notebook.
customer_id = to_be_scored['customer_id']
to_be_scored = to_be_scored.drop(columns=['customer_id'])
to_be_scored.head()

Unnamed: 0,distance_from_store,gender,credit_score,total_sales,total_items,transaction_count,product_area_count,avg_basket_value
6,4.78,F,0.66,3980.49,424,51,5,78.048824
7,3.49,F,0.38,2887.2,253,45,5,64.16
8,14.91,F,0.68,3342.75,335,47,5,71.12234
10,0.25,M,0.62,2326.71,267,48,5,48.473125
12,4.74,F,0.58,3448.59,370,49,5,70.379388


apply one hot encoding

In [153]:
# Columns that are passed through the one-hot encoder
categorical_vars = ["gender"]

In [154]:
# The loaded encoder is expected to be already fitted on the training data,
# so the customers being scored are only transformed here: the encoding rules
# learned during training are applied unchanged to the new data
categorical_inputs = to_be_scored[categorical_vars]
encoded_vars_array = one_hot_encoder.transform(categorical_inputs)
print(encoded_vars_array.shape)

(463, 1)


In [155]:
# Names of the encoder's output columns, used to label the encoded array
encoder_feature_names = one_hot_encoder.get_feature_names_out(categorical_vars)

In [156]:
# Label the encoded array with the encoder's feature names
encoded_vars_array_df = pd.DataFrame(encoded_vars_array,
                                     columns=encoder_feature_names)

# Reset both indexes before concatenating so rows are paired by position;
# to_be_scored still carries the non-contiguous row labels it was loaded with
to_be_scored = pd.concat(
    [to_be_scored.reset_index(drop=True),
     encoded_vars_array_df.reset_index(drop=True)],
    axis=1,
)

# The raw categorical columns are now represented by the encoded ones
to_be_scored = to_be_scored.drop(columns=categorical_vars)
print(to_be_scored.shape)
print(to_be_scored.head())

(463, 8)
   distance_from_store  credit_score  total_sales  total_items  \
0                 4.78          0.66      3980.49          424   
1                 3.49          0.38      2887.20          253   
2                14.91          0.68      3342.75          335   
3                 0.25          0.62      2326.71          267   
4                 4.74          0.58      3448.59          370   

   transaction_count  product_area_count  avg_basket_value  gender_M  
0                 51                   5         78.048824       0.0  
1                 45                   5         64.160000       0.0  
2                 47                   5         71.122340       0.0  
3                 48                   5         48.473125       1.0  
4                 49                   5         70.379388       0.0  


make our predictions

In [157]:
# Score every remaining customer with the loaded model.
# NOTE(review): assumes the columns of to_be_scored match the features (and
# their order) the model was trained on - confirm against the training notebook
loyalty_predictions = regressor.predict(to_be_scored)
print(loyalty_predictions)

[0.42641 0.32992 0.34719 0.93166 0.3849  0.93566 0.43343 0.73502 0.31208
 0.74406 0.43255 0.51648 0.44943 0.49396 0.51378 0.35195 0.79852 0.27749
 0.9132  0.39764 0.28666 0.57642 0.45266 0.30921 0.6539  0.42941 0.5505
 0.90092 0.5734  0.32631 0.35492 0.41621 0.47523 0.56345 0.67158 0.34009
 0.33038 0.89301 0.36622 0.44413 0.85157 0.72539 0.45002 0.93642 0.94507
 0.22433 0.39676 0.96111 0.53929 0.44196 0.8682  0.86839 0.73586 0.29151
 0.60416 0.84114 0.23933 0.3014  0.24201 0.24305 0.24934 0.87329 0.86826
 0.55587 0.39386 0.57611 0.92211 0.41034 0.9446  0.25769 0.61682 0.45259
 0.64507 0.49405 0.62185 0.39843 0.16508 0.61596 0.7924  0.54604 0.28182
 0.29235 0.30199 0.71632 0.56894 0.2379  0.36931 0.28792 0.40425 0.37513
 0.39393 0.90329 0.91231 0.26916 0.38641 0.38063 0.87808 0.53341 0.31401
 0.92022 0.28421 0.22333 0.29671 0.72241 0.71109 0.44478 0.80592 0.51689
 0.29636 0.85891 0.93109 0.29279 0.27718 0.37517 0.24839 0.24201 0.21925
 0.51374 0.28497 0.88146 0.63028 0.2433  0.33895 0.8

In [158]:
# One row per prediction, in the same order as the rows that were scored
customer_to_score = pd.DataFrame({'Loyalty_scores_pred': loyalty_predictions})
customer_to_score.head()

Unnamed: 0,Loyalty_scores_pred
0,0.42641
1,0.32992
2,0.34719
3,0.93166
4,0.3849


In [159]:
# Wrap the saved customer_id values in a one-column DataFrame and discard the
# original row labels so the index restarts at 0
customer_id = pd.DataFrame(customer_id,columns=['customer_id']).reset_index(drop=True)
customer_id.head()

Unnamed: 0,customer_id
0,1
1,120
2,52
3,435
4,679


In [160]:
# Attach the ids to their predictions. Column assignment aligns on the index,
# so a row-count mismatch would silently produce NaN ids - fail loudly instead.
assert len(customer_to_score) == len(customer_id), \
    "predictions and customer ids have different row counts"
# Select the column explicitly rather than assigning a whole DataFrame to one column
customer_to_score["customer_id"] = customer_id["customer_id"]
customer_to_score.head(9)

Unnamed: 0,Loyalty_scores_pred,customer_id
0,0.42641,1
1,0.32992,120
2,0.34719,52
3,0.93166,435
4,0.3849,679
5,0.93566,182
6,0.43343,426
7,0.73502,560
8,0.31208,476
