In [69]:
import pandas as pd
import numpy as np
import datetime as dt
import calendar

In [70]:
CTA_rfm = pd.read_csv('CTA_rfm_allinfo.csv')

In [71]:
CTA_rfm.head(1)

Unnamed: 0.1,Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,...,property_valuation,recency,frequency,monetary,R,F,M,RFMClass,RFMscore,RFM_loyalty_level
0,0,1,2,2950,2017-02-25,False,Approved,Solex,Standard,medium,...,6,76,3,1953.15,3,4,4,344,11,Bronze


In [72]:
old_customers = CTA_rfm[['gender','past_3_years_bike_related_purchases','job_industry_category','wealth_segment','owns_car','tenure','Age','property_valuation','RFM_loyalty_level']]

In [73]:
old_customers = old_customers[old_customers.gender != 'Unspecified']

### Feature Engineering - Old customers dataset 

In [74]:
old_customers.shape

(19122, 9)

In [75]:
old_customers.dtypes

gender                                  object
past_3_years_bike_related_purchases      int64
job_industry_category                   object
wealth_segment                          object
owns_car                                object
tenure                                 float64
Age                                      int64
property_valuation                       int64
RFM_loyalty_level                       object
dtype: object

In [76]:
# using one hot coding change gender column into binary
gender = old_customers[['gender']]
gender = pd.get_dummies(gender,drop_first=True)
gender.head()

Unnamed: 0,gender_Male
0,1
1,1
2,1
3,0
4,0


In [77]:
# using one hot coding change job_industry_category column into binary
job_industry_category = old_customers[['job_industry_category']]
job_industry_category = pd.get_dummies(job_industry_category,drop_first=True)
job_industry_category.head()

Unnamed: 0,job_industry_category_Entertainment,job_industry_category_Financial Services,job_industry_category_Health,job_industry_category_IT,job_industry_category_Manufacturing,job_industry_category_Property,job_industry_category_Retail,job_industry_category_Telecommunications
0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0,0


In [78]:
# using one hot coding change owns_car column into binary
owns_car = old_customers[['owns_car']]
owns_car = pd.get_dummies(owns_car,drop_first=True)
owns_car.head()

Unnamed: 0,owns_car_Yes
0,1
1,1
2,1
3,1
4,1


In [79]:
# using label encoder change wealth_segment column into binary since it is an ordinal category column
from sklearn.preprocessing import LabelEncoder
old_customers['wealth_segment']=LabelEncoder().fit_transform(old_customers['wealth_segment'])

In [80]:
old_customers1=old_customers[['past_3_years_bike_related_purchases','tenure','Age','property_valuation','wealth_segment']]

In [81]:
#Concatenating transformed categorical columns with the old_customers dataframe
old_customers1=pd.concat([gender,job_industry_category,owns_car,old_customers1],axis=1)

In [82]:
old_customers1.shape

(19122, 15)

In [83]:
old_customers1.head(1)

Unnamed: 0,gender_Male,job_industry_category_Entertainment,job_industry_category_Financial Services,job_industry_category_Health,job_industry_category_IT,job_industry_category_Manufacturing,job_industry_category_Property,job_industry_category_Retail,job_industry_category_Telecommunications,owns_car_Yes,past_3_years_bike_related_purchases,tenure,Age,property_valuation,wealth_segment
0,1,0,1,0,0,0,0,0,0,1,19,10.0,66,6,2


### New customers dataset

In [84]:
new_customers=pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name = 'NewCustomerList')

In [85]:
new_customers.head(1)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,...,QLD,Australia,6,0.96,1.2,1.5,1.275,1,1,1.71875


In [86]:
for i in ['Unnamed: 16','Unnamed: 17','Unnamed: 18','Unnamed: 19','Unnamed: 20']:
    new_customers.drop(i,axis=1,inplace=True)

In [87]:
new_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   first_name                           1000 non-null   object        
 1   last_name                            971 non-null    object        
 2   gender                               1000 non-null   object        
 3   past_3_years_bike_related_purchases  1000 non-null   int64         
 4   DOB                                  983 non-null    datetime64[ns]
 5   job_title                            894 non-null    object        
 6   job_industry_category                835 non-null    object        
 7   wealth_segment                       1000 non-null   object        
 8   deceased_indicator                   1000 non-null   object        
 9   owns_car                             1000 non-null   object        
 10  tenure       

In [88]:
new_customers.duplicated().sum()

0

In [89]:
cat_col = [col for col in new_customers.columns if new_customers[col].dtype=='object']
cat_col

['first_name',
 'last_name',
 'gender',
 'job_title',
 'job_industry_category',
 'wealth_segment',
 'deceased_indicator',
 'owns_car',
 'address',
 'state',
 'country']

In [90]:
for col in cat_col:
    print(col)
    print(new_customers[col].value_counts())
    print('*****')

first_name
Rozamond     3
Dorian       3
Mandie       3
Tyne         2
Kippar       2
            ..
Gardiner     1
Cecil        1
Brigg        1
Rosemonde    1
Berenice     1
Name: first_name, Length: 940, dtype: int64
*****
last_name
Sissel      2
Hallt       2
Eade        2
Borsi       2
Crellim     2
           ..
Mangion     1
Whitwell    1
Marrow      1
Kibble      1
Kybbye      1
Name: last_name, Length: 961, dtype: int64
*****
gender
Female    513
Male      470
U          17
Name: gender, dtype: int64
*****
job_title
Associate Professor             15
Environmental Tech              14
Software Consultant             14
Chief Design Engineer           13
Assistant Manager               12
                                ..
Computer Systems Analyst III     1
Automation Specialist IV         1
Human Resources Assistant IV     1
Budget/Accounting Analyst I      1
Safety Technician II             1
Name: job_title, Length: 184, dtype: int64
*****
job_industry_category
Financial Ser

### Features Engineering - New customers dataset

In [91]:
count_missing_new_customers = new_customers.isnull().sum()
percent_missing_new_customers = round(new_customers.isnull().sum()/len(new_customers) * 100, 1)
missing_new_customers = pd.concat([count_missing_new_customers, percent_missing_new_customers], axis = 1)
missing_new_customers.columns = ["Missing (count)", "Missing (%)"]
missing_new_customers

Unnamed: 0,Missing (count),Missing (%)
first_name,0,0.0
last_name,29,2.9
gender,0,0.0
past_3_years_bike_related_purchases,0,0.0
DOB,17,1.7
job_title,106,10.6
job_industry_category,165,16.5
wealth_segment,0,0.0
deceased_indicator,0,0.0
owns_car,0,0.0


In [92]:
# we will fill in the job_title,job_industry_category column with the mode value in the columns
new_customers['job_title'] = new_customers['job_title'].fillna(new_customers['job_title'].mode()[0])
new_customers['job_industry_category'] = new_customers['job_industry_category'].fillna(new_customers['job_industry_category'].mode()[0])

In [93]:
new_customers['DOB'].dropna(inplace=True)
new_customers['last_name'].dropna(inplace=True)

In [94]:
new_customers = new_customers[new_customers.gender != 'U']

In [95]:
def from_dob_to_age(i):
    today = dt.date.today()
    return today.year - i.year - ((today.month,today.day) < (i.month,i.day))

In [96]:
new_customers['Age'] = new_customers['DOB'].apply(lambda x: from_dob_to_age(x))

In [97]:
new_customers.head(1)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value,Age
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.71875,64


In [98]:
# change categorical data columns into binary by one hot coding
gender_new = new_customers[['gender']]
gender_new = pd.get_dummies(gender_new,drop_first=True)
gender_new.head()

Unnamed: 0,gender_Male
0,1
1,1
2,0
3,0
4,0


In [99]:
job_industry_category_new = new_customers[['job_industry_category']]
job_industry_category_new = pd.get_dummies(job_industry_category_new,drop_first=True)
job_industry_category_new.head()

Unnamed: 0,job_industry_category_Entertainment,job_industry_category_Financial Services,job_industry_category_Health,job_industry_category_IT,job_industry_category_Manufacturing,job_industry_category_Property,job_industry_category_Retail,job_industry_category_Telecommunications
0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0


In [100]:
owns_car_new = new_customers[['owns_car']]
owns_car_new = pd.get_dummies(owns_car_new,drop_first=True)
owns_car_new.head()

Unnamed: 0,owns_car_Yes
0,1
1,0
2,0
3,1
4,0


In [101]:
#Transforming using label_encoder
from sklearn.preprocessing import LabelEncoder
new_customers['wealth_segment']=LabelEncoder().fit_transform(new_customers['wealth_segment'])

In [102]:
new_customers1 = new_customers[['past_3_years_bike_related_purchases','tenure','Age','property_valuation','wealth_segment']]

In [103]:
new_customers1 = pd.concat([gender_new,job_industry_category_new,owns_car_new,new_customers1],axis=1)

In [104]:
old_customers1.shape

(19122, 15)

In [105]:
new_customers1.shape

(983, 15)

### Model Building

In [106]:
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(old_customers1,
                                                                            old_customers['RFM_loyalty_level'],
                                                                            test_size= 0.25, random_state=10,)

##### Decision Tree

In [107]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=10)
tree.fit(train_features,train_labels)

DecisionTreeClassifier(random_state=10)

In [108]:
pred_labels_tree = tree.predict(test_features)

In [110]:
from sklearn.metrics import classification_report
class_rep_tree = classification_report(test_labels,pred_labels_tree)
print('Decision Tree: \n', class_rep_tree)

Decision Tree: 
               precision    recall  f1-score   support

      Bronze       1.00      0.94      0.97       487
        Gold       1.00      1.00      1.00      1772
    Platinum       0.99      1.00      1.00      1781
      Silver       0.98      0.99      0.98       741

    accuracy                           0.99      4781
   macro avg       0.99      0.98      0.99      4781
weighted avg       0.99      0.99      0.99      4781



##### Random Forest

In [111]:
from sklearn.ensemble import RandomForestClassifier
rs = RandomForestClassifier()
rs.fit(train_features, train_labels)

RandomForestClassifier()

In [112]:
pred_labels_rs = rs.predict(test_features)

In [114]:
class_rep_rs = classification_report(test_labels, pred_labels_rs)

In [115]:
print('RandomForestClassifier: \n', class_rep_rs)

RandomForestClassifier: 
               precision    recall  f1-score   support

      Bronze       1.00      0.93      0.96       487
        Gold       0.99      1.00      0.99      1772
    Platinum       0.99      1.00      1.00      1781
      Silver       0.99      0.99      0.99       741

    accuracy                           0.99      4781
   macro avg       0.99      0.98      0.99      4781
weighted avg       0.99      0.99      0.99      4781



##### Predict new segments on new data

In [116]:
output_label = tree.predict(new_customers1)

In [117]:
new_customers['RFM_segments_predicted'] = output_label.tolist()

In [118]:
new_customers[['first_name','last_name','gender','RFM_segments_predicted']]

Unnamed: 0,first_name,last_name,gender,RFM_segments_predicted
0,Chickie,Brister,Male,Bronze
1,Morly,Genery,Male,Bronze
2,Ardelis,Forrester,Female,Bronze
3,Lucine,Stutt,Female,Gold
4,Melinda,Hadlee,Female,Silver
...,...,...,...,...
995,Ferdinand,Romanetti,Male,Platinum
996,Burk,Wortley,Male,Bronze
997,Melloney,Temby,Female,Gold
998,Dickie,Cubbini,Male,Platinum


In [120]:
new_customers.to_excel('predicted_new_customers.xlsx')