## Direct Marketing without using Privacy-preserving techniques

### Import all necessary libraries 

In [418]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Import the dataset

In [419]:
dataset=pd.read_csv("DirectMarketing.csv")
dataset.head()

Unnamed: 0,Age,Gender,OwnHome,Married,Location(km),Salary,Children,History,Catalogs,AmountSpent,Customer Segment
0,58,Female,Own,Single,12.37,47500,0,High,6,755,Medium value
1,39,Male,Rent,Single,5.37,63600,0,High,6,1318,Medium value
2,26,Female,Rent,Single,3.02,13500,0,Low,18,296,Low value
3,54,Male,Own,Married,6.53,85600,1,High,18,2436,High value
4,37,Female,Own,Single,3.04,68400,0,High,12,1304,Medium value


Removing rows that have NaN values 

In [420]:
dataset.dropna(axis=0)

Unnamed: 0,Age,Gender,OwnHome,Married,Location(km),Salary,Children,History,Catalogs,AmountSpent,Customer Segment
0,58,Female,Own,Single,12.37,47500,0,High,6,755,Medium value
1,39,Male,Rent,Single,5.37,63600,0,High,6,1318,Medium value
2,26,Female,Rent,Single,3.02,13500,0,Low,18,296,Low value
3,54,Male,Own,Married,6.53,85600,1,High,18,2436,High value
4,37,Female,Own,Single,3.04,68400,0,High,12,1304,Medium value
...,...,...,...,...,...,...,...,...,...,...,...
991,69,Female,Rent,Single,15.20,11700,0,Low,18,540,Medium value
993,54,Female,Own,Married,15.13,99200,0,High,24,5503,High value
997,60,Male,Own,Single,7.35,44800,0,Medium,24,1417,Medium value
998,54,Male,Own,Married,3.67,79000,2,Medium,18,671,Medium value


In [421]:
dataset.columns
datasetColumns=dataset.columns

### Splitting the data into a set of predictors and responders 

In [422]:
x_columns=['Age', 'Gender', 'OwnHome', 'Married', 'Location(km)', 'Salary','Children', 'History', 'Catalogs', 'AmountSpent']
y_columns=['Customer Segment']
x_dataset = dataset[x_columns]
y_dataset = dataset[y_columns]

In [423]:
print(x_dataset[['Age','Gender', 'OwnHome', 'Married','History']])

     Age  Gender OwnHome  Married History
0     58  Female     Own   Single    High
1     39    Male    Rent   Single    High
2     26  Female    Rent   Single     Low
3     54    Male     Own  Married    High
4     37  Female     Own   Single    High
..   ...     ...     ...      ...     ...
995   21  Female    Rent   Single     NaN
996   43    Male    Rent   Single     NaN
997   60    Male     Own   Single  Medium
998   54    Male     Own  Married  Medium
999   19    Male    Rent  Married  Medium

[1000 rows x 5 columns]


In [424]:
x_datasetColumns=x_dataset.columns
y_datasetColumns=y_dataset.columns

In [425]:
categoricalData=x_dataset['Gender']
encodeGender=pd.get_dummies(x_dataset['Gender'])
encodeHome=pd.get_dummies(x_dataset['OwnHome'])
encodeMarried=pd.get_dummies(x_dataset['Married'])
encodeHistory=pd.get_dummies(x_dataset['History'])



In [426]:
decodeGender=encodeGender.idxmax(1)

In [427]:
print(decodeGender)

0      Female
1        Male
2      Female
3        Male
4      Female
        ...  
995    Female
996      Male
997      Male
998      Male
999      Male
Length: 1000, dtype: object


In [428]:
encodedData=pd.concat([encodeGender,encodeHome,encodeHistory,encodeMarried,x_dataset[['Age', 'Location(km)', 'Salary','Children','Catalogs', 'AmountSpent']]],axis=1)
print(encodedData)

     Female  Male  Own  Rent  High  Low  Medium  Married  Single  Age  \
0         1     0    1     0     1    0       0        0       1   58   
1         0     1    0     1     1    0       0        0       1   39   
2         1     0    0     1     0    1       0        0       1   26   
3         0     1    1     0     1    0       0        1       0   54   
4         1     0    1     0     1    0       0        0       1   37   
..      ...   ...  ...   ...   ...  ...     ...      ...     ...  ...   
995       1     0    0     1     0    0       0        0       1   21   
996       0     1    0     1     0    0       0        0       1   43   
997       0     1    1     0     0    0       1        0       1   60   
998       0     1    1     0     0    0       1        1       0   54   
999       0     1    0     1     0    0       1        1       0   19   

     Location(km)  Salary  Children  Catalogs  AmountSpent  
0           12.37   47500         0         6          755  
1

In [429]:
x_datasetColumns=encodedData.columns

In [430]:
print(x_datasetColumns)

Index(['Female', 'Male', 'Own', 'Rent', 'High', 'Low', 'Medium', 'Married',
       'Single', 'Age', 'Location(km)', 'Salary', 'Children', 'Catalogs',
       'AmountSpent'],
      dtype='object')


In [431]:
x_dataset=encodedData.iloc[:,:].values
y_dataset =y_dataset.iloc[:,:].values

### Changing categorical values into numerical values

In [432]:
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
y_dataset=lb.fit_transform(y_dataset)



  y = column_or_1d(y, warn=True)


### Splitting the data into train dataset and test dataset

In [433]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x_dataset,y_dataset,test_size=0.2, random_state=1)

In [434]:
print(x_dataset[0,:])

[1.000e+00 0.000e+00 1.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00
 0.000e+00 1.000e+00 5.800e+01 1.237e+01 4.750e+04 0.000e+00 6.000e+00
 7.550e+02]


### Feature Scaling of the variables

In [435]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train[:,7:]=sc.fit_transform(x_train[:,7:])
x_test[:,7:]=sc.transform(x_test[:,7:])

In [436]:
print(x_train[1,:])

[ 0.          1.          0.          1.          0.          0.
  0.         -0.99252792  0.99252792 -1.06014183 -0.93209764 -1.25022859
 -0.88168494 -1.30249089 -0.97240374]


### Implementing the model (Random Forest classification models)

In [437]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)

In [438]:
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score,precision_score, recall_score
y_pred = classifier.predict(x_test)
accuracy=accuracy_score(y_test, y_pred)
print(accuracy)
precision=precision_score(y_test, y_pred,average='macro')
recall=recall_score(y_test,y_pred,average='macro')
accuracy_measures=pd.DataFrame([accuracy,precision,recall],columns=["Accuracy Measures"],index=["accuracy score","precision score","recall score"])
accuracy_measures.to_csv('Accuracy metrics not using preserving privacy techniques in Advanced analytics.csv')


0.98


In [439]:
print(precision)

0.9863945578231292


In [440]:
print(recall)

0.9746986513903807


In [441]:
x_test[:,7:]=sc.inverse_transform(x_test[:,7:])
y_test=lb.inverse_transform(y_test)
y_pred=lb.inverse_transform(y_pred)




In [442]:
y_test=pd.DataFrame(y_test,columns=["Actual Customer segments"])
y_pred=pd.DataFrame(y_pred,columns=["Predicted Customer segments"])

In [443]:
print(y_test)

    Actual Customer segments
0               Medium value
1                 High value
2               Medium value
3               Medium value
4               Medium value
..                       ...
195               High value
196             Medium value
197             Medium value
198                Low value
199                Low value

[200 rows x 1 columns]


In [444]:
y_set=pd.concat([y_test,y_pred],axis=1)

In [445]:
print(y_set.head())

  Actual Customer segments Predicted Customer segments
0             Medium value                Medium value
1               High value                  High value
2             Medium value                Medium value
3             Medium value                Medium value
4             Medium value                Medium value


In [446]:
tableData =pd.DataFrame(x_test,columns=x_datasetColumns) 
print(tableData)

     Female  Male  Own  Rent  High  Low  Medium  Married  Single   Age  \
0       0.0   1.0  0.0   1.0   0.0  0.0     0.0      0.0     1.0  49.0   
1       1.0   0.0  0.0   1.0   0.0  0.0     0.0      1.0     0.0  48.0   
2       0.0   1.0  1.0   0.0   1.0  0.0     0.0      0.0     1.0  40.0   
3       1.0   0.0  0.0   1.0   0.0  0.0     1.0      1.0     0.0  44.0   
4       0.0   1.0  1.0   0.0   1.0  0.0     0.0      1.0     0.0  66.0   
..      ...   ...  ...   ...   ...  ...     ...      ...     ...   ...   
195     0.0   1.0  1.0   0.0   1.0  0.0     0.0      1.0     0.0  64.0   
196     0.0   1.0  1.0   0.0   0.0  0.0     1.0      1.0     0.0  51.0   
197     1.0   0.0  0.0   1.0   0.0  0.0     1.0      1.0     0.0  56.0   
198     0.0   1.0  1.0   0.0   0.0  1.0     0.0      1.0     0.0  42.0   
199     1.0   0.0  0.0   1.0   0.0  0.0     0.0      0.0     1.0  27.0   

     Location(km)    Salary  Children  Catalogs  AmountSpent  
0            3.70   52400.0       0.0      18.0 

In [447]:
tableData["Gender"]=tableData[["Female","Male"]].idxmax(1)
tableData["OwnHome"]=tableData[["Own","Rent"]].idxmax(1)
tableData["History"]=tableData[["High","Medium","Low"]].idxmax(1)
tableData["Married"]=tableData[["Married","Single"]].idxmax(1)
tableData=tableData.drop(["Female","Male","Own","Rent","High","Low","Medium","Single"],axis=1)
tableData=tableData.reindex(columns=x_columns)
tableData=pd.concat([tableData,y_set],axis=1)

In [448]:
print(tableData.head())

    Age  Gender OwnHome  Married  Location(km)    Salary  Children History  \
0  49.0    Male    Rent   Single          3.70   52400.0       0.0    High   
1  48.0  Female    Rent  Married         18.74   92400.0       1.0    High   
2  40.0    Male     Own   Single         11.96   50100.0       1.0    High   
3  44.0  Female    Rent  Married          6.29   46700.0       0.0  Medium   
4  66.0    Male     Own  Married          5.75  107400.0       0.0    High   

   Catalogs  AmountSpent Actual Customer segments Predicted Customer segments  
0      18.0        857.0             Medium value                Medium value  
1       6.0       2191.0               High value                  High value  
2      18.0       1071.0             Medium value                Medium value  
3      24.0        983.0             Medium value                Medium value  
4      12.0       1485.0             Medium value                Medium value  


In [449]:
tableData.to_csv('Advanced analytics without preserving privacy techniques.csv', index=False)