Customer Relationship Management (CRM) is a key element of modern marketing strategies. The KDD Cup 2009 offers the opportunity to work on large marketing databases from the French Telecom company Orange to predict the propensity of customers to switch provider (churn), buy new products or services (appetency), or buy upgrades or add-ons proposed to them to make the sale more profitable (up-selling).


The most practical way, in a CRM system, to build knowledge on customer is to produce scores. A score (the output of a model) is an evaluation for all instances of a target variable to explain (i.e. churn, appetency or up-selling). Tools which produce scores allow to project, on a given population, quantifiable information. The score is computed using input variables which describe instances. Scores are then used by the information system (IS), for example, to personalize the customer relationship. An industrial customer analysis platform able to build prediction models with a very large number of input variables has been developed by Orange Labs. This platform implements several processing methods for instances and variables selection, prediction and indexation based on an efficient model combined with variable selection regularization and model averaging method. The main characteristic of this platform is its ability to scale on very large datasets with hundreds of thousands of instances and thousands of variables. The rapid and robust detection of the variables that have most contributed to the output prediction can be a key factor in a marketing application.


More details:https://www.kdd.org/kdd-cup/view/kdd-cup-2009

# Step 1: Modeling 'Churn'

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [2]:
# Upload of data

features = pd.read_csv('Machine Learning Kaggle 1\\kdd\\orange_small_train.data', sep = '\t', na_filter = False)
outcome = pd.read_csv('Machine Learning Kaggle 1\\kdd\\orange_small_train_churn.labels', header = None)
print(features.shape)

print("We have",features.shape[0],"data, and",features.shape[1],"Columns")

(50000, 230)
We have 50000 data, and 230 Columns


In [3]:
features.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,,,,,,1526.0,7,,,,...,oslk,fXVEsaq,jySVZNlOJy,,,xb3V,RAYp,F2FyR07IdsN7I,,
1,,,,,,525.0,0,,,,...,oslk,2Kb5FSF,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,
2,,,,,,5236.0,7,,,,...,Al6ZaUT,NKv4yOc,jySVZNlOJy,,kG3k,Qu4f,02N6s8f,ib5G6X1eUxUn6,am7c,
3,,,,,,,0,,,,...,oslk,CE7uk3u,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,
4,,,,,,1029.0,7,,,,...,oslk,1J2cvxe,LM8l689qOp,,kG3k,FSa2,RAYp,F2FyR07IdsN7I,mj86,


These data were masked to security of users

In [4]:
outcome.head()

Unnamed: 0,0
0,-1
1,1
2,-1
3,-1
4,-1


In [5]:
# Return all avaliable Values 
np.unique(outcome)

array([-1,  1], dtype=int64)

In [6]:
all_variable=[]

In [7]:
# identify numerical variable and categorical variable
all_variable= np.array(features.columns)
print(all_variable)



['Var1' 'Var2' 'Var3' 'Var4' 'Var5' 'Var6' 'Var7' 'Var8' 'Var9' 'Var10'
 'Var11' 'Var12' 'Var13' 'Var14' 'Var15' 'Var16' 'Var17' 'Var18' 'Var19'
 'Var20' 'Var21' 'Var22' 'Var23' 'Var24' 'Var25' 'Var26' 'Var27' 'Var28'
 'Var29' 'Var30' 'Var31' 'Var32' 'Var33' 'Var34' 'Var35' 'Var36' 'Var37'
 'Var38' 'Var39' 'Var40' 'Var41' 'Var42' 'Var43' 'Var44' 'Var45' 'Var46'
 'Var47' 'Var48' 'Var49' 'Var50' 'Var51' 'Var52' 'Var53' 'Var54' 'Var55'
 'Var56' 'Var57' 'Var58' 'Var59' 'Var60' 'Var61' 'Var62' 'Var63' 'Var64'
 'Var65' 'Var66' 'Var67' 'Var68' 'Var69' 'Var70' 'Var71' 'Var72' 'Var73'
 'Var74' 'Var75' 'Var76' 'Var77' 'Var78' 'Var79' 'Var80' 'Var81' 'Var82'
 'Var83' 'Var84' 'Var85' 'Var86' 'Var87' 'Var88' 'Var89' 'Var90' 'Var91'
 'Var92' 'Var93' 'Var94' 'Var95' 'Var96' 'Var97' 'Var98' 'Var99' 'Var100'
 'Var101' 'Var102' 'Var103' 'Var104' 'Var105' 'Var106' 'Var107' 'Var108'
 'Var109' 'Var110' 'Var111' 'Var112' 'Var113' 'Var114' 'Var115' 'Var116'
 'Var117' 'Var118' 'Var119' 'Var120' 'Var121' 'Var1

In [8]:
# The first 190 variables are numerical
num_variable=np.array(all_variable[:190])
num_variable

array(['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8',
       'Var9', 'Var10', 'Var11', 'Var12', 'Var13', 'Var14', 'Var15',
       'Var16', 'Var17', 'Var18', 'Var19', 'Var20', 'Var21', 'Var22',
       'Var23', 'Var24', 'Var25', 'Var26', 'Var27', 'Var28', 'Var29',
       'Var30', 'Var31', 'Var32', 'Var33', 'Var34', 'Var35', 'Var36',
       'Var37', 'Var38', 'Var39', 'Var40', 'Var41', 'Var42', 'Var43',
       'Var44', 'Var45', 'Var46', 'Var47', 'Var48', 'Var49', 'Var50',
       'Var51', 'Var52', 'Var53', 'Var54', 'Var55', 'Var56', 'Var57',
       'Var58', 'Var59', 'Var60', 'Var61', 'Var62', 'Var63', 'Var64',
       'Var65', 'Var66', 'Var67', 'Var68', 'Var69', 'Var70', 'Var71',
       'Var72', 'Var73', 'Var74', 'Var75', 'Var76', 'Var77', 'Var78',
       'Var79', 'Var80', 'Var81', 'Var82', 'Var83', 'Var84', 'Var85',
       'Var86', 'Var87', 'Var88', 'Var89', 'Var90', 'Var91', 'Var92',
       'Var93', 'Var94', 'Var95', 'Var96', 'Var97', 'Var98', 'Var99',
       'Var100', 'Va

In [9]:
# The others variables are categorical
categorical_variable=np.array(all_variable[190:])
categorical_variable


array(['Var191', 'Var192', 'Var193', 'Var194', 'Var195', 'Var196',
       'Var197', 'Var198', 'Var199', 'Var200', 'Var201', 'Var202',
       'Var203', 'Var204', 'Var205', 'Var206', 'Var207', 'Var208',
       'Var209', 'Var210', 'Var211', 'Var212', 'Var213', 'Var214',
       'Var215', 'Var216', 'Var217', 'Var218', 'Var219', 'Var220',
       'Var221', 'Var222', 'Var223', 'Var224', 'Var225', 'Var226',
       'Var227', 'Var228', 'Var229', 'Var230'], dtype=object)

In [12]:
# verify the types of dataset
features.dtypes

Var1      object
Var2      object
Var3      object
Var4      object
Var5      object
           ...  
Var226    object
Var227    object
Var228    object
Var229    object
Var230    object
Length: 230, dtype: object

In [15]:
var = features.groupby('Var1').size()
var

Var1
       49298
0        380
120        1
128        2
152        1
16        81
24        46
32        23
360        1
392        1
40        10
48         6
536        1
56         5
64         1
680        1
72         3
8        138
80         1
dtype: int64

In [17]:
column_per_variable=pd.DataFrame()
for col in num_variable:
    col_count = features.groupby(col).size()
    column_per_variable=column_per_variable.append(col_count, ignore_index=True)

  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col

  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col

  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col

  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col

  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col

  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col

  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)


In [20]:
column_per_variable.shape

(190, 351788)

In [21]:
column_per_variable.dtypes

            float64
0           float64
120         float64
128         float64
152         float64
             ...   
9607.05     float64
966.15      float64
9772.021    float64
98.1        float64
9857.88     float64
Length: 351788, dtype: object

In [22]:
column_per_variable

Unnamed: 0,Unnamed: 1,0,120,128,152,16,24,32,360,392,...,8900.46,920.88,9336.06,9379.709,9441.36,9607.05,966.15,9772.021,98.1,9857.88
0,49298.0,380.0,1.0,2.0,1.0,81.0,46.0,23.0,1.0,1.0,...,,,,,,,,,,
1,48759.0,1240.0,,,,,,,,,...,,,,,,,,,,
2,48760.0,996.0,3.0,,,,1.0,,,,...,,,,,,,,,,
3,48421.0,1561.0,,,,,,,,,...,,,,,,,,,,
4,48513.0,898.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,49298.0,525.0,,,,,8.0,,,,...,,,,,,,,,,
186,49298.0,240.0,,,,15.0,12.0,3.0,,,...,,,,,,,,,,
187,48759.0,44.0,,,,,,,,,...,,,,,,,,,,
188,28978.0,,95.0,,,,2.0,,354.0,,...,,,,,,,,,,


In [23]:
#Transposta Matrix
column_per_variable.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,180,181,182,183,184,185,186,187,188,189
,49298.0,48759.0,48760.0,48421.0,48513.0,5529.0,5539.0,50000.0,49298.0,48513.0,...,5009.0,48421.0,48759.0,48759.0,50000.0,49298.0,49298.0,48759.0,28978.0,49667.0
0.0,380.0,1240.0,996.0,1561.0,898.0,976.0,13090.0,,144.0,902.0,...,41990.0,706.0,830.0,771.0,,525.0,240.0,44.0,,4.0
120.0,1.0,,3.0,,,,,,2.0,,...,,,,,,,,,95.0,
128.0,2.0,,,,,,,,1.0,,...,,,,,,,,,,
152.0,1.0,,,,,,,,1.0,,...,,,,,,,,,,


In [25]:
column_per_variable = column_per_variable.T
column_per_variable.index = column_per_variable.index.astype('str')
column_per_variable.sort_index(inplace = True)

In [27]:
column_per_variable.index[:10]

Index(['', '-10', '-1000060.0', '-10002.16', '-100034.8', '-1000420.0',
       '-1000468.0', '-1000496.0', '-1000516.0', '-1000548.0'],
      dtype='object')

In [26]:
# The same to Categorical Variable

In [28]:
column_per_variable=pd.DataFrame()
for col in categorical_variable:
    col_count = features.groupby(col).size()
    column_per_variable=column_per_variable.append(col_count, ignore_index=True)

  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col

  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)
  column_per_variable=column_per_variable.append(col_count, ignore_index=True)


In [29]:
column_per_variable = column_per_variable.T
column_per_variable.index = column_per_variable.index.astype('str')
column_per_variable.sort_index(inplace = True)
column_per_variable.index[:10]

Index(['', '000J', '0062', '00AD', '00ARusu', '00AYONy', '00AhP4J', '00J8E9a',
       '00JHTSP', '00L3'],
      dtype='object')