In [1]:
#from kaggle.api.kaggle_api_extended import KaggleApi
import pandas as pd
import matplotlib.pyplot as plt

<div>
    <h5>Download dataset (first time access)</h5>
    <p>After the first run, keep the Kaggle download lines below commented out so the dataset is not downloaded again.</p>
</div>

In [5]:
# One-time dataset download. Every line is currently disabled; to run it, also
# uncomment the KaggleApi import in the first cell of the notebook.
# create instance of KaggleApi
#api = KaggleApi()
# authentication with api-key
#api.authenticate()
# download the dataset into ./data/ and unzip it
#api.dataset_download_files('imakash3011/customer-personality-analysis', path='./data/', unzip=True)

In [2]:
# Load the campaign dataset; the file is tab-delimited despite its .csv extension.
data = pd.read_csv('data/marketing_campaign.csv', delimiter="\t")

In [3]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


In [4]:
# Structural overview of the frame: row count, column names,
# non-null count per column, and each column's dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [3]:
# Names of every object-dtype column, gathered into a plain Python list.
categorical_colnames = list(data.select_dtypes(include=['object']).columns)
categorical_colnames

['Education', 'Marital_Status', 'Dt_Customer']

In [4]:
# Exclude 'Dt_Customer' from the categorical list: it holds the dates on which
# customers joined, not categories.
# The list is rebuilt with a filter instead of calling list.remove(), because
# remove() raises ValueError when this cell is re-run and the entry is already gone.
categorical_colnames = [name for name in categorical_colnames if name != 'Dt_Customer']

In [5]:
# Print the frequency of each distinct value, one categorical column at a time,
# with a blank line separating consecutive columns.
for colname in categorical_colnames:
    counts = data[colname].value_counts()
    print(counts, end="\n\n")

Education
Graduation    1127
PhD            486
Master         370
2n Cycle       203
Basic           54
Name: count, dtype: int64

Marital_Status
Married     864
Together    580
Single      480
Divorced    232
Widow        77
Alone         3
Absurd        2
YOLO          2
Name: count, dtype: int64



In [6]:
# Numeric columns of interest, grouped by theme for readability.
num_columns = [
    # household and engagement
    'Income', 'Kidhome', 'Teenhome', 'Recency',
    # amount spent per product category
    'MntWines', 'MntFruits', 'MntMeatProducts',
    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
    # purchase counts per channel
    'NumDealsPurchases', 'NumWebPurchases',
    'NumCatalogPurchases', 'NumStorePurchases',
    # web activity
    'NumWebVisitsMonth',
]

In [44]:
# Summary statistics for the numeric columns listed in num_columns, with the
# "count", "25%" and "75%" rows removed from the describe() table.
# Currently disabled; uncomment the line below to display it.
#data[num_columns].describe().drop(["count", "25%", "75%"])

### 2. Data Cleaning:

1. Missing values - the Income column has missing values; the handful of affected rows will be dropped.
2. Incorrect data types - dates are stored as strings; they will be converted to datetime format.
3. Categorical features - these will be encoded in numerical form.

#### 2.1 Missing values

In [7]:
# Drop the rows whose Income is missing (the only column reported with nulls).
# Restricting dropna() to 'Income' states the intent explicitly and stops the
# cell from silently discarding rows if another column ever contains NaN.
data = data.dropna(subset=['Income'])

In [8]:
# Report how many observations remain in the frame.
print(f"total rows in dataframe now: {data.shape[0]}")

total rows in dataframe now: 2216


#### 2.2 Date of Joining column

In [9]:
# Inspect the Python type of one stored join-date value.
# Positional access (.iloc) is used instead of a label lookup: rows were dropped
# earlier without resetting the index, so a given integer label may not exist.
type(data['Dt_Customer'].iloc[0])

str

In [10]:
# Parse the day-month-year strings and store the result back as the column.
parsed_dates = pd.to_datetime(data['Dt_Customer'], format='%d-%m-%Y')
data['Dt_Customer'] = parsed_dates

In [11]:
# Summary of the enrollment dates (count, mean, min, quartiles, max).
data.Dt_Customer.describe()

count                             2216
mean     2013-07-10 11:29:27.509025280
min                2012-07-30 00:00:00
25%                2013-01-16 00:00:00
50%                2013-07-08 12:00:00
75%                2013-12-31 00:00:00
max                2014-06-29 00:00:00
Name: Dt_Customer, dtype: object

In [12]:
# Empty container for the enrollment dates collected in the next cell.
dates = list()

In [13]:
# Calendar date (time-of-day dropped) of every enrollment timestamp.
# The list is rebuilt from the column on each run rather than appended to, so
# re-executing this cell cannot leave a second copy of the dates in `dates`.
dates = data['Dt_Customer'].dt.date.tolist()
#Earliest and most recent date in the dataset
print("Newest date of customer enrollment:", max(dates))
print("Oldest date of customer enrollment:", min(dates))

Newest date of customer enrollment: 2014-06-29
Oldest date of customer enrollment: 2012-07-30


Now, I'll create a new feature <b>"Customer_For"</b> from <b>"Dt_Customer"</b>: the number of days each customer has been shopping at this store, measured relative to the most recent enrollment date in the dataset.

In [14]:
# Most recent enrollment date in the dataset; later cells use it as the
# reference point for customer tenure and for the reference year of the age calculation.
recent_date = max(dates)

In [15]:
#Creating "Customer_for" features
#customer-for-xxx-number-of-days

# Tenure per customer: most recent enrollment date minus the customer's own
# enrollment date. Subtracting two dates yields a timedelta, not a date.
days = [recent_date - joined for joined in dates]

# Show the first two entries to illustrate the element type
print("Number of days is of type datetime right now: ", days[:2])

Number of days is of type datetime right now:  [datetime.timedelta(days=663), datetime.timedelta(days=113)]


In [16]:
# Attach the tenure timedeltas as a new column ...
data['Customer_For'] = days
# ... then reduce each timedelta to its whole-day count and make sure the
# resulting column is numeric.
data['Customer_For'] = pd.to_numeric(data['Customer_For'].dt.days)

In [17]:
# Spot-check the new tenure feature against the enrollment date for three customers.
data.head(3)[['ID', 'Dt_Customer', 'Customer_For']]

Unnamed: 0,ID,Dt_Customer,Customer_For
0,5524,2012-09-04,663
1,2174,2014-03-08,113
2,4141,2013-08-21,312


#### 2.3 Create New Features

##### A. Age of customers

In [27]:
# Check the type of the reference date's year component before using it in arithmetic.
type(recent_date.year)

int

In [28]:
# Age is measured against the year of the most recent enrollment date (2014).
recent_year = recent_date.year
age = [recent_year - birth_year for birth_year in data['Year_Birth']]

In [30]:
# Store the ages as a new column and report the extremes as a sanity check.
data['customer_age'] = age
print("maximum age", data['customer_age'].max())
print("minimum age", data['customer_age'].min())

maximum age 121
minimum age 18


A maximum age of 121 is implausible and suggests that some birth years were recorded incorrectly.  
Let's have a look at the age distribution of all customers.  
These discrepancies will be corrected in upcoming sections.

In [32]:
# Distribution summary of the derived age feature.
data.customer_age.describe()

count    2216.000000
mean       45.179603
std        11.985554
min        18.000000
25%        37.000000
50%        44.000000
75%        55.000000
max       121.000000
Name: customer_age, dtype: float64

##### B. Total amount spent by each customer

In [33]:
# List every column currently in the frame, including the engineered
# 'Customer_For' and 'customer_age' features added above.
data.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Customer_For', 'customer_age'],
      dtype='object')