In [1]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Loading datasets
train = pd.read_csv('train.csv')
campaign_data = pd.read_csv('campaign_data.csv')
coupon_item_mapping = pd.read_csv('coupon_item_mapping.csv')
customer_demographics = pd.read_csv('customer_demographics.csv')
customer_transaction_data = pd.read_csv('customer_transaction_data.csv')
item_data = pd.read_csv('item_data.csv')
test = pd.read_csv('test.csv')

Performing Data Pre-Processing in each DataFrame

In [4]:
#considering train dataframe
train.sample(10)

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
50486,82787,30,325,580,0
64835,106420,8,1058,622,0
9667,15948,26,381,506,0
24367,39974,13,1059,1343,0
27779,45560,8,1049,1338,0
31125,51011,9,705,71,0
69309,113856,13,111,936,0
11272,18593,13,171,649,0
18451,30401,12,492,1418,0
4865,8154,13,22,1173,0


In [5]:
train.shape

(78369, 5)

In [6]:
train.dtypes

id                   int64
campaign_id          int64
coupon_id            int64
customer_id          int64
redemption_status    int64
dtype: object

All the ids should be of type category,

redemption_status represents 2 categories : 0 for Coupon not redeemed, 1 - Coupon redeemed, it should be of type category

In [8]:
train['campaign_id'] = train['campaign_id'].astype('category')
train['coupon_id'] = train['coupon_id'].astype('category')
train['customer_id'] = train['customer_id'].astype('category')
train['id'] = train['id'].astype('category')
train['redemption_status'] = train['redemption_status'].astype('category')

In [9]:
#to verify the change
train.dtypes

id                   category
campaign_id          category
coupon_id            category
customer_id          category
redemption_status    category
dtype: object

In [10]:
# checking for missing values
train.isnull().sum()

id                   0
campaign_id          0
coupon_id            0
customer_id          0
redemption_status    0
dtype: int64

There are no missing values in the train dataframe

In [12]:
#checking for duplicated rows
train.duplicated().sum()

0

There are no duplicate rows in the train dataframe

In [14]:
# considering campaign_data dataframe
campaign_data.sample(10)

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
16,7,Y,02/02/13,08/03/13
19,5,Y,12/01/13,15/02/13
12,12,Y,22/04/13,24/05/13
8,17,Y,29/07/13,30/08/13
24,29,Y,08/10/12,30/11/12
4,21,Y,16/09/13,18/10/13
6,18,X,10/08/13,04/10/13
23,30,X,19/11/12,04/01/13
3,23,Y,08/10/13,15/11/13
13,10,Y,08/04/13,10/05/13


In [15]:
campaign_data.shape

(28, 4)

In [16]:
campaign_data.dtypes

campaign_id       int64
campaign_type    object
start_date       object
end_date         object
dtype: object

campaign_type represents Anonymised Campaign Type (X/Y), it should be of type category.
start_date and end_date should be coverted to type datetime
campaign_id should be of type category

In [18]:
campaign_data['campaign_id'] = campaign_data['campaign_id'].astype('category')
campaign_data['campaign_type'] = campaign_data['campaign_type'].astype('category')
campaign_data['start_date'] = pd.to_datetime(campaign_data['start_date'], format='%d/%m/%y')
campaign_data['end_date'] = pd.to_datetime(campaign_data['end_date'], format='%d/%m/%y')


In [19]:
campaign_data.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,Y,2013-10-21,2013-12-20
1,25,Y,2013-10-21,2013-11-22
2,20,Y,2013-09-07,2013-11-16
3,23,Y,2013-10-08,2013-11-15
4,21,Y,2013-09-16,2013-10-18


In [20]:
campaign_data.dtypes

campaign_id            category
campaign_type          category
start_date       datetime64[ns]
end_date         datetime64[ns]
dtype: object

In [21]:
#checking for missing values
campaign_data.isnull().sum()

campaign_id      0
campaign_type    0
start_date       0
end_date         0
dtype: int64

In [22]:
#checking for duplicate rows
campaign_data.duplicated().sum()

0

In [23]:
#considering coupon_item_mapping dataframe
coupon_item_mapping.sample(10)

Unnamed: 0,coupon_id,item_id
76226,30,5875
63235,29,18726
39536,9,46799
59635,23,57567
13349,891,57845
68561,32,9228
82873,32,35460
83605,32,8731
63604,31,23308
35288,7,11474


In [24]:
coupon_item_mapping.shape

(92663, 2)

In [25]:
coupon_item_mapping.dtypes

coupon_id    int64
item_id      int64
dtype: object

Both the ids are of type int, should be converted to category

In [27]:
coupon_item_mapping['coupon_id'] = coupon_item_mapping['coupon_id'].astype('category')
coupon_item_mapping['item_id'] = coupon_item_mapping['item_id'].astype('category')

In [28]:
coupon_item_mapping.dtypes

coupon_id    category
item_id      category
dtype: object

In [29]:
#checking for missing values
coupon_item_mapping.isnull().sum()

coupon_id    0
item_id      0
dtype: int64

In [30]:
#checking for duplicate rows
coupon_item_mapping.duplicated().sum()

0

In [31]:
#considering customer_demographics dataframe
customer_demographics.sample(10)

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
5,11,70+,Single,0,2,,1
330,663,56-70,,0,1,,1
432,886,46-55,Married,0,3,1.0,5
360,738,46-55,,0,1,,1
479,974,46-55,,0,1,,2
555,1154,46-55,Married,0,3,1.0,6
523,1083,26-35,Married,1,4,2.0,2
101,201,46-55,,0,3,1.0,6
102,202,46-55,Married,0,2,,6
126,250,36-45,,0,1,,9


In [32]:
customer_demographics.shape

(760, 7)

In [33]:
customer_demographics.dtypes

customer_id        int64
age_range         object
marital_status    object
rented             int64
family_size       object
no_of_children    object
income_bracket     int64
dtype: object

customer_id should be of type category
age-range represents age range of customer family in years , should be converted to category 
marital_status contains two categories: single and married, should be converted to category
rented represents 0 - not rented accommodation, 1 - rented accommodation, should be converted to category
income_bracket represents label encoded income bracket, it should be converted to ordinal category

In [35]:
#converting datatypes
customer_demographics['customer_id'] = customer_demographics['customer_id'].astype('category')
customer_demographics['age_range'] = customer_demographics['age_range'].astype('category')
customer_demographics['marital_status'] = customer_demographics['marital_status'].astype('category')
customer_demographics['rented'] = customer_demographics['rented'].astype('category')


In [36]:
customer_demographics['family_size'].value_counts()

family_size
2     303
1     248
3     104
5+     57
4      48
Name: count, dtype: int64

As the family_size contains mixed datatype, integers and string, converting it into category

In [38]:
customer_demographics['family_size'] = customer_demographics['family_size'].astype('category')

In [39]:
customer_demographics['no_of_children'].value_counts()

no_of_children
1     107
3+     60
2      55
Name: count, dtype: int64

Same as , no_of_children contains mixed datatype, integers and string, converting it into category

In [41]:
customer_demographics['no_of_children'] = customer_demographics['no_of_children'].astype('category')

In [42]:
customer_demographics['income_bracket'].value_counts()

income_bracket
5     187
4     165
6      88
3      70
2      68
1      59
8      37
7      32
9      29
12     10
10     10
11      5
Name: count, dtype: int64

In [44]:
#converting income_bracket into into ordinal ctegory
values = [1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11, 12]
customer_demographics['income_bracket'] = pd.Categorical(
    customer_demographics['income_bracket'],
    categories=values,
    ordered=True
)


In [45]:
customer_demographics.sample(10)

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
634,1322,46-55,Married,0,2,,5
619,1296,26-35,Single,0,2,,8
723,1492,46-55,Married,1,2,,3
337,679,36-45,Single,1,2,,4
144,286,46-55,Single,0,1,,5
674,1392,36-45,,0,3,2,5
560,1167,46-55,,0,1,,5
494,1011,46-55,Married,0,5+,3+,8
539,1121,18-25,Married,0,3,1,5
145,287,26-35,,0,2,1,2


In [83]:
#checking for missing values as we can see from above they are present in marital_status and  no_of_children
customer_demographics.isnull().sum()

customer_id         0
age_range           0
marital_status    329
rented              0
family_size         0
no_of_children    538
income_bracket      0
dtype: int64

In [85]:
# checking the missing value percentage
(customer_demographics.isnull().sum()/len(customer_demographics)) * 100

customer_id        0.000000
age_range          0.000000
marital_status    43.289474
rented             0.000000
family_size        0.000000
no_of_children    70.789474
income_bracket     0.000000
dtype: float64

Here, 43% of data in marital_status and around 71% data in no_of_children are missing, which is very huge amount