In [1]:
import sys
import pandas as pd
import numpy as np
from sqlalchemy import create_engine 
import missingno as msno
from zipfile import ZipFile
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Load the customer table from its CSV file and print a schema summary
# (column dtypes, non-null counts, memory usage).
customers_file = "resources/customers.csv"
customers_df = pd.read_csv(customers_file)
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355971 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1371980 non-null  object 
dtypes: float64(3), object(4)
memory usage: 73.3+ MB


In [3]:
# Count fully duplicated rows (every column identical to an earlier row).
customers_df.duplicated().sum()

0

In [4]:
# Count the missing (NaN) values in each column.
customers_df.isna().sum()

customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16009
age                        15861
postal_code                    0
dtype: int64

In [5]:
# Preview the first rows of the customer table.
customers_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [6]:
# Keep only rows that have at least 3 non-null values (thresh=3) and drop the
# rest. The frame is modified in place.
customers_df.dropna(axis=0,thresh=3,inplace=True)

In [7]:
# Re-inspect row count and per-column non-null counts after dropping sparse rows.
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1369967 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1369967 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355971 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1369967 non-null  object 
dtypes: float64(3), object(4)
memory usage: 83.6+ MB


In [8]:
# Helper that converts a hexadecimal hash string into an integer key.
def id_inting(x, n_hex_digits=16):
    """Convert the trailing hexadecimal digits of ``x`` to an integer.

    Only the last ``n_hex_digits`` characters of ``x`` are used; anything
    before them is discarded, so two different strings that share the same
    suffix map to the same integer. Callers should check uniqueness after
    converting a column.

    Parameters
    ----------
    x : str
        String whose trailing characters are hexadecimal digits.
    n_hex_digits : int, default 16
        Number of trailing hex digits to keep. Each hex digit carries 4 bits,
        so the default of 16 yields a value below 2**64.

    Returns
    -------
    int
        Integer value of the kept suffix, parsed in base 16.

    Raises
    ------
    ValueError
        If ``n_hex_digits`` is not positive, or if the kept suffix is empty
        or is not valid hexadecimal.
    """
    # x[-0:] would silently select the whole string, so reject non-positive widths.
    if n_hex_digits <= 0:
        raise ValueError("n_hex_digits must be a positive integer")
    return int(x[-n_hex_digits:], 16)

In [9]:
# Replace the hex-string customer_id with the integer produced by id_inting,
# reporting the number of distinct ids before and after the conversion.
customer_ids_before = customers_df["customer_id"].nunique()
print(f'number of uunique customer_id before converting {customer_ids_before}')
customers_df['customer_id'] = customers_df['customer_id'].apply(id_inting)
customer_ids_after = customers_df["customer_id"].nunique()
print(f'number of uunique customer_id after converting {customer_ids_after}')
# id_inting keeps only a suffix of the hash, so distinct ids could collide.
# Fail loudly instead of relying on someone comparing the two printed counts.
assert customer_ids_before == customer_ids_after, (
    "id_inting mapped distinct customer_id values to the same integer"
)

number of uunique customer_id before converting 1369967
number of uunique customer_id after converting 1369967


In [10]:
# Replace the hex-string postal_code with the integer produced by id_inting,
# reporting the number of distinct codes before and after the conversion.
postal_codes_before = customers_df["postal_code"].nunique()
print(f'number of uunique postal_code before converting {postal_codes_before}')
customers_df['postal_code'] = customers_df['postal_code'].apply(id_inting)
postal_codes_after = customers_df["postal_code"].nunique()
print(f'number of uunique postal_code after converting {postal_codes_after}')
# id_inting keeps only a suffix of the hash, so distinct codes could collide.
# Fail loudly instead of relying on someone comparing the two printed counts.
assert postal_codes_before == postal_codes_after, (
    "id_inting mapped distinct postal_code values to the same integer"
)

number of uunique postal_code before converting 352899
number of uunique postal_code after converting 352899


In [11]:
# Fill missing club_member_status values with an explicit "Not_interested" label.
print(f"The unique values before imputing in club_member_status column: {customers_df['club_member_status'].unique()}")
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and no longer
# updates customers_df when pandas Copy-on-Write is active.
customers_df['club_member_status'] = customers_df['club_member_status'].fillna("Not_interested")
print(f"The unique values after imputing in club_member_status column: {customers_df['club_member_status'].unique()}")

The unique values before imputing in club_member_status column: ['ACTIVE' 'PRE-CREATE' nan 'LEFT CLUB']
The unique values after imputing in club_member_status column: ['ACTIVE' 'PRE-CREATE' 'Not_interested' 'LEFT CLUB']


In [12]:
# Fill missing fashion_news_frequency values and fold the two spellings of
# "none" ("None" and "NONE") into a single "Not_interested" label.
print(f"The unique values before imputing in fashion_news_frequency column: {customers_df['fashion_news_frequency'].unique()}")
# Build the cleaned column in one chain and assign it back. Calling
# fillna/replace with inplace=True on the column selection is chained in-place
# assignment, which is deprecated and no longer updates customers_df when
# pandas Copy-on-Write is active.
customers_df["fashion_news_frequency"] = (
    customers_df["fashion_news_frequency"]
    .fillna("Not_interested")
    .replace(["None", "NONE"], "Not_interested")
)
print(f"The unique values after imputing in fashion_news_frequency column: {customers_df['fashion_news_frequency'].unique()}")

The unique values before imputing in fashion_news_frequency column: ['NONE' 'Regularly' nan 'Monthly' 'None']
The unique values after imputing in fashion_news_frequency column: ['Not_interested' 'Regularly' 'Monthly']


In [13]:
# Fill missing FN values with 0.0 so the column holds only 0.0 / 1.0.
print(f"The unique values before imputing in FN column: {customers_df['FN'].unique()}")
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and no longer
# updates customers_df when pandas Copy-on-Write is active.
# The former replace("Not_interested", 0.0) was dropped: FN is a float column,
# so that string could never match.
customers_df['FN'] = customers_df['FN'].fillna(0.0)
print(f"The unique values after imputing in FN column: {customers_df['FN'].unique()}")

The unique values before imputing in FN column: [nan  1.]
The unique values after imputing in FN column: [0. 1.]


In [14]:
# Fill missing Active values with 0.0 so the column holds only 0.0 / 1.0.
print(f"The unique values before imputing in Active column: {customers_df['Active'].unique()}")
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and no longer
# updates customers_df when pandas Copy-on-Write is active.
# The former replace("Not_interested", 0.0) was dropped: Active is a float
# column, so that string could never match.
customers_df['Active'] = customers_df['Active'].fillna(0.0)
print(f"The unique values after imputing in Active column : {customers_df['Active'].unique()}")

The unique values before imputing in Active column: [nan  1.]
The unique values after imputing in Active column : [0. 1.]


In [15]:
# List the distinct age values (including NaN) to spot missing or implausible entries.
customers_df["age"].unique()

array([49., 25., 24., 54., 52., 20., 32., 29., 31., 56., 75., 41., 27.,
       30., 48., 35., 22., 40., 38., 45., 68., 55., 19., 60., 44., 21.,
       26., 28., 53., 33., 17., 23., nan, 51., 18., 34., 57., 47., 70.,
       50., 63., 58., 43., 67., 72., 42., 39., 79., 71., 59., 36., 62.,
       37., 46., 73., 64., 74., 61., 85., 69., 76., 66., 65., 82., 16.,
       90., 80., 78., 81., 84., 77., 97., 89., 83., 98., 88., 86., 87.,
       93., 91., 99., 96., 94., 92., 95.])

In [16]:
# Drop the rows whose age is null. The frame is modified in place.
customers_df.dropna(subset=['age'],inplace=True)


In [17]:
# Count the missing (NaN) values in each column after cleaning.
customers_df.isna().sum()

customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64

In [18]:
# Summarise the cleaned customer table: row count, dtypes and memory usage.
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1356119 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1356119 non-null  uint64 
 1   FN                      1356119 non-null  float64
 2   Active                  1356119 non-null  float64
 3   club_member_status      1356119 non-null  object 
 4   fashion_news_frequency  1356119 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1356119 non-null  uint64 
dtypes: float64(3), object(2), uint64(2)
memory usage: 82.8+ MB


In [19]:
# Load the transactions table from its CSV file and preview the first rows.
transactions_file = "resources/transactions_train.csv"
transactions_df = pd.read_csv(transactions_file)
transactions_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [20]:
# Summarise the raw transactions table: row count, dtypes and memory usage.
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        int64  
 3   price             float64
 4   sales_channel_id  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.2+ GB


In [21]:
# Convert the t_dat column from strings to pandas datetimes, then re-check
# the dtypes to confirm the conversion.
transactions_df["t_dat"]=pd.to_datetime(transactions_df["t_dat"])
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype         
---  ------            -----         
 0   t_dat             datetime64[ns]
 1   customer_id       object        
 2   article_id        int64         
 3   price             float64       
 4   sales_channel_id  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 1.2+ GB


In [22]:
# Count the missing (NaN) values in each transactions column.
transactions_df.isna().sum()

t_dat               0
customer_id         0
article_id          0
price               0
sales_channel_id    0
dtype: int64

In [23]:
# Replace the hex-string customer_id with the integer produced by id_inting,
# reporting the number of distinct ids before and after the conversion. The
# same helper is applied to customers_df, so both tables share the key format.
txn_customer_ids_before = transactions_df["customer_id"].nunique()
print(f'number of uunique customer_id before converting {txn_customer_ids_before}')
transactions_df['customer_id'] = transactions_df['customer_id'].apply(id_inting)
txn_customer_ids_after = transactions_df["customer_id"].nunique()
print(f'number of uunique customer_id after converting {txn_customer_ids_after}')
# id_inting keeps only a suffix of the hash, so distinct ids could collide.
# Fail loudly instead of relying on someone comparing the two printed counts.
assert txn_customer_ids_before == txn_customer_ids_after, (
    "id_inting mapped distinct customer_id values to the same integer"
)

number of uunique customer_id before converting 1362281
number of uunique customer_id after converting 1362281


In [24]:
# Attach customer attributes to every transaction.
# how="inner" keeps only transactions whose customer_id is still present in
# customers_df, so transactions of customers removed during cleaning are dropped.
# validate="many_to_one" makes pandas raise MergeError if customer_id is not
# unique in customers_df, instead of silently duplicating transaction rows.
transactions_customer_df = pd.merge(
    transactions_df,
    customers_df,
    on="customer_id",
    how="inner",
    validate="many_to_one",
)

In [25]:
# Summarise the merged table: row count after the join, dtypes and memory usage.
transactions_customer_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31648066 entries, 0 to 31648065
Data columns (total 11 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   t_dat                   datetime64[ns]
 1   customer_id             uint64        
 2   article_id              int64         
 3   price                   float64       
 4   sales_channel_id        int64         
 5   FN                      float64       
 6   Active                  float64       
 7   club_member_status      object        
 8   fashion_news_frequency  object        
 9   age                     float64       
 10  postal_code             uint64        
dtypes: datetime64[ns](1), float64(4), int64(2), object(2), uint64(2)
memory usage: 2.8+ GB


In [27]:
# Count the missing (NaN) values in each column of the merged table.
transactions_customer_df.isna().sum()

t_dat                     0
customer_id               0
article_id                0
price                     0
sales_channel_id          0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64