# Exam Preparation Data Mining Cup 2014

## Files:
- data/orders_class.txt (data to classify - this is the 'test' data for which the target feature is predicted)
- data/orders_train.txt (training data - this is the ground truth and will be used to train the algo)
- data/DMC 2014_realclass.txt (validation data - quality check for algo)

## File Handling:
- Use the 'data/orders_train.txt' dataset to train the algo
- Afterwards use 'data/orders_class.txt' dataset to predict the target feature
- Finally check the quality of the results with 'data/DMC 2014_realclass.txt' dataset

## Scenario:
'On the basis of historical purchase data of an online shop a model is to be learned generating a
prediction of the probability that a certain purchase is converted into a return on the basis of new
purchase data of the shop.'

## Data:
- Item kept = '0'
- Item returned = '1'

# Data Exploration

In [323]:
import pandas as pd
import numpy as np
from datetime import timedelta
from datetime import datetime

In [208]:
def perform_data_quality_check(initial_dataframe):
    """Summarise missing values and the value range of every column.

    A value counts as missing when it is the literal string '?' or a
    pandas/numpy null.

    Parameters
    ----------
    initial_dataframe : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per input column with the columns 'field',
        'missing_values', 'min_value' and 'max_value'. 'min_value' and
        'max_value' are NaN for the columns 'size', 'color', 'salutation'
        and 'state'.
    """
    # text columns for which no min/max is reported
    columns_without_range = ('size', 'color', 'salutation', 'state')
    report_columns = ['field', 'missing_values', 'min_value', 'max_value']

    records = []
    for column in initial_dataframe:
        # always read from the argument: relying on a notebook-level `df`
        # fails on a fresh kernel and silently checks the wrong frame otherwise
        values = initial_dataframe[column]

        # count the missing values (defined as '?' or numpy.NaN)
        missing_values = (values == '?').sum() + values.isna().sum()

        # calculate the min and max if the field is a date or numeric;
        # '?' placeholders are not filtered out and can therefore show up
        # as the min/max of a text-typed column
        if column not in columns_without_range:
            min_value = values.min()
            max_value = values.max()
        else:
            min_value = float('NaN')
            max_value = float('NaN')

        records.append({
            'field': column,
            'missing_values': missing_values,
            'min_value': min_value,
            'max_value': max_value,
        })

    # build the report once instead of growing a DataFrame row by row
    return pd.DataFrame(records, columns=report_columns)

In [209]:
# load the semicolon-separated training orders and preview the first rows
df = pd.read_csv('data/orders_train.txt', sep=';')
df.head()

Unnamed: 0,orderItemID,orderDate,deliveryDate,itemID,size,color,manufacturerID,price,customerID,salutation,dateOfBirth,state,creationDate,returnShipment
0,1,2012-04-01,2012-04-03,186,m,denim,25,69.9,794,Mrs,1965-01-06,Baden-Wuerttemberg,2011-04-25,0
1,2,2012-04-01,2012-04-03,71,9+,ocher,21,69.95,794,Mrs,1965-01-06,Baden-Wuerttemberg,2011-04-25,1
2,3,2012-04-01,2012-04-03,71,9+,curry,21,69.95,794,Mrs,1965-01-06,Baden-Wuerttemberg,2011-04-25,1
3,4,2012-04-02,?,22,m,green,14,39.9,808,Mrs,1959-11-09,Saxony,2012-01-04,0
4,5,2012-04-02,1990-12-31,151,39,black,53,29.9,825,Mrs,1964-07-11,Rhineland-Palatinate,2011-02-16,0


In [210]:
# inspect the column dtypes and the non-null counts;
# values encoded as the string '?' are reported as non-null here
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481092 entries, 0 to 481091
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   orderItemID     481092 non-null  int64  
 1   orderDate       481092 non-null  object 
 2   deliveryDate    481092 non-null  object 
 3   itemID          481092 non-null  int64  
 4   size            481092 non-null  object 
 5   color           481092 non-null  object 
 6   manufacturerID  481092 non-null  int64  
 7   price           481092 non-null  float64
 8   customerID      481092 non-null  int64  
 9   salutation      481092 non-null  object 
 10  dateOfBirth     481092 non-null  object 
 11  state           481092 non-null  object 
 12  creationDate    481092 non-null  object 
 13  returnShipment  481092 non-null  int64  
dtypes: float64(1), int64(5), object(8)
memory usage: 51.4+ MB


## Note
- The data types are not right
  - for example, 'orderDate' is of type 'object' but needs to be a datetime type
- 'df.info()' doesn't detect the missing values because they are encoded as the string '?' instead of numpy NaN
  - The documentation states, for example, that there are missing values in the column 'deliveryDate'

In [211]:
# run the basic data quality checks on the training data:
# per column the count of missing values plus the min and max value
# (min/max are left empty for the text columns size, color, salutation, state)
dq_check_df = perform_data_quality_check(df)
dq_check_df

Unnamed: 0,field,missing_values,min_value,max_value
0,orderItemID,0,1,481092
1,orderDate,0,2012-04-01,2013-03-31
2,deliveryDate,39419,1990-12-31,?
3,itemID,0,1,3071
4,size,0,,
5,color,143,,
6,manufacturerID,0,1,166
7,price,0,0.0,999.0
8,customerID,0,6,86611
9,salutation,0,,


In [214]:
# replace the '?' placeholders with numpy NaN so pandas treats them as missing;
# re-assigning the result (instead of `inplace=True`) makes the data flow
# explicit and keeps the call chainable
df = df.replace('?', np.nan)

In [217]:
# first attempt: strict conversion of the date fields to a datetime format.
# The failure is the point of this cell (a dateOfBirth in the year 1655 lies
# outside the supported Timestamp range), so the exception is caught and
# displayed instead of aborting a "Restart & Run All".
date_columns = ['orderDate', 'deliveryDate', 'dateOfBirth', 'creationDate']
try:
    df[date_columns] = df[date_columns].apply(pd.to_datetime, format='%Y-%m-%d')
except pd.errors.OutOfBoundsDatetime as error:
    print(f'OutOfBoundsDatetime: {error}')

OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 1655-04-19 00:00:00 present at position 11578

In [228]:
# the conversion fails because the year 1655 lies outside the range that a
# nanosecond-resolution pandas Timestamp can represent; show that range
min_ts, max_ts = pd.Timestamp.min, pd.Timestamp.max
print(f'min timestamp: {min_ts}, max timestamp: {max_ts}')

min timestamp: 1677-09-21 00:12:43.145224193, max timestamp: 2262-04-11 23:47:16.854775807


In [229]:
# three order items carry this out-of-range date of birth,
# and all of them belong to the same customer ('customerID' == 48469)
df[df['dateOfBirth'] == '1655-04-19']

Unnamed: 0,orderItemID,orderDate,deliveryDate,itemID,size,color,manufacturerID,price,customerID,salutation,dateOfBirth,state,creationDate,returnShipment
201497,201498,2012-09-12,2012-09-13,1662,l,brown,36,109.9,48469,Mrs,1655-04-19,Bavaria,2011-02-16,1
223802,223803,2012-09-22,2012-09-24,1662,m,brown,36,109.9,48469,Mrs,1655-04-19,Bavaria,2011-02-16,1
322553,322554,2013-01-08,2013-01-31,2429,M,black,11,89.9,48469,Mrs,1655-04-19,Bavaria,2011-02-16,0


In [264]:
# look for further unrealistic dates of birth: export the 150 rows with the
# smallest 'dateOfBirth' values for a manual review
(
    df
    .sort_values('dateOfBirth')
    .head(150)
    .to_csv('output_files/date_of_birth_analysis.csv')
)

In [244]:
# latest birth year that still counts as implausible:
# the orders end in 2013 and customers are assumed to be at most 95 years old
latest_order_year = 2013
max_customer_age = 95
age_gap = latest_order_year - max_customer_age
age_gap

1918

In [255]:
# assumption: our customers are at most 95 years old, so every purchase with a
# date of birth up to the end of 1918 is counted as implausible
len(df[df['dateOfBirth'].between('1655-01-01', '1918-12-31')])

5231

In [262]:
# keep the purchases with an implausible date of birth in their own dataframe
# and export them to a .csv file for further exploration
birth_date_too_old = df['dateOfBirth'].between('1655-01-01', '1918-12-31')
invalid_age = df[birth_date_too_old]
invalid_age.to_csv('output_files/customer_analysis_with_invalid_date_of_birth.csv')

In [293]:
# count how many distinct customers are affected by an implausible date of birth
invalid_age['customerID'].nunique()

# optional: display all affected customerIDs in ascending order
#invalid_age.sort_values('customerID')['customerID'].unique()

530

In [300]:
# share (in percent) of the order items that carry an implausible date of birth
# (about one percent); derived from the dataframes instead of hard-coded row
# counts so the value cannot go stale when the data or the filter changes
perc_of_wrong_bdate = len(invalid_age) / len(df) * 100
perc_of_wrong_bdate

1.087318018175318

In [303]:
# check dateOfBirth is NaN
#df.loc[df['dateOfBirth'].isnull()]

In [328]:
# latest delivery date among the rows that have one
# (for comparison: orderDate_max = 31.03.2013, deliverDate_max = 22.07.2013)
df.loc[df['deliveryDate'].notna(), 'deliveryDate'].max()

'2013-07-22'

In [329]:
# convert the date fields to datetime; values that cannot be represented
# (e.g. the out-of-bounds year 1655) are coerced to NaT instead of raising
for date_column in ['orderDate', 'deliveryDate', 'dateOfBirth', 'creationDate']:
    df[date_column] = pd.to_datetime(df[date_column], format='%Y-%m-%d', errors='coerce')

In [330]:
# re-check the dtypes and non-null counts after the datetime conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481092 entries, 0 to 481091
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   orderItemID     481092 non-null  int64         
 1   orderDate       481092 non-null  datetime64[ns]
 2   deliveryDate    441673 non-null  datetime64[ns]
 3   itemID          481092 non-null  int64         
 4   size            481092 non-null  object        
 5   color           480949 non-null  object        
 6   manufacturerID  481092 non-null  int64         
 7   price           481092 non-null  float64       
 8   customerID      481092 non-null  int64         
 9   salutation      481092 non-null  object        
 10  dateOfBirth     432200 non-null  datetime64[ns]
 11  state           481092 non-null  object        
 12  creationDate    481092 non-null  datetime64[ns]
 13  returnShipment  481092 non-null  int64         
dtypes: datetime64[ns](4), float64(1), in