In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import GaussianNB

In [2]:
train_data = pd.read_table('risk-train.txt')
test_data = pd.read_table('risk-test.txt')




In [3]:
train_data.replace('?',np.nan, inplace = True)
train_data.sample(5).head()


Unnamed: 0,ORDER_ID,CLASS,B_EMAIL,B_TELEFON,B_BIRTHDATE,FLAG_LRIDENTISCH,FLAG_NEWSLETTER,Z_METHODE,Z_CARD_ART,Z_CARD_VALID,...,FAIL_RPLZ,FAIL_RORT,FAIL_RPLZORTMATCH,SESSION_TIME,NEUKUNDE,AMOUNT_ORDER_PRE,VALUE_ORDER_PRE,DATE_LORDER,MAHN_AKT,MAHN_HOECHST
3890,39918,no,yes,no,9/12/1979,yes,no,debit_card,debit_card,4.2005,...,no,no,no,13,no,1,31.88,2/22/2001,0.0,0.0
25633,3256,no,yes,no,9/22/1963,no,no,debit_card,debit_card,6.2005,...,no,no,no,3,no,0,0.0,,,
21078,10782,no,no,no,4/29/1970,yes,no,check,,11.2006,...,no,no,no,3,no,1,63.57,5/28/2002,0.0,0.0
12308,25699,no,yes,no,6/12/1974,yes,no,check,,8.2005,...,no,no,no,1,yes,0,0.0,,,
20205,12395,no,yes,no,,no,no,credit_card,Eurocard,8.2005,...,no,no,no,17,yes,0,0.0,,,


In [4]:
test_data.replace('?',np.nan, inplace = True)
test_data.sample(5).head()

Unnamed: 0,ORDER_ID,B_EMAIL,B_TELEFON,B_BIRTHDATE,FLAG_LRIDENTISCH,FLAG_NEWSLETTER,Z_METHODE,Z_CARD_ART,Z_CARD_VALID,Z_LAST_NAME,...,FAIL_RPLZ,FAIL_RORT,FAIL_RPLZORTMATCH,SESSION_TIME,NEUKUNDE,AMOUNT_ORDER_PRE,VALUE_ORDER_PRE,DATE_LORDER,MAHN_AKT,MAHN_HOECHST
1509,42481,yes,yes,,yes,no,check,,3.2006,,...,no,no,no,5,no,1,5.91,4/11/2001,0.0,2.0
492,45067,yes,no,6/5/1961,yes,no,check,,9.2006,,...,no,no,no,4,no,3,171.0,4/24/2003,1.0,1.0
7022,28348,yes,no,3/20/1960,yes,no,check,,8.2006,,...,no,no,no,3,yes,0,0.0,,,
1991,41068,no,yes,,yes,no,check,,4.2006,,...,no,no,no,7,yes,0,0.0,,,
2593,39463,yes,no,,no,no,credit_card,Eurocard,9.2006,yes,...,no,no,no,13,no,1,7.92,3/12/2004,0.0,1.0


In [5]:
#function to check for missing values

def missing_data(data):
    total= data.isnull().sum().sort_values(ascending= False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

missing_data(train_data)


Unnamed: 0,Total,Percent
ANUMMER_10,30000,100.0
ANUMMER_09,29993,99.976667
ANUMMER_08,29966,99.886667
ANUMMER_07,29905,99.683333
ANUMMER_06,29794,99.313333
ANUMMER_05,29459,98.196667
ANUMMER_04,28668,95.56
ANUMMER_03,26802,89.34
ANUMMER_02,22147,73.823333
Z_CARD_ART,18654,62.18


In [6]:
missing_data(test_data)


Unnamed: 0,Total,Percent
ANUMMER_10,19987,99.935
ANUMMER_09,19982,99.91
ANUMMER_08,19971,99.855
ANUMMER_07,19935,99.675
ANUMMER_06,19855,99.275
ANUMMER_05,19636,98.18
ANUMMER_04,19123,95.615
ANUMMER_03,17907,89.535
ANUMMER_02,14905,74.525
Z_CARD_ART,12317,61.585


In [7]:
#drop columns with over 50% missing values to avoid imputational biasness
#drop id colum to avoid disturbing the results when training the model
#drop Z_LAST_NAME because its not useful.
train_data = train_data.drop(['ANUMMER_02','ANUMMER_03','ANUMMER_04','ANUMMER_05','ANUMMER_06','ANUMMER_07',
'ANUMMER_08','ANUMMER_09','ANUMMER_10','Z_CARD_ART','MAHN_HOECHST','MAHN_AKT','DATE_LORDER','Z_LAST_NAME','ORDER_ID'], axis = 1)
missing_data(train_data)


Unnamed: 0,Total,Percent
B_BIRTHDATE,2942,9.806667
TIME_ORDER,20,0.066667
CLASS,0,0.0
CHK_CARD,0,0.0
AMOUNT_ORDER_PRE,0,0.0
NEUKUNDE,0,0.0
SESSION_TIME,0,0.0
FAIL_RPLZORTMATCH,0,0.0
FAIL_RORT,0,0.0
FAIL_RPLZ,0,0.0


In [8]:
test_data = test_data.drop(['ANUMMER_02','ANUMMER_03','ANUMMER_04','ANUMMER_05','ANUMMER_06','ANUMMER_07',
'ANUMMER_08','ANUMMER_09','ANUMMER_10','Z_CARD_ART','MAHN_HOECHST','MAHN_AKT','DATE_LORDER','Z_LAST_NAME','ORDER_ID'], axis = 1)
missing_data(test_data)

Unnamed: 0,Total,Percent
B_BIRTHDATE,2054,10.27
TIME_ORDER,15,0.075
B_EMAIL,0,0.0
CHK_CARD,0,0.0
AMOUNT_ORDER_PRE,0,0.0
NEUKUNDE,0,0.0
SESSION_TIME,0,0.0
FAIL_RPLZORTMATCH,0,0.0
FAIL_RORT,0,0.0
FAIL_RPLZ,0,0.0


In [9]:
#fill missing values in 'TIME ORDER' using forward filling
train_data['TIME_ORDER'] = train_data['TIME_ORDER'].fillna(method='ffill')
train_data['B_BIRTHDATE'] = train_data['B_BIRTHDATE'].fillna(method='ffill')
missing_data(train_data)




  train_data['TIME_ORDER'] = train_data['TIME_ORDER'].fillna(method='ffill')
  train_data['B_BIRTHDATE'] = train_data['B_BIRTHDATE'].fillna(method='ffill')


Unnamed: 0,Total,Percent
CLASS,0,0.0
CHK_KTO,0,0.0
AMOUNT_ORDER_PRE,0,0.0
NEUKUNDE,0,0.0
SESSION_TIME,0,0.0
FAIL_RPLZORTMATCH,0,0.0
FAIL_RORT,0,0.0
FAIL_RPLZ,0,0.0
FAIL_LPLZORTMATCH,0,0.0
FAIL_LORT,0,0.0


In [10]:
#fill missing values in 'TIME ORDER' using forward filling
test_data['TIME_ORDER'] = test_data['TIME_ORDER'].fillna(method='ffill')
test_data['B_BIRTHDATE'] = test_data['B_BIRTHDATE'].fillna(method='ffill')
missing_data(test_data)




  test_data['TIME_ORDER'] = test_data['TIME_ORDER'].fillna(method='ffill')
  test_data['B_BIRTHDATE'] = test_data['B_BIRTHDATE'].fillna(method='ffill')


Unnamed: 0,Total,Percent
B_EMAIL,0,0.0
B_TELEFON,0,0.0
AMOUNT_ORDER_PRE,0,0.0
NEUKUNDE,0,0.0
SESSION_TIME,0,0.0
FAIL_RPLZORTMATCH,0,0.0
FAIL_RORT,0,0.0
FAIL_RPLZ,0,0.0
FAIL_LPLZORTMATCH,0,0.0
FAIL_LORT,0,0.0


In [11]:
train_data['BIRTH_YEAR'] = pd.to_datetime(train_data['B_BIRTHDATE']).dt.year
train_data['AGE'] = 2024 - train_data['BIRTH_YEAR']
train_data = train_data.drop(columns=['B_BIRTHDATE', 'BIRTH_YEAR'])


test_data['BIRTH_YEAR'] = pd.to_datetime(test_data['B_BIRTHDATE']).dt.year
test_data['AGE'] = 2024 - test_data['BIRTH_YEAR']
test_data = test_data.drop(columns=['B_BIRTHDATE', 'BIRTH_YEAR'])

In [12]:
X_train = train_data.drop(columns=['CLASS'])  # Extract features
y_train = train_data['CLASS']

In [15]:
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder_features_train = OneHotEncoder(sparse=False, drop='first')  
X_train_encoded = one_hot_encoder_features_train.fit_transform(X_train)

one_hot_encoder_target = OneHotEncoder(sparse=False, drop='first')  
y_train_reshaped = y_train.values.reshape(-1, 1)
y_train_encoded = one_hot_encoder_target.fit_transform(y_train_reshaped)






In [16]:
X_test = test_data
one_hot_encoder_features_test = OneHotEncoder(sparse=False, drop='first')  
X_test_encoded = one_hot_encoder_features_test.fit_transform(X_test)



In [17]:
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=one_hot_encoder_features_train.get_feature_names_out())
y_train_encoded_df = pd.DataFrame(y_train_encoded, columns=one_hot_encoder_target.get_feature_names_out())
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=one_hot_encoder_features_test.get_feature_names_out())


In [18]:
classifier = GaussianNB()
classifier.fit(X_train_encoded_df, y_train_encoded_df)


  y = column_or_1d(y, warn=True)


In [20]:
test_predictions = classifier.predict(X_test_encoded_df)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- AMOUNT_ORDER_10
- AMOUNT_ORDER_PRE_7
- SESSION_TIME_25
- SESSION_TIME_26
- VALUE_ORDER_10.4
- ...
Feature names seen at fit time, yet now missing:
- SESSION_TIME_24
- VALUE_ORDER_100.29
- VALUE_ORDER_100.39
- VALUE_ORDER_100.4
- VALUE_ORDER_100.59
- ...
