In [1]:
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive inside the Colab runtime; the dataset paths defined in the
# next cell live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Dataset file locations on the mounted drive. Only train_set is read in the
# cells shown here; valid_set is presumably used further down — TODO confirm.
train_set = '/content/drive/MyDrive/ML_ECO/cup98lrn.csv' # zip
valid_set = '/content/drive/MyDrive/ML_ECO/cup98val.txt'

In [488]:
# Load the training file into the working frame. low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv(train_set, low_memory = False)  #compression='zip'

In [490]:
# Check which column names sit at positions 362-385 (the same slice is used
# for drop6 in the drop cell below).
df.columns[362:386]

Index(['ADATE_2', 'ADATE_3', 'ADATE_4', 'ADATE_5', 'ADATE_6', 'ADATE_7',
       'ADATE_8', 'ADATE_9', 'ADATE_10', 'ADATE_11', 'ADATE_12', 'ADATE_13',
       'ADATE_14', 'ADATE_15', 'ADATE_16', 'ADATE_17', 'ADATE_18', 'ADATE_19',
       'ADATE_20', 'ADATE_21', 'ADATE_22', 'ADATE_23', 'ADATE_24', 'RFA_2'],
      dtype='object')

In [491]:
# Inspect the raw GENDER codes before they are recoded.
df.GENDER.unique()

array(['F', 'M', ' ', 'C', 'U', 'J', 'A'], dtype=object)

## Preprocessing
Our preprocessing will include 4 steps:
1. We will delete the columns we will not use (please see documentation to see which columns we drop)
2. Re-code /transform complex variables to easier ones
3. Encode ordinal variables using ordinal encoder and create one-hot-encoding where ordinal encoding is not suitable
4. Drop features that contain more than 50% missing data, then delete the observations that still have missing values

After each step we will check whether our target variable (TARGET_B) still has enough positive examples (value = 1)




**STEP 1: Drop Features**

In dropping features we follow certain rules, namely:
1. We drop every feature that is a date
2. We drop features that refer to the sources of information 
3. We drop features that are already covered implicitly by the other features that we will encode 
4. We drop features that have a very complex structure and would require a lot of dummy variables (like STATE)
5. We drop features that highly depend on features that we previously decided to drop (like WEALTH2)



In [492]:
# Column-name groups captured by POSITION in the freshly loaded frame, before
# any column is dropped; later cells use these names to recode the columns.
# yesno_variables: columns whose ' ' / 'N' / 'Y' values are mapped to 0 / 0 / 1.
# rfas: the RFA_3 .. RFA_24 code columns that get split into R / F / A parts.

yesno_variables = df.columns[56:74]
rfas = df.columns[386:408]

In [493]:
# Every selection below is positional and refers to the frame exactly as it was
# loaded, so all of them are taken before anything is removed.

# --- multi-column groups (lists of names) ---
drop1 = df.columns[1:15].to_list()      # need to drop these
drop4 = df.columns[44:51].to_list()     # connection to the war and war veterans; might be relevant even though very specific
drop11 = df.columns[51:54].to_list()    # SOLICIT variables + MAJOR
drop5 = df.columns[54:56].to_list()
drop6 = df.columns[362:386].to_list()   # ADATES and RFA_XX
drop7 = df.columns[413:457].to_list()   # dates of different donations received
drop15 = df.columns[465:468].to_list()  # dates of different gifts received
drop9 = df.columns[480:482].to_list()   # CLUSTER and GEOCODE

# --- single columns (plain labels) ---
drop2 = df.columns[16]    # CLUSTER
drop3 = df.columns[18]    # AGEFLAG
drop17 = df.columns[24]   # number of children: many empty cells, covered implicitly by other variables
drop10 = df.columns[43]   # source from where the data is collected
drop12 = df.columns[74]   # LIFE STYLE DATA SOURCE
drop13 = df.columns[461]  # date associated with the smallest gift to date
drop14 = df.columns[463]  # date associated with the largest gift to date
drop8 = df.columns[473]

# NOTE(review): the next two selections are defined but never dropped. Dropping
# them would shift the positional slices used further down, so confirm intent
# before wiring them in.
drop16 = df.columns[409]      # MAXADATE
drop18 = df.columns[196:199]  # unclear codes; not obviously usable for prediction

drop = [*drop1, *drop5, *drop6, *drop7, *drop9, *drop11, *drop15, *drop4]

# Remove the grouped columns first, then the individual ones, in a single chain.
df = (
    df
    .drop(columns=drop)
    .drop(columns=[drop2, drop3, drop8, drop10, drop12, drop13, drop14, drop17])
)

**STEP 2 : Encode**

Now we start encoding things
1. We first rename the first column to Index
2. We then split DOMAIN: its first character is kept for one-hot encoding and its second character becomes the new SES variable
3. Afterwards we recode the CHILDXX variables, which will also be one-hot encoded
4.
5.

In [494]:
# 'Unnamed: 0' is the name pandas gives a header-less first column; call it Index.
df = df.rename(columns = {'Unnamed: 0' : 'Index'}) # rename first column to the Index

In [495]:
# Inspect the raw two-character DOMAIN codes (note the blank ' ' entry).
df.DOMAIN.unique()

array(['T2', 'S1', 'R2', 'S2', 'T1', 'R3', 'U1', 'C2', 'C1', 'U3', ' ',
       'R1', 'U2', 'C3', 'U4', 'S3', 'T3'], dtype=object)

In [496]:
# Split the two-character DOMAIN code into its two components.
#
# Blank codes are first turned into '99' so that both characters exist; the
# resulting 9 / '9' values act as "missing" sentinels that the recode cell
# further down converts to NaN.
df['DOMAIN'] = df['DOMAIN'].replace({' ': '99'})

# NEW variable - socioeconomic status - second byte from DOMAIN.
# Vectorised .str access replaces the former per-row loop, which wrote through
# chained indexing (df.SES[i] = ...) — slow, warning-prone, silently ineffective
# under pandas Copy-on-Write, and only correct for a 0..n-1 index.
df['SES'] = df['DOMAIN'].str[1].astype('int64')

# Keep only the first character of DOMAIN.
df['DOMAIN'] = df['DOMAIN'].str[0]


In [504]:
 df[rfas] = df[rfas].replace({' ': '999'})
 df['RFA_23'] = df['RFA_23'].replace({'1D': '91D','3E': '93E', '2D' : '92D', '1E': '91E', '1C': '91C', '1G': '91G', '2E': '92E',
                                     '1F': '91F','4E': '94E', '3F': '93F' , '2F' : '92F'})
  
for item in rfas:
    df[item + 'R'] = [ letter[0] for letter in df[item].to_list() ]
    df[item + 'F'] = [ int(letter[1]) for letter in df[item].to_list() ]
    df[item + 'A'] = [ letter[2] for letter in df[item].to_list() ]
  
for item in rfas:
    df[item + 'R'] = df[item + 'R'].replace({'9': np.NaN, 'U': np.NaN, 'P': np.NaN})
    df[item + 'F'] = df[item + 'R'].replace({'9': np.NaN, 'U': np.NaN, 'P': np.NaN})
    df[item + 'A'] = df[item + 'R'].replace({'9': np.NaN, 'U': np.NaN, 'P': np.NaN})
 

In [523]:
# Count the distinct values in RFA_19A (unique() includes NaN as one value).
len(df.RFA_19A.unique())

7

In [524]:
# Recode flag-like and categorical columns. np.nan is used throughout because
# the np.NaN alias no longer exists in NumPy 2.x.

df['HOMEOWNR'] = df['HOMEOWNR'].replace({'H': 1, 'U': 0, ' ': np.nan})

# We assume that no indication of children means "no children" as a category.
for child_col in ['CHILD03', 'CHILD07', 'CHILD12', 'CHILD18']:
    df[child_col] = df[child_col].replace({' ': 'N'})

df['PEPSTRFL'] = df['PEPSTRFL'].replace({'X': 1, ' ': 0})

# 9 / '9' are the sentinels created when DOMAIN was split into DOMAIN + SES.
df['DOMAIN'] = df['DOMAIN'].replace({9: np.nan, '9': np.nan})

# Please read the documentation: every 4 is merged into 3 so that 3 covers all
# of the lowest SES levels.
df['SES'] = df['SES'].replace({9: np.nan, 4: 3})
# Swap 1 and 3 so that the variable can be encoded ordinally. A dict replace is
# applied to the original values, so the two mappings do not chain.
df['SES'] = df['SES'].replace({3: 1, 1: 3})

df['GENDER'] = df['GENDER'].replace({'A': 'U', 'C': 'U', ' ': np.nan, 'J': 'U'})

# MDMAUD_A deliberately keeps 'X' (it is listed as a category for the ordinal
# encoder); only MDMAUD_F has 'X' turned into missing.
df['MDMAUD_F'] = df['MDMAUD_F'].replace({'X': np.nan})

# These values are a bit ambiguous: some columns have no real 'N', so empty
# cells are assumed to be negative observations.
for item in yesno_variables:
    df[item] = df[item].replace({' ': 0, 'N': 0, 'Y': 1})

# Names of the R / F / A columns derived from the RFA codes. They are built from
# the names in rfas rather than from df.columns[376:442], which only pointed at
# these columns as long as the number of dropped/added columns upstream never
# changed.
recency = [item + 'R' for item in rfas]
frequency = [item + 'F' for item in rfas]
amount = [item + 'A' for item in rfas]

# One category list per frequency / amount column, for ordinal encoding.
freq = [[1, 2, 3, 4]] * len(frequency)
am = [['A', 'B', 'C', 'D', 'E', 'F', 'G']] * len(amount)



In [525]:
# Collect, in column order, the names of all columns in which more than half of
# the rows are missing, then remove those columns from the frame.
too_many_nas = [
    str(column)
    for column in df
    if df[column].isna().sum() > len(df) * 0.5
]

df = df.drop(columns=too_many_nas)

In [460]:
# Inspect the category values of DOMAIN and MDMAUD_A side by side.
# NOTE(review): the saved output still shows '9' in DOMAIN and this cell's
# execution count is lower than the recode cell's — the output looks stale;
# re-run to confirm.
[df.DOMAIN.unique()] + [df.MDMAUD_A.unique()]

[array(['T', 'S', 'R', 'U', 'C', '9'], dtype=object),
 array(['X', 'C', 'M', 'L', 'T'], dtype=object)]

In [None]:
# Print the dtype of every remaining column, one per line.
for item in df:
    print(df[item].dtype)

In [526]:
# Show which columns were dropped for having more than 50% missing values.
too_many_nas

['MBCRAFT',
 'MBGARDEN',
 'MBBOOKS',
 'MBCOLECT',
 'MAGFAML',
 'MAGFEM',
 'MAGMALE',
 'PUBGARDN',
 'PUBCULIN',
 'PUBHLTH',
 'PUBDOITY',
 'PUBNEWFN',
 'PUBPHOTO',
 'PUBOPP',
 'MDMAUD_F',
 'RFA_15R',
 'RFA_15F',
 'RFA_15A',
 'RFA_20R',
 'RFA_20F',
 'RFA_20A',
 'RFA_23R',
 'RFA_23F',
 'RFA_23A']

**Step 3: One Hot Encoding**

In [481]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Remove every row that still contains a missing value before encoding.
df = df.dropna()

# Columns to one-hot encode. Some recency columns may already have been removed
# by the ">50% missing" filter, so only those still present are requested;
# asking for a dropped column would raise a KeyError.
# NOTE(review): CHILD07 is recoded earlier but not listed here — confirm
# whether that omission is intended.
one_hot_encoding = (
    ['DOMAIN', 'CHILD03', 'CHILD12', 'CHILD18', 'GENDER', 'MDMAUD_R']
    + [column for column in recency if column in df.columns]
)

# Columns to encode ordinally, with one explicit category order per column
# (same order as the column list).
categorical_encoding = ['MDMAUD_A', 'SES', 'INCOME']  # + frequency + amount
categories = [['X', 'L', 'C', 'M', 'T'],
              [1., 2., 3.],                       # SES is numeric after recoding
              [1., 2., 3., 4., 5., 6., 7.]]       # + freq + am

# The encoders are created only after `categories` exists; building the
# OrdinalEncoder first fails with a NameError on a fresh kernel.
ordinal_encoder = OrdinalEncoder(categories=categories)
one_hot_encoder = OneHotEncoder()

df_onehot = one_hot_encoder.fit_transform(df[one_hot_encoding])
df_ordinal = ordinal_encoder.fit_transform(df[categorical_encoding])

In [487]:
# Check which values occur in the third-character (A) column derived from RFA_3.
df.RFA_3A.unique()

array(['G', 'E', 'F', 'D', '9', 'C', 'B'], dtype=object)

In [359]:
# Show the list of columns handed to the ordinal encoder.
categorical_encoding

['categorical_encoding',
 'MDMAUD_F',
 'SES',
 'INCOME',
 'RFA_3F',
 'RFA_4F',
 'RFA_5F',
 'RFA_6F',
 'RFA_7F',
 'RFA_8F',
 'RFA_9F',
 'RFA_10F',
 'RFA_11F',
 'RFA_12F',
 'RFA_13F',
 'RFA_14F',
 'RFA_15F',
 'RFA_16F',
 'RFA_17F',
 'RFA_18F',
 'RFA_19F',
 'RFA_20F',
 'RFA_21F',
 'RFA_22F',
 'RFA_23F',
 'RFA_24F',
 'RFA_3A',
 'RFA_4A',
 'RFA_5A',
 'RFA_6A',
 'RFA_7A',
 'RFA_8A',
 'RFA_9A',
 'RFA_10A',
 'RFA_11A',
 'RFA_12A',
 'RFA_13A',
 'RFA_14A',
 'RFA_15A',
 'RFA_16A',
 'RFA_17A',
 'RFA_18A',
 'RFA_19A',
 'RFA_20A',
 'RFA_21A',
 'RFA_22A',
 'RFA_23A',
 'RFA_24A']