In [58]:
import pandas as pd
import numpy as np

In [59]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [60]:
train_set = '/content/drive/MyDrive/ML_ECO/cup98lrn.csv' # zip
valid_set = '/content/drive/MyDrive/ML_ECO/cup98val.csv'

In [144]:
df = pd.read_csv(train_set,low_memory = False)  #compression='zip'

In [145]:
val = pd.read_csv(valid_set,low_memory = False)  #ASSUMING THE ORDER of columns is the same

In [146]:
df.columns[362:386] # just checking , should be ADATEs and RFA_2

Index(['ADATE_2', 'ADATE_3', 'ADATE_4', 'ADATE_5', 'ADATE_6', 'ADATE_7',
       'ADATE_8', 'ADATE_9', 'ADATE_10', 'ADATE_11', 'ADATE_12', 'ADATE_13',
       'ADATE_14', 'ADATE_15', 'ADATE_16', 'ADATE_17', 'ADATE_18', 'ADATE_19',
       'ADATE_20', 'ADATE_21', 'ADATE_22', 'ADATE_23', 'ADATE_24', 'RFA_2'],
      dtype='object')

## Preprosessing
Our preprosessing will include 4 steps:
1. We will delete the columns we will not use (please see documentation to see which columns we drop)
2. Re-code /transform complex variables to easier ones
3. Encode ordinal variables using ordinal encoder and create one-hot-encoding where ordinal encoding is not suitable
4. Delete observations with missing data or additionally drop features that contain more than 50% of missing data

After each step we will check whether our target variable (TARGET_B) still has enough positive examples (value = 1)




**STEP 1: Drop Features**

In dropping features we follow certain rules, namely:
1. We drop every feature that is a date
2. We drop features that refer to the sources of information 
3. We drop features that are already covered implicitly by the other features that we will encode 
4. We drop features that have a very complex structure and would require a lot of dummy variables (like STATE)
5. We drop features that highly depend on features that we previously decided to drop (like WEALTH2)



In [147]:
# some important stuff 
yesno_variables = df.columns[56:74]
rfas = df.columns[386:408] # RFA variables

In [148]:
drop1 = df.columns[1:15].to_list() # need to drop these
drop2 = df.columns[16]#.to_list() # CLUSTER   
drop3 = df.columns[18]#.to_list() # AGEFLAG
drop4 = df.columns[44:51].to_list() # connection to the war and war veterans , might actually be relevant even though very specific data
drop11 = df.columns[51:54].to_list() # SOLICIT variables + MAJOR
drop5 = df.columns[54:56].to_list() 
drop6 = df.columns[362:386].to_list() # ADATES and RFA_XX
drop7 = df.columns[413:457].to_list() # Dates of different donations received
drop8 = df.columns[473]
drop9 = df.columns[480:482].to_list() # CLUSTER and GEOCODE
drop10 = df.columns[43] # drop the source from where the data is collected
drop12 = df.columns[74] #  LIFE STYLE DATA SOURCE
drop13 = df.columns[461] # Date associated with the smallest gift to date
drop14 = df.columns[463] # Date associated with the largest gift to date
drop15 = df.columns[465:468].to_list() # Dates of different gifts received
drop16 = df.columns[409] # MAXADATE
drop17 = df.columns[24] # Number of Children, very ambiguous since many empty cells, covered by other variables implicitly
drop18 = df.columns[196:199] # these are very weird codes, I do not know how to use them for the prediction
drop = drop1+drop5+drop6+drop7+drop9+drop11+drop15+drop4
df = df.drop(drop, axis = 1)
df = df.drop(drop2,axis = 1)
df = df.drop(drop3, axis = 1)
df = df.drop(drop8, axis = 1)
df = df.drop(drop10, axis = 1)
df = df.drop(drop12, axis = 1)
df = df.drop(drop13, axis = 1)
df = df.drop(drop14, axis = 1)
df = df.drop(drop17, axis = 1)


# repeat for val

val = val.drop(drop, axis = 1)
val = val.drop(drop2,axis = 1)
val = val.drop(drop3, axis = 1)
val = val.drop(drop8, axis = 1)
val = val.drop(drop10, axis = 1)
val = val.drop(drop12, axis = 1)
val = val.drop(drop13, axis = 1)
val = val.drop(drop14, axis = 1)
val = val.drop(drop17, axis = 1)


**STEP 2 : Encode**

Now we start encoding things
1. We first rename the first column to Index
2. We then extract the first letter from Domain to then encode it to one-hot encoding
3. Afterwards we change the CHILDXX variables , which will be also encoded as one-hot 
4.
5.

In [149]:
df = df.rename(columns = {'Unnamed: 0' : 'Index'}) # rename first column to the Index
val = val.rename(columns = {'Unnamed: 0' : 'Index'}) # rename first column to the Index


In [150]:
val.DOMAIN.unique()

array(['T1', 'C1', 'C2', 'R2', 'S1', 'C3', 'T2', 'U2', 'U1', 'U4', 'S2',
       'S3', ' ', 'T3', 'R3', 'U3', 'R1'], dtype=object)

In [151]:
df.DOMAIN = df.DOMAIN.replace({' ': '99'}) # replace empty cells to 99 and then every 9 will be replaced to NaN

df['SES'] = 0 # NEW variable - socioeconomic status - second byte from DOMAIN
index = 0
for x in df.DOMAIN:
    df.SES[index] = int(df.DOMAIN[index][1])
    index += 1


# extracting the first 
index = 0
for x in df.DOMAIN:
    df.DOMAIN[index] = df.DOMAIN[index][0]
    index += 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [152]:
val.DOMAIN = val.DOMAIN.replace({' ': '99'}) # replace empty cells to 99 and then every 9 will be replaced to NaN

val['SES'] = 0 # NEW variable - socioeconomic status - second byte from DOMAIN
index = 0
for x in val.DOMAIN:
    val.SES[index] = int(val.DOMAIN[index][1])
    index += 1


# extracting the first 
index = 0
for x in val.DOMAIN:
    val.DOMAIN[index] = val.DOMAIN[index][0]
    index += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [153]:
val[rfas]

Unnamed: 0,RFA_3,RFA_4,RFA_5,RFA_6,RFA_7,RFA_8,RFA_9,RFA_10,RFA_11,RFA_12,RFA_13,RFA_14,RFA_15,RFA_16,RFA_17,RFA_18,RFA_19,RFA_20,RFA_21,RFA_22,RFA_23,RFA_24
0,A1G,A1G,A1G,A1G,A2G,A2G,A1F,A1F,A1F,A1F,A1F,A1F,A2F,A2F,A1E,A1E,A2E,A2E,A2E,A2E,A2E,A1D
1,A1F,A1F,,N2F,N1E,N1E,N1E,N1E,N1E,N1E,F1E,F1E,,,P1E,P1E,,,,,,
2,A1G,A1G,A1G,A1G,A1G,A1G,A1G,A1G,A1G,A1G,A1G,A1G,A1G,A1G,A1F,A1F,A1F,A1F,A1F,A1F,A1F,A1F
3,A1F,A1F,A1F,A1F,A1F,A1F,A1F,A1F,A1F,A1F,A1F,A1F,,A1F,N1D,N1D,N1D,N1D,N1D,N1D,,F1D
4,A3E,A3E,,A2D,A2D,A2D,A2D,A2D,A2D,A2D,A3D,A2D,,A2D,A2D,A2D,A2D,A1D,A1D,A1D,,A1D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96362,A1F,A1F,,L1F,L1F,L1F,A1F,A1F,A1F,A1F,A1F,A1F,,A1F,A1F,A1F,L2F,,L2F,L2F,,L2F
96363,,,,L1E,A1E,A1E,A1E,A1E,A1E,A1E,,A1E,,A1E,,,A1D,A1D,A1D,A1D,,A1D
96364,N2F,N2F,N2F,F1E,F1E,F1E,,,P1A,P1E,,,,,,,,,,,,
96365,A1G,A1G,A2G,A2G,A2G,A2G,N3G,N3G,N3G,N3G,N2G,N2G,N2G,F1F,F1F,F1F,F1F,,P1A,P1F,,


In [154]:
 df[rfas] = df[rfas].replace({' ': '999'})
 df['RFA_23'] = df['RFA_23'].replace({'1D': '91D','3E': '93E', '2D' : '92D', '1E': '91E', '1C': '91C', '1G': '91G', '2E': '92E',
                                     '1F': '91F','4E': '94E', '3F': '93F' , '2F' : '92F', '1B': '91B'})
  
for item in rfas:
    df[item + 'R'] = [ letter[0] for letter in df[item].to_list() ] # every first letter goes to Recency
    df[item + 'F'] = [ int(letter[1]) for letter in df[item].to_list() ] # every second letter (actually integer) goes to Frequency
    df[item + 'A'] = [ letter[2] for letter in df[item].to_list() ] # every third letter goes to Amount
  
for item in rfas:
    df[item + 'R'] = df[item + 'R'].replace({'9': np.NaN, 'U': np.NaN, 'P': np.NaN})
    df[item + 'F'] = df[item + 'F'].replace({9: np.NaN})
    df[item + 'A'] = df[item + 'A'].replace({'9': np.NaN, 'U': np.NaN, 'P': np.NaN})
 

In [155]:
val[rfas] = val[rfas].replace({' ': '999'})
val['RFA_23'] = val['RFA_23'].replace({'1D': '91D','3E': '93E', '2D' : '92D', '1E': '91E', '1C': '91C', '1G': '91G', '2E': '92E',
                                     '1F': '91F','4E': '94E', '3F': '93F' , '2F' : '92F', '1B': '91B', '2C': '92C', '2G': '92G'})
  
for item in rfas:
    val[item + 'R'] = [ letter[0] for letter in val[item].to_list() ] # every first letter goes to Recency
    val[item + 'F'] = [ int(letter[1]) for letter in val[item].to_list() ] # every second letter (actually integer) goes to Frequency
    val[item + 'A'] = [ letter[2] for letter in val[item].to_list() ] # every third letter goes to Amount
  
for item in rfas:
    val[item + 'R'] = val[item + 'R'].replace({'9': np.NaN, 'U': np.NaN, 'P': np.NaN})
    val[item + 'F'] = val[item + 'F'].replace({9: np.NaN})
    val[item + 'A'] = val[item + 'A'].replace({'9': np.NaN, 'U': np.NaN, 'P': np.NaN})
 

In [156]:
df.HOMEOWNR = df.HOMEOWNR.replace({'H': 1, 'U': 0, ' ': np.NaN})
df['CHILD03'] = df['CHILD03'].replace({' ': 'N'}) # we will assume that no indication of 
# children means no children as a category
df['CHILD07'] = df['CHILD07'].replace({' ': 'N'})	
df['CHILD12'] = df['CHILD12'].replace({' ': 'N'})
df['CHILD18'] = df['CHILD18'].replace({' ': 'N'})
#df.MAJOR = df.MAJOR.replace({'X': 1, ' ': 0})
df.PEPSTRFL = df.PEPSTRFL.replace({'X': 1, ' ':0})
df.DOMAIN = df.DOMAIN.replace({9 : np.NaN, '9': np.NaN})
df.SES = df.SES.replace({9 : np.NaN, 4: 3}) # please read documentation, the decision was made to replace every 4 to 3 so that technically 3 will include 
# all lowest SES
df.SES = df.SES.replace({3: 1, 1: 3}) # now we replace all 3 with 1 and all 1 with 3 so that we can decode it ordinally
df.GENDER = df.GENDER.replace({'A': 'U', 'C': 'U', ' ': np.NaN, 'J' : 'U'})
#df.MDMAUD_A = df.MDMAUD_A.replace({'X': np.NaN})
df.MDMAUD_F = df.MDMAUD_F.replace({'X': np.NaN})

for item in yesno_variables:
    df[item] = df[item].replace({' ': 0, 'N':0, 'Y': 1}) # these data values are a bit ambiguous since some of them do not have real N so we assume
    #that empty cells represent negative observations


recency = df.columns[376:442].to_list()[0::3]
frequency = df.columns[376:442].to_list()[1::3]
amount = df.columns[376:442].to_list()[2::3]


In [157]:
val.HOMEOWNR = val.HOMEOWNR.replace({'H': 1, 'U': 0, ' ': np.NaN})
val['CHILD03'] = val['CHILD03'].replace({' ': 'N'}) # we will assume that no indication of 
# children means no children as a category
val['CHILD07'] = val['CHILD07'].replace({' ': 'N'})	
val['CHILD12'] = val['CHILD12'].replace({' ': 'N'})
val['CHILD18'] = val['CHILD18'].replace({' ': 'N'})
#df.MAJOR = df.MAJOR.replace({'X': 1, ' ': 0})
val.PEPSTRFL = val.PEPSTRFL.replace({'X': 1, ' ':0})
val.DOMAIN = val.DOMAIN.replace({9 : np.NaN, '9': np.NaN})
val.SES = val.SES.replace({9 : np.NaN, 4: 3}) # please read documentation, the decision was made to replace every 4 to 3 so that technically 3 will include 
# all lowest SES
val.SES = val.SES.replace({3: 1, 1: 3}) # now we replace all 3 with 1 and all 1 with 3 so that we can decode it ordinally
val.GENDER = val.GENDER.replace({'A': 'U', 'C': 'U', ' ': np.NaN, 'J' : 'U'})
#df.MDMAUD_A = df.MDMAUD_A.replace({'X': np.NaN})
val.MDMAUD_F = val.MDMAUD_F.replace({'X': np.NaN})

for item in yesno_variables:
    val[item] = val[item].replace({' ': 0, 'N':0, 'Y': 1}) # these data values are a bit ambiguous since some of them do not have real N so we assume
    #that empty cells represent negative observations


recency_val = val.columns[376:442].to_list()[0::3]
frequency_val = val.columns[376:442].to_list()[1::3]
amount_val = val.columns[376:442].to_list()[2::3]

In [158]:
too_many_nas = [] # columns where we still have too many nans

for item in df:
    if df[item].isna().sum() > len(df)*0.4:
      too_many_nas.append(str(item))

df = df.drop(too_many_nas, axis = 1)
df = df.drop('Index', axis = 1)

In [161]:
too_many_nas_val = [] # columns where we still have too many nans

for item in val:
    if val[item].isna().sum() > len(df)*0.4:
      too_many_nas_val.append(str(item))

if too_many_nas_val != too_many_nas:
  too_many_nas_val = too_many_nas
else:
    pass


val = val.drop(too_many_nas_val, axis = 1)
val = val.drop('Index', axis = 1)

In [162]:
list(np.setdiff1d(df.columns,val.columns))

['TARGET_B', 'TARGET_D']

In [163]:
[df.DOMAIN.unique()] + [df.MDMAUD_A.unique()]

[array(['T', 'S', 'R', 'U', 'C', nan], dtype=object),
 array(['X', 'C', 'M', 'L', 'T'], dtype=object)]

In [164]:
too_many_nas_val == too_many_nas 

True

In [165]:
# now we need to drop 
recency_dropped = list(set.intersection(set(too_many_nas), set(recency)))
frequency_dropped = list(set.intersection(set(too_many_nas), set(frequency)))
amount_dropped = list(set.intersection(set(too_many_nas), set(amount)))

recency = list(np.setdiff1d(recency,recency_dropped))
frequency = list(np.setdiff1d(frequency,frequency_dropped))
amount = list(np.setdiff1d(amount,amount_dropped))

freq = [[1.0,2.0,3.0,4.0]] *len(frequency)
am = [['A', 'B', 'C', 'D', 'E', 'F', 'G']] * len(amount)

In [166]:
# now we need to drop 
recency_dropped_val = list(set.intersection(set(too_many_nas_val), set(recency_val)))
frequency_dropped_val = list(set.intersection(set(too_many_nas_val), set(frequency_val)))
amount_dropped_val = list(set.intersection(set(too_many_nas_val), set(amount_val)))

recency_val = list(np.setdiff1d(recency_val,recency_dropped_val))
frequency_val = list(np.setdiff1d(frequency_val,frequency_dropped_val))
amount_val = list(np.setdiff1d(amount_val,amount_dropped_val))

freq = [[1.0,2.0,3.0,4.0]] *len(frequency)
am = [['A', 'B', 'C', 'D', 'E', 'F', 'G']] * len(amount)

In [167]:
df[amount[0]]

0          E
1          E
2        NaN
3        NaN
4          D
        ... 
95407    NaN
95408    NaN
95409      E
95410      F
95411      G
Name: RFA_10A, Length: 95412, dtype: object

**Step 3: One Hot Encoding**

In [169]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
#dropna
df = df.dropna()
val = val.dropna()

one_hot_encoding = ['DOMAIN', 'CHILD03','CHILD12','CHILD18', 'GENDER', 'MDMAUD_R'] #+ recency
categorical_encoding = ['MDMAUD_A', 'SES', 'INCOME'] + frequency + amount
categories = [[ 'X' ,'L', 'C', 'M', 'T'],
              ['1','2','3'],
              [1.,2.,3.,4.,5.,6.,7.]] + freq + am


ordinal_encoder = OrdinalEncoder(categories = categories)
one_hot_encoder = OneHotEncoder()

df_onehot = one_hot_encoder.fit_transform(df[one_hot_encoding])
df_ordinal = ordinal_encoder.fit_transform(df[categorical_encoding])

val_onehot = one_hot_encoder.fit_transform(val[one_hot_encoding])
val_ordinal = ordinal_encoder.fit_transform(val[categorical_encoding])


In [170]:
# TARGET VARIABLES
label_binary = df['TARGET_B'].to_numpy() # 1/0 donated or not
label_dollars = df['TARGET_D'].to_numpy() # how much donated


columns = np.setdiff1d(df.columns,one_hot_encoding)
columns = np.setdiff1d(df.columns,categorical_encoding)
columns = np.setdiff1d(df.columns,['TARGET_B','TARGET_D','CONTROLN'])

df = df[columns]

controln = val['CONTROLN']
columns = np.setdiff1d(val.columns,one_hot_encoding)
columns = np.setdiff1d(val.columns,categorical_encoding)
columns = np.setdiff1d(val.columns,['CONTROLN'])


val = val[columns]

In [171]:
(label_dollars>0).sum()

757

In [172]:
df = np.concatenate((df,df_onehot.toarray(), df_ordinal), axis = 1)
val = np.concatenate((val,val_onehot.toarray(), val_ordinal), axis = 1)

In [176]:
df#.shape
#label_binary.shape

array([[7, 11, 127.0, ..., 3.0, 4.0, 4.0],
       [2, 3, 201.0, ..., 4.0, 4.0, 4.0],
       [5, 6, 127.0, ..., 6.0, 6.0, 6.0],
       ...,
       [5, 6, 211.0, ..., 4.0, 4.0, 4.0],
       [6, 3, 13.0, ..., 5.0, 5.0, 5.0],
       [7, 4, 355.0, ..., 6.0, 6.0, 6.0]], dtype=object)

In [179]:
controln

2        155244
7         80803
10       182234
11       156420
20       157437
          ...  
96333    132493
96339     68028
96343     21610
96346      1040
96350     59970
Name: CONTROLN, Length: 15776, dtype: int64

In [180]:
# FINAL RESULT SHOULD BE df and two labels
import pickle
data = {}
data['X'] = val
#data['y'] = label_binary
data['CONTROLN'] = controln
pickle_path = '/content/drive/MyDrive/ML_ECO/validation_set.pkl'
with open(pickle_path, "wb") as f:
    pickle.dump(data, f)