In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train_set = '/content/drive/MyDrive/ML_ECO/cup98lrn.csv' # zip
valid_set = '/content/drive/MyDrive/ML_ECO/cup98val.csv'

In [181]:
df = pd.read_csv(train_set, low_memory = False)  #compression='zip'

In [182]:
df.columns[362:386] # just checking , should be ADATEs and RFA_2

Index(['ADATE_2', 'ADATE_3', 'ADATE_4', 'ADATE_5', 'ADATE_6', 'ADATE_7',
       'ADATE_8', 'ADATE_9', 'ADATE_10', 'ADATE_11', 'ADATE_12', 'ADATE_13',
       'ADATE_14', 'ADATE_15', 'ADATE_16', 'ADATE_17', 'ADATE_18', 'ADATE_19',
       'ADATE_20', 'ADATE_21', 'ADATE_22', 'ADATE_23', 'ADATE_24', 'RFA_2'],
      dtype='object')

In [183]:
df.GENDER.unique()

array(['F', 'M', ' ', 'C', 'U', 'J', 'A'], dtype=object)

## Preprosessing
Our preprosessing will include 4 steps:
1. We will delete the columns we will not use (please see documentation to see which columns we drop)
2. Re-code /transform complex variables to easier ones
3. Encode ordinal variables using ordinal encoder and create one-hot-encoding where ordinal encoding is not suitable
4. Delete observations with missing data or additionally drop features that contain more than 50% of missing data

After each step we will check whether our target variable (TARGET_B) still has enough positive examples (value = 1)




**STEP 1: Drop Features**

In dropping features we follow certain rules, namely:
1. We drop every feature that is a date
2. We drop features that refer to the sources of information 
3. We drop features that are already covered implicitly by the other features that we will encode 
4. We drop features that have a very complex structure and would require a lot of dummy variables (like STATE)
5. We drop features that highly depend on features that we previously decided to drop (like WEALTH2)



In [184]:
# some important stuff 
yesno_variables = df.columns[56:74]
rfas = df.columns[386:408] # RFA variables

In [185]:
drop1 = df.columns[1:15].to_list() # need to drop these
drop2 = df.columns[16]#.to_list() # CLUSTER   
drop3 = df.columns[18]#.to_list() # AGEFLAG
drop4 = df.columns[44:51].to_list() # connection to the war and war veterans , might actually be relevant even though very specific data
drop11 = df.columns[51:54].to_list() # SOLICIT variables + MAJOR
drop5 = df.columns[54:56].to_list() 
drop6 = df.columns[362:386].to_list() # ADATES and RFA_XX
drop7 = df.columns[413:457].to_list() # Dates of different donations received
drop8 = df.columns[473]
drop9 = df.columns[480:482].to_list() # CLUSTER and GEOCODE
drop10 = df.columns[43] # drop the source from where the data is collected
drop12 = df.columns[74] #  LIFE STYLE DATA SOURCE
drop13 = df.columns[461] # Date associated with the smallest gift to date
drop14 = df.columns[463] # Date associated with the largest gift to date
drop15 = df.columns[465:468].to_list() # Dates of different gifts received
drop16 = df.columns[409] # MAXADATE
drop17 = df.columns[24] # Number of Children, very ambiguous since many empty cells, covered by other variables implicitly
drop18 = df.columns[196:199] # these are very weird codes, I do not know how to use them for the prediction
drop = drop1+drop5+drop6+drop7+drop9+drop11+drop15+drop4
df = df.drop(drop, axis = 1)
df = df.drop(drop2,axis = 1)
df = df.drop(drop3, axis = 1)
df = df.drop(drop8, axis = 1)
df = df.drop(drop10, axis = 1)
df = df.drop(drop12, axis = 1)
df = df.drop(drop13, axis = 1)
df = df.drop(drop14, axis = 1)
df = df.drop(drop17, axis = 1)

**STEP 2 : Encode**

Now we start encoding things
1. We first rename the first column to Index
2. We then extract the first letter from Domain to then encode it to one-hot encoding
3. Afterwards we change the CHILDXX variables , which will be also encoded as one-hot 
4.
5.

In [186]:
df = df.rename(columns = {'Unnamed: 0' : 'Index'}) # rename first column to the Index

In [187]:
df.DOMAIN.unique()

array(['T2', 'S1', 'R2', 'S2', 'T1', 'R3', 'U1', 'C2', 'C1', 'U3', ' ',
       'R1', 'U2', 'C3', 'U4', 'S3', 'T3'], dtype=object)

In [188]:
df.DOMAIN = df.DOMAIN.replace({' ': '99'}) # replace empty cells to 99 and then every 9 will be replaced to NaN

df['SES'] = 0 # NEW variable - socioeconomic status - second byte from DOMAIN
index = 0
for x in df.DOMAIN:
    df.SES[index] = int(df.DOMAIN[index][1])
    index += 1


# extracting the first 
index = 0
for x in df.DOMAIN:
    df.DOMAIN[index] = df.DOMAIN[index][0]
    index += 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [189]:
 df[rfas] = df[rfas].replace({' ': '999'})
 df['RFA_23'] = df['RFA_23'].replace({'1D': '91D','3E': '93E', '2D' : '92D', '1E': '91E', '1C': '91C', '1G': '91G', '2E': '92E',
                                     '1F': '91F','4E': '94E', '3F': '93F' , '2F' : '92F'})
  
for item in rfas:
    df[item + 'R'] = [ letter[0] for letter in df[item].to_list() ] # every first letter goes to Recency
    df[item + 'F'] = [ int(letter[1]) for letter in df[item].to_list() ] # every second letter (actually integer) goes to Frequency
    df[item + 'A'] = [ letter[2] for letter in df[item].to_list() ] # every third letter goes to Amount
  
for item in rfas:
    df[item + 'R'] = df[item + 'R'].replace({'9': np.NaN, 'U': np.NaN, 'P': np.NaN})
    df[item + 'F'] = df[item + 'F'].replace({9: np.NaN})
    df[item + 'A'] = df[item + 'A'].replace({'9': np.NaN, 'U': np.NaN, 'P': np.NaN})
 

In [190]:
df.HOMEOWNR = df.HOMEOWNR.replace({'H': 1, 'U': 0, ' ': np.NaN})
df['CHILD03'] = df['CHILD03'].replace({' ': 'N'}) # we will assume that no indication of 
# children means no children as a category
df['CHILD07'] = df['CHILD07'].replace({' ': 'N'})	
df['CHILD12'] = df['CHILD12'].replace({' ': 'N'})
df['CHILD18'] = df['CHILD18'].replace({' ': 'N'})
#df.MAJOR = df.MAJOR.replace({'X': 1, ' ': 0})
df.PEPSTRFL = df.PEPSTRFL.replace({'X': 1, ' ':0})
df.DOMAIN = df.DOMAIN.replace({9 : np.NaN, '9': np.NaN})
df.SES = df.SES.replace({9 : np.NaN, 4: 3}) # please read documentation, the decision was made to replace every 4 to 3 so that technically 3 will include 
# all lowest SES
df.SES = df.SES.replace({3: 1, 1: 3}) # now we replace all 3 with 1 and all 1 with 3 so that we can decode it ordinally
df.GENDER = df.GENDER.replace({'A': 'U', 'C': 'U', ' ': np.NaN, 'J' : 'U'})
#df.MDMAUD_A = df.MDMAUD_A.replace({'X': np.NaN})
df.MDMAUD_F = df.MDMAUD_F.replace({'X': np.NaN})

for item in yesno_variables:
    df[item] = df[item].replace({' ': 0, 'N':0, 'Y': 1}) # these data values are a bit ambiguous since some of them do not have real N so we assume
    #that empty cells represent negative observations


recency = df.columns[376:442].to_list()[0::3]
frequency = df.columns[376:442].to_list()[1::3]
amount = df.columns[376:442].to_list()[2::3]


In [191]:
too_many_nas = [] # columns where we still have too many nans

for item in df:
    if df[item].isna().sum() > len(df)*0.4:
      too_many_nas.append(str(item))

df = df.drop(too_many_nas, axis = 1)
#df = df.drop('Index', axis = 1)

In [192]:
df

Unnamed: 0,Index,DOMAIN,AGE,HOMEOWNR,CHILD03,CHILD07,CHILD12,CHILD18,INCOME,GENDER,HIT,COLLECT1,VETERANS,BIBLE,CATLG,HOMEE,PETS,CDPLAY,STEREO,PCOWNERS,PHOTO,CRAFTS,FISHER,GARDENIN,BOATS,WALKER,KIDSTUFF,CARDS,PLATES,PEPSTRFL,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,...,RFA_7F,RFA_7A,RFA_8R,RFA_8F,RFA_8A,RFA_9R,RFA_9F,RFA_9A,RFA_10R,RFA_10F,RFA_10A,RFA_11R,RFA_11F,RFA_11A,RFA_12R,RFA_12F,RFA_12A,RFA_14R,RFA_14F,RFA_14A,RFA_16R,RFA_16F,RFA_16A,RFA_17R,RFA_17F,RFA_17A,RFA_18R,RFA_18F,RFA_18A,RFA_19R,RFA_19F,RFA_19A,RFA_21F,RFA_21A,RFA_22R,RFA_22F,RFA_22A,RFA_24R,RFA_24F,RFA_24A
0,1,T,60.0,,N,N,N,N,,F,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,992,264,332,0,35,65,47,53,92,1,...,4.0,E,S,4.0,E,S,4.0,E,S,4.0,E,S,4.0,E,S,4.0,E,S,4.0,E,S,4.0,E,S,4.0,E,S,4.0,E,S,4.0,E,4.0,E,S,4.0,E,S,4.0,E
1,2,S,46.0,1.0,N,N,N,M,6.0,M,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3611,940,998,99,0,0,50,50,67,0,...,1.0,E,A,1.0,E,A,1.0,E,A,1.0,E,A,1.0,E,A,1.0,E,,,,L,1.0,E,,,,,,,N,1.0,E,1.0,E,N,1.0,E,F,1.0,E
2,3,R,,0.0,N,N,N,N,3.0,M,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,7001,2040,2669,0,2,98,49,51,96,2,...,4.0,F,S,4.0,F,S,4.0,F,,,,S,4.0,F,S,4.0,F,S,4.0,F,S,4.0,F,,,,S,4.0,D,S,4.0,D,,,S,4.0,D,S,3.0,D
3,4,R,70.0,0.0,N,N,N,N,1.0,F,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,640,160,219,0,8,92,54,46,61,0,...,4.0,E,S,4.0,E,S,4.0,E,,,,S,4.0,E,S,4.0,E,S,4.0,E,S,4.0,E,S,2.0,D,S,2.0,D,A,1.0,D,1.0,D,A,1.0,D,,,
4,5,S,78.0,1.0,N,N,N,N,3.0,F,60,0,0,1,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,2520,627,761,99,0,0,46,54,2,98,...,2.0,D,A,1.0,E,A,1.0,E,L,1.0,D,A,1.0,E,A,1.0,E,L,3.0,D,L,3.0,D,A,2.0,D,A,2.0,D,A,3.0,D,3.0,D,I,4.0,E,A,3.0,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,95408,C,,,N,N,N,N,,M,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,27380,7252,10037,99,0,0,50,50,78,10,...,1.0,G,,1.0,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
95408,95409,C,48.0,1.0,N,N,N,M,7.0,M,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1254,322,361,96,0,4,51,49,91,3,...,1.0,F,,1.0,F,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
95409,95410,C,60.0,,N,N,N,N,,M,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,552,131,205,99,0,0,53,47,82,14,...,,,A,2.0,E,N,3.0,E,N,3.0,E,N,3.0,E,N,3.0,E,N,3.0,E,F,1.0,D,,,,F,1.0,D,,,,1.0,D,,1.0,D,,,
95410,95411,C,58.0,1.0,N,N,N,N,7.0,F,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1746,432,508,99,0,0,47,53,92,1,...,4.0,F,S,4.0,F,S,4.0,F,S,4.0,F,S,4.0,F,S,4.0,F,S,4.0,F,S,3.0,F,S,2.0,F,S,2.0,F,A,1.0,F,1.0,F,A,1.0,F,S,3.0,F


In [193]:
[df.DOMAIN.unique()] + [df.MDMAUD_A.unique()]

[array(['T', 'S', 'R', 'U', 'C', nan], dtype=object),
 array(['X', 'C', 'M', 'L', 'T'], dtype=object)]

In [194]:
too_many_nas

['WEALTH1',
 'MBCRAFT',
 'MBGARDEN',
 'MBBOOKS',
 'MBCOLECT',
 'MAGFAML',
 'MAGFEM',
 'MAGMALE',
 'PUBGARDN',
 'PUBCULIN',
 'PUBHLTH',
 'PUBDOITY',
 'PUBNEWFN',
 'PUBPHOTO',
 'PUBOPP',
 'MDMAUD_F',
 'RFA_13R',
 'RFA_13F',
 'RFA_13A',
 'RFA_15R',
 'RFA_15F',
 'RFA_15A',
 'RFA_20R',
 'RFA_20F',
 'RFA_20A',
 'RFA_21R',
 'RFA_23R',
 'RFA_23F',
 'RFA_23A']

In [195]:
# now we need to drop 
recency_dropped = list(set.intersection(set(too_many_nas), set(recency)))
frequency_dropped = list(set.intersection(set(too_many_nas), set(frequency)))
amount_dropped = list(set.intersection(set(too_many_nas), set(amount)))

recency = list(np.setdiff1d(recency,recency_dropped))
frequency = list(np.setdiff1d(frequency,frequency_dropped))
amount = list(np.setdiff1d(amount,amount_dropped))

freq = [[1.0,2.0,3.0,4.0]] *len(frequency)
am = [['A', 'B', 'C', 'D', 'E', 'F', 'G']] * len(amount)

In [196]:
freq

[[1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0],
 [1.0, 2.0, 3.0, 4.0]]

In [197]:
df[amount[0]]

0          E
1          E
2        NaN
3        NaN
4          D
        ... 
95407    NaN
95408    NaN
95409      E
95410      F
95411      G
Name: RFA_10A, Length: 95412, dtype: object

**Step 3: One Hot Encoding**

In [198]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
#dropna
df = df.dropna()

one_hot_encoding = ['DOMAIN', 'CHILD03','CHILD12','CHILD18', 'GENDER', 'MDMAUD_R'] + recency
categorical_encoding = ['MDMAUD_A', 'SES', 'INCOME'] + frequency + amount
categories = [[ 'X' ,'L', 'C', 'M', 'T'],
              ['1','2','3'],
              [1.,2.,3.,4.,5.,6.,7.]] + freq + am


ordinal_encoder = OrdinalEncoder(categories = categories)
one_hot_encoder = OneHotEncoder()

df_onehot = one_hot_encoder.fit_transform(df[one_hot_encoding])
df_ordinal = ordinal_encoder.fit_transform(df[categorical_encoding])

In [199]:
# TARGET VARIABLES
label_binary = df['TARGET_B'].to_numpy() # 1/0 donated or not
label_dollars = df['TARGET_D'].to_numpy() # how much donated


columns = np.setdiff1d(df.columns,one_hot_encoding)
columns = np.setdiff1d(df.columns,categorical_encoding)
columns = np.setdiff1d(df.columns,['TARGET_B','TARGET_D'])

df = df[columns]

In [200]:
(label_dollars>0).sum()

757

In [201]:
df = np.concatenate((df,df_onehot.toarray(), df_ordinal), axis = 1)

In [205]:
df.shape
#label_binary.shape

(15578, 571)

In [None]:
# FINAL RESULT SHOULD BE df and two labels
