# Data Mining Project
- Filipe Coelho - id: m20200580
- Ivan Kisialiou - id: m20200998
- Jose Quintas - id: m20200673

## <br/>      

## Phase 1: Data Pre-Processing (Other columns)

## Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
import seaborn as sns

### Config some libraries

In [None]:
# Notebook display limits: show up to 500 columns and at most 50 rows.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 50)

## <br/>
## <br/>
## Data Read

In [None]:
# Load the donors dataset, using the first CSV column as the row index.
df = pd.read_csv('../Data/donors.csv', index_col=0)
# Preview the first rows.
df.head()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

## <br/>
## <br/>
## Data Selection

 ## Donation history

- RAMNTALL  -->  Dollar amount of lifetime gifts to date
- NGIFTALL  -->  Number of lifetime gifts to date
- AVGGIFT  -->   Average dollar amount of gifts to date
- NUMPROM  -->   Lifetime number of promotions received to date
- NUMPRM12 -->   Number of promotions received in the last 12 months
- CARDPROM -->   Lifetime number of card promotions received to date
- CARDPM12 -->   Number of card promotions received in the last 12 months
- CARDGIFT -->   Number of lifetime gifts to card promotions to date
- MINRAMNT -->   Dollar amount of smallest gift to date
- MAXRAMNT -->   Dollar amount of largest gift to date


### Data to generate/keep
- AVGGIFT - Average donated amount
- NGIFTALL / NUMPROM - Success percentage

In [None]:
# Donation-history columns carried over into the engineered dataframe.
selected_features = [
    # gift totals
    'RAMNTALL', 'NGIFTALL', 'AVGGIFT',
    # promotion counts
    'NUMPROM', 'NUMPRM12', 'CARDPROM', 'CARDPM12',
    # remaining gift-level columns
    'CARDGIFT', 'MINRAMNT', 'MAXRAMNT', 'LASTGIFT', 'TIMELAG',
]

## <br/>
## <br/>

### Checking dtypes and fixing them where necessary

In [None]:
# Inspect the dtype pandas inferred for each selected column.
df[selected_features].dtypes

In [None]:
#fix dtypes
# Store RAMNTALL (lifetime gift amount) as a 64-bit integer.
# NOTE(review): astype('int64') drops any fractional part — confirm this is intended.
df['RAMNTALL'] = df['RAMNTALL'].astype('int64')

In [None]:
# Preview the selected columns after the dtype fix.
df[selected_features].head()

## <br/>
## <br/>
## Data Engineering

In [None]:
# Independent copy of the selected columns; all engineered features are added to it,
# leaving `df` available as the untouched source.
modified_df = df[selected_features].copy()

### Applying Log10 on filtered features

In [None]:
def log10_cheat(n):
    """Return log10 of ``n``, with -1 standing in for values below 1.

    Parameters
    ----------
    n : number
        Scalar value to transform.

    Returns
    -------
    number
        ``np.log10(n)`` when ``n >= 1``; ``-1`` when ``n < 1`` (keeps zeros
        and negatives finite instead of producing -inf/NaN); ``np.nan`` when
        ``n`` is NaN, so missing values stay missing.
    """
    if n < 1:
        return -1
    if n >= 1:
        return np.log10(n)
    # Both comparisons are False for NaN: propagate it explicitly instead of
    # falling off the end of the function and returning None.
    return np.nan

In [None]:
for column in selected_features:
    # Replace the copied column with its log10 transform (values < 1 become -1).
    modified_df[column] = modified_df[column].apply(log10_cheat)

    #Plotting Before + After
    fig, ax = plt.subplots(2, 1, figsize=(10, 10))
    fig.suptitle('Before and after applying log10')
    # Pass the series as `x=` explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn releases.
    sns.boxplot(x=df[column], ax=ax[0], color='red')
    sns.boxplot(x=modified_df[column], ax=ax[1], color='blue')

    # plt.show() renders on every backend; Figure.show() only emits a warning
    # when the backend is non-interactive (as in a notebook).
    plt.show()

## <br/>
## <br/>
### Generating Time Based columns

# </br>
#### Time between minimum and maximum donation

In [None]:
# Parse both date columns. `infer_datetime_format` is omitted: it is deprecated
# since pandas 2.0, where consistent format inference is the default.
MINRDATE_datetime = pd.to_datetime(df['MINRDATE'])
MAXRDATE_datetime = pd.to_datetime(df['MAXRDATE'])

# Signed gap in whole days, computed as MINRDATE - MAXRDATE.
modified_df['minmax_time_delta'] = (MINRDATE_datetime - MAXRDATE_datetime).dt.days

# </br>
#### Time between last donation and the closest day to present

In [None]:
# Reference "present" date for the elapsed-time features. The deprecated
# `infer_datetime_format` argument is dropped; parsing an ISO date needs no hint.
last_datetime = pd.to_datetime('2017-06-01')

In [None]:
# Whole days between LASTDATE and the reference date. The subtraction is done
# on the whole Series; the deprecated `infer_datetime_format` argument is dropped.
last_dates = pd.to_datetime(df['LASTDATE'])
delta = (last_datetime - last_dates).dt.days

# log10 of the elapsed days (values below 1 map to -1).
modified_df['last_gift'] = delta.apply(log10_cheat)

# </br>
#### Customer Loyalty to PVA

In [None]:
# Whole days between the FISTDATE column and the reference date. The deprecated
# `infer_datetime_format` argument is dropped and the subtraction is vectorised.
first_dates = pd.to_datetime(df['FISTDATE'])
delta = (last_datetime - first_dates).dt.days

In [None]:
# log10 of the day count held in `delta` (values below 1 map to -1).
modified_df['customer_age'] = delta.map(log10_cheat)

## </br>
## </br>
### Generating Money based columns

In [None]:
def log2_cheat(n):
    """Return log2 of ``n``, with -1 standing in for values below 1.

    Parameters
    ----------
    n : number
        Scalar value to transform.

    Returns
    -------
    number
        ``np.log2(n)`` when ``n >= 1``; ``-1`` when ``n < 1`` (keeps zeros
        and negatives finite instead of producing -inf/NaN); ``np.nan`` when
        ``n`` is NaN, so missing values stay missing.
    """
    if n < 1:
        return -1
    if n >= 1:
        return np.log2(n)
    # Both comparisons are False for NaN: propagate it explicitly instead of
    # falling off the end of the function and returning None.
    return np.nan

In [None]:
# Spread between the largest and the smallest gift, in dollars.
# The raw `df` columns are used on purpose: by this point the MAXRAMNT/MINRAMNT
# columns of `modified_df` already hold log10-transformed values, so subtracting
# those would give a difference of logarithms rather than a dollar amount.
dollar_spread = df['MAXRAMNT'] - df['MINRAMNT']
# log2 of the spread (spreads below 1 map to -1).
modified_df['maxmin_dollar_diff'] = dollar_spread.apply(log2_cheat)

## </br>
## </br>

### Success Percentage (number of donations vs. number of promotions received)

In [None]:
# NGIFTALL / NUMPROM - Success percentage
def promotions_conversion_rate(row):
    """Return NGIFTALL divided by NUMPROM for one row.

    ``row`` must support ``row['NGIFTALL']`` and ``row['NUMPROM']``.
    When NUMPROM equals 0 the result is 0, which avoids a division by zero.
    """
    promotions = row['NUMPROM']
    return row['NGIFTALL'] / promotions if promotions != 0 else 0


# The row-wise apply only receives the two columns the function reads; building
# a row Series out of every column of `df` for each donor is needlessly slow.
modified_df['SUCCESS_PCT'] = df[['NGIFTALL', 'NUMPROM']].apply(promotions_conversion_rate, axis=1)
modified_df['SUCCESS_PCT'].head()

## </br>
## </br>

### Percentage Time Lapsed (column containing the percentage of time spent in a given category)

In [None]:
import re

def get_percentage_as_category(source_dataframe, target_df, category):
    """Add a ``PCT_TIME_LAPSED_<category>`` column to ``target_df``.

    For every row, counts the ``RFA_<n>`` columns (n = one or two digits) of
    ``source_dataframe`` whose value starts with ``category`` and divides that
    count by the row's ``NUMPROM``.

    Parameters
    ----------
    source_dataframe : pandas.DataFrame
        Frame holding the ``RFA_<n>`` columns and ``NUMPROM``.
    target_df : pandas.DataFrame
        Frame that receives the new column; it is modified in place.
    category : str
        Single character compared with the first character of each RFA value.

    Returns
    -------
    pandas.DataFrame
        ``target_df``, with the new column added.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    re_expression = re.compile(r'^RFA_\d{1,2}$')
    rfa_columns = [column for column in source_dataframe.columns if re_expression.match(column)]

    def starts_with_category(val):
        # Non-string cells (e.g. NaN) and empty strings simply do not match,
        # instead of raising as `val[0]` would.
        return int(isinstance(val, str) and val[:1] == category)

    # Column-wise Series.map instead of DataFrame.applymap (deprecated in pandas 2.1).
    matches = source_dataframe[rfa_columns].apply(lambda col: col.map(starts_with_category))

    # Divide by the NUMPROM of the frame that was passed in, not the global `df`.
    target_df['PCT_TIME_LAPSED_%s' % category] = matches.sum(axis=1) / source_dataframe['NUMPROM']

    return target_df

In [None]:
# Single-character codes that are matched against the first character of the RFA_* values.
categories = ['F','N','A','L','I','S']

In [None]:
# Add one PCT_TIME_LAPSED_<category> column to modified_df per code.
for category in categories:
    modified_df = get_percentage_as_category(df, modified_df, category)
    
modified_df.head()

## </br>
## </br>

## Creating columns from *RAMNT* and *RFA* features

In [None]:
# Columns named RFA_<one or two digits>.
# Raw string so `\d` reaches the regex engine without relying on an invalid
# string escape (which Python reports as a warning).
re_expression = re.compile(r'^RFA_\d{1,2}$')
rfa_columns = [column for column in df.columns.values if re_expression.match(column)]

In [None]:
# Columns named RAMNT_<one or two digits>.
# Raw string so `\d` reaches the regex engine without relying on an invalid
# string escape (which Python reports as a warning).
re_expression = re.compile(r'^RAMNT_\d{1,2}$')
ramnt_columns = [column for column in df.columns.values if re_expression.match(column)]

## </br>

### Variance on donation value

In [None]:
#Calculate variance
# Per-row variance over the RAMNT_* columns; rows where it is undefined (NaN) get 0.
# The filled result is assigned back instead of calling `fillna(..., inplace=True)`
# on `modified_df['GIFT_VAR']`, which may act on a temporary copy of the column.
modified_df['GIFT_VAR'] = df[ramnt_columns].var(axis=1).fillna(0)
#Apply log10
modified_df['GIFT_VAR'] = modified_df['GIFT_VAR'].apply(log10_cheat)

modified_df['GIFT_VAR'].head()

## </br>
## </br>

## Creating Wealth and County features

#### GEOCODE

In [None]:
temp_df = df[['GEOCODE', 'GEOCODE2']].copy()

#Re-map values
# A -> 1 ... D -> 4; blanks and any other value become NaN. `Series.map` plus
# assignment avoids the chained `replace(..., inplace=True)` (which may act on
# a temporary copy) and `np.NaN`, an alias removed in NumPy 2.0.
temp_df['GEOCODE2'] = temp_df['GEOCODE2'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4})
# Numeric codes are parsed as numbers; blanks (or anything unparsable) become NaN.
temp_df['GEOCODE'] = pd.to_numeric(temp_df['GEOCODE'], errors='coerce')

#Choosing maximum value between the 2 features
modified_df['county_size'] = temp_df.max(axis=1)
del temp_df

modified_df['county_size'].head()

#### WEALTH

In [None]:
# Non-null and missing counts for the WEALTH1 and WEALTH2 columns.
wealth1, wealth2 = df['WEALTH1'], df['WEALTH2']

print("WEALTH1 count:", wealth1.notna().sum())
print("WEALTH1 NaN:",wealth1.isna().sum())

print("\nWEALTH2 count:", wealth2.notna().sum())
print("WEALTH2 NaN:",wealth2.isna().sum())

In [None]:
# Observed (max, min) of each wealth column.
print("WEALTH1 scale:", df.WEALTH1.max(), df.WEALTH1.min())

# Label fixed: this line prints the WEALTH2 max/min, not a count.
print("\nWEALTH2 scale:", df.WEALTH2.max(), df.WEALTH2.min())

In [None]:
temp_df = df[['WEALTH1', 'WEALTH2']].copy()

#calculate nr rows with NaN after merge
# The row mean is NaN only when both columns are missing, so count() on it
# gives the number of rows that keep a value after the merge.
rows_with_value = temp_df.mean(axis=1).count()
pct_non_NaN = rows_with_value / len(temp_df)
print('Percentage of columns with value after merge: %1.2f%%' % (pct_non_NaN * 100))

#Merge by maximum value
merged_wealth = temp_df.max(axis=1)
modified_df['WEALTH'] = merged_wealth
del temp_df

modified_df['WEALTH'].head()

## </br>
## </br>

## Adding *INCOME*, *GENDER*, *AGE* & *HOMEOWNR*

In [None]:
# Working copy of the three columns, so the re-encoding below leaves `df` untouched.
temp_df = df[['GENDER','INCOME','HOMEOWNR']].copy()

#### HOMEOWNR

In [None]:
#Re-map
# 'U', blank and empty -> 0; 'H' -> 1.
# The result is assigned back rather than calling `replace(..., inplace=True)`
# on `temp_df.HOMEOWNR`, which may modify a temporary copy instead of `temp_df`.
temp_df['HOMEOWNR'] = temp_df['HOMEOWNR'].replace({'': 0, ' ': 0, 'U': 0, 'H': 1})

#### GENDER

In [None]:
#Re-map
# 'A' and 'C' are folded into 'J'; blanks become 'U'. Assigned back instead of
# a chained `replace(..., inplace=True)`, which may act on a temporary copy.
temp_df['GENDER'] = temp_df['GENDER'].replace({'A': 'J', 'C': 'J', ' ': 'U'})

#Creating binary columns and appending
# dtype=int keeps the indicators as 0/1 on every pandas version (newer
# releases default to booleans, which would be exported as True/False).
gender_dummies = pd.get_dummies(temp_df['GENDER'], prefix='GENDER', dtype=int)
temp_df = temp_df.join(gender_dummies)

#Cleaning
del gender_dummies
temp_df = temp_df.drop(columns='GENDER')

In [None]:
# Preview the re-encoded columns.
temp_df.head()

In [None]:
#Concatenate new columns
# axis=1 appends the temp_df columns side by side, aligned on the row index.
modified_df = pd.concat([modified_df, temp_df], axis=1)
del temp_df

modified_df.head()

#### AGE

In [None]:
# Parse the date of birth and keep its year (NaN where DOB is missing).
df['DOB'] = pd.to_datetime(df['DOB'])
df['DOB_year'] = df['DOB'].dt.year

# Age relative to 2016, then log2 (ages below 1 map to -1).
# The former per-element `!= None` checks never triggered: missing dates are
# NaT/NaN rather than None, so they have been removed.
# NOTE(review): the reference year 2016 differs from `last_datetime`
# (2017-06-01) used for the other time features — confirm which is intended.
modified_df['age'] = (2016 - df['DOB_year']).apply(log2_cheat)

modified_df['age'].head()

## </br>
## </br>

## Adding children features

In [None]:
# The CHILD* columns come first and NUMCHLD last; later cells rely on that
# order when they slice with [:-1].
children_columns = ['CHILD03', 'CHILD07', 'CHILD12', 'CHILD18'] + ['NUMCHLD']

In [None]:
# Independent copy of the children-related columns for re-encoding.
children_df = df[children_columns].copy()
children_df.head()

In [None]:
#Re-map values
# Blank/empty cells and NaN count as 0; the codes 'M', 'F' and 'B' count as 1.
children_df = (
    children_df
    .replace([' ', ''], 0)
    .replace(['M', 'F', 'B'], 1)
    .fillna(0)
)

#Changing DType
children_df = children_df.astype('int64')
children_df.head()

In [None]:
#Creating column with SUM of other cols
children_df['SUM_ageGap_columns'] = children_df[children_columns[:-1]].sum(axis=1)
children_df.drop(columns=children_columns[:-1], inplace=True)

children_df.head()

In [None]:
# Row-wise maximum of the columns left in children_df
# (NUMCHLD and the summed CHILD03/07/12/18 indicators).
modified_df['CHILDREN'] = children_df.max(axis=1)
del children_df

modified_df.head()

## </br>
## </br>

## Adding Neighborhood Socio Economic Status

In [None]:
# Most frequent DOMAIN value, split into its first character and its second
# character converted to int; both are used as fill-ins for blank DOMAIN cells.
domain_mode = df['DOMAIN'].mode()[0]
domain_mode_0 = domain_mode[0]
domain_mode_1 = int(domain_mode[1])

domain_mode_0, domain_mode_1

In [None]:
# Second DOMAIN character as an int; blank cells fall back to the modal value.
socio_economic_status = df['DOMAIN'].map(
    lambda code: domain_mode_1 if code == ' ' else int(code[1])
)
socio_economic_status.head()

In [None]:
# First DOMAIN character per row; blank cells fall back to the modal value.
rurality = df['DOMAIN'].map(lambda code: domain_mode_0 if code == ' ' else code[0])

#Re-map
# Letter -> ordinal code: U=0, C=1, S=2, T=3, R=4.
mapping = dict(zip('UCSTR', range(5)))
rurality = rurality.replace(mapping)
rurality.head()

In [None]:
# Both derived series should have the same length before being attached.
socio_economic_status.shape, rurality.shape

In [None]:
# Attach the two DOMAIN-derived series (aligned on the row index).
modified_df['ses'] = socio_economic_status
modified_df['rurality'] = rurality

modified_df[['rurality','ses']].head()

## </br>
## </br>

## Adding some extra columns

In [None]:
# 1 where RECINHSE equals 'X', otherwise 0.
modified_df['RECINHSE'] = (df['RECINHSE'] == 'X').astype('int64')

In [None]:
# 1 where RECP3 equals 'X', otherwise 0.
modified_df['RECP3'] = (df['RECP3'] == 'X').astype('int64')

In [None]:
# 1 where RECPGVG equals 'X', otherwise 0.
modified_df['RECPGVG'] = (df['RECPGVG'] == 'X').astype('int64')

In [None]:
# 1 where RECSWEEP equals 'X', otherwise 0.
modified_df['RECSWEEP'] = (df['RECSWEEP'] == 'X').astype('int64')

In [None]:
# 'X' -> 1 / anything else -> 0, then passed through log2_cheat, so the stored
# values are 0.0 (flag 1) and -1 (flag 0).
# NOTE(review): a log transform of a 0/1 flag is unusual — if HIT is actually a
# numeric count, the flag step is probably unintended; confirm against the data dictionary.
modified_df['HIT'] = (df['HIT'] == 'X').astype('int64').apply(log2_cheat)

In [None]:
# 1 where MAJOR equals 'X', otherwise 0.
modified_df['MAJOR'] = (df['MAJOR'] == 'X').astype('int64')

## </br>
## </br>

## Removing NaN
Because the dataframe is large, every row containing a NaN cell is dropped; this is preferable to imputing values.

In [None]:
# (rows, columns) before rows with missing values are removed.
modified_df.shape

In [None]:
# Number of missing values per column.
modified_df.isna().sum()

In [None]:
# Drop, in place, every row that has at least one missing value, then show the new shape.
modified_df.dropna(inplace=True)
modified_df.shape

## </br>
## </br>

## Checking correlation between created/selected features

In [None]:
# Pairwise correlations; entries with |r| <= 0.45 are blanked out (NaN) so the
# heatmap only annotates the stronger relationships.
corr = modified_df.corr()
corr = corr.where(corr.abs() > 0.45)

plt.figure(figsize=(30, 30))
sns.heatmap(corr, annot=True)
plt.show()

## </br>
## </br>

## Save final Dataframe into CSV

In [None]:
# Final look at the engineered dataframe before exporting it.
modified_df.head()

In [None]:
# Write the engineered features (row index included) to the exports folder.
modified_df.to_csv('../Exports/CSV/other_columns.csv')