In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re as re
from datetime import date
from pandas_profiling import ProfileReport

In [None]:
# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# 0. Loading our Data

In [None]:
# Read the donors CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../Data/donors.csv')

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

# 1. Analysing Data

## Donor classification

- RFA_2R, RFA_2F, RFA_2A  -->  the Recency, Frequency and Amount components of the donor's RFA (RECENCY/FREQUENCY/AMOUNT) status

In [None]:
# The three RFA component columns used to classify donors.
columns_donator_classification = ['RFA_2R', 'RFA_2F', 'RFA_2A']

In [None]:
# Preview the RFA classification columns.
df[columns_donator_classification].head()

In [None]:
# Count missing values in each RFA classification column.
df[columns_donator_classification].isna().sum()

In [None]:
# Store the RFA columns with object dtype so pd.get_dummies (used further down)
# encodes all of them, including any that were read as numbers.
rfa_as_object = df[columns_donator_classification].astype('object')
df[columns_donator_classification] = rfa_as_object

## Donation history

- RAMNTALL  -->  Dollar amount of lifetime gifts to date
- NGIFTALL  -->  Number of lifetime gifts to date
- CARDGIFT  -->  Number of lifetime gifts to card promotions to date
- MINRAMNT  -->  Dollar amount of smallest gift to date
- MINRDATE  -->  Date associated with the smallest gift to date
- MAXRAMNT  -->  Dollar amount of largest gift to date
- MAXRDATE  -->  Date associated with the largest gift to date
- LASTGIFT  -->  Dollar amount of most recent gift
- LASTDATE  -->  Date associated with the most recent gift
- FISTDATE  -->  Date of first gift
- NEXTDATE  -->  Date of second gift
- TIMELAG  -->   Number of months between first and second gift
- AVGGIFT  -->   Average dollar amount of gifts to date

In [None]:
# Columns describing each donor's giving history (see the list above for meanings).
columns_donation_history = [
    # lifetime totals
    'RAMNTALL', 'NGIFTALL', 'CARDGIFT',
    # smallest and largest gift (amount, date)
    'MINRAMNT', 'MINRDATE',
    'MAXRAMNT', 'MAXRDATE',
    # most recent gift (amount, date)
    'LASTGIFT', 'LASTDATE',
    # first and second gift, and the lag between them
    'FISTDATE', 'NEXTDATE', 'TIMELAG',
    # average gift
    'AVGGIFT',
]

In [None]:
# Preview the donation-history columns.
df[columns_donation_history].head()

In [None]:
# Count missing values in each donation-history column.
df[columns_donation_history].isna().sum()

In [None]:
# Discard rows that have no first-gift date, then re-check the remaining gaps.
df = df.dropna(subset=['FISTDATE'])
df[columns_donation_history].isna().sum()

In [None]:
# Missing TIMELAG (presumably donors without a second gift — confirm) is set to 0 months.
# The filled column is assigned back rather than calling fillna(inplace=True) on the
# selected column: with pandas Copy-on-Write that in-place call acts on a temporary
# object and leaves df unchanged.
df['TIMELAG'] = df['TIMELAG'].fillna(0)
df[columns_donation_history].isna().sum()

In [None]:
# Make sure datatype is correct: list the current dtype of every donation-history column.
df[columns_donation_history].dtypes

In [None]:
# Fix dtypes: amount/lag columns become integers, date columns become timestamps.
# The datetime dtype is written with its unit ('datetime64[ns]') because pandas >= 2.0
# rejects the unit-less 'datetime64' alias in astype().
# NOTE(review): if the amount columns hold fractional values (e.g. AVGGIFT), the
# int64 cast drops the fractional part — confirm that this truncation is intended.
donation_history_dtypes = {
    'RAMNTALL': 'int64',
    'MINRAMNT': 'int64',
    'MINRDATE': 'datetime64[ns]',
    'MAXRAMNT': 'int64',
    'MAXRDATE': 'datetime64[ns]',
    'LASTGIFT': 'int64',
    'LASTDATE': 'datetime64[ns]',
    'FISTDATE': 'datetime64[ns]',
    'NEXTDATE': 'datetime64[ns]',
    'TIMELAG': 'int64',
    'AVGGIFT': 'int64',
}

# Columns not named in the mapping (NGIFTALL, CARDGIFT) keep their current dtype.
df[columns_donation_history] = df[columns_donation_history].astype(donation_history_dtypes)

In [None]:
# Preview the donation-history columns after the dtype conversion.
df[columns_donation_history].head()

In [None]:
# Absolute number of days between the smallest and the largest donation.
new_column = 'TIMEDELTA_SMALLEST_BIGGEST_DONATION'

date_gap = (df['MINRDATE'] - df['MAXRDATE']).abs()
timelag_days = date_gap.dt.days.tolist()

df[new_column] = timelag_days
columns_donation_history.append(new_column)

In [None]:
# Absolute number of days between the first and the largest donation.
new_column = 'TIMEDELTA_FIRST_BIGGEST_DONATION'

date_gap = (df['MAXRDATE'] - df['FISTDATE']).abs()
timelag_days = date_gap.dt.days.tolist()

df[new_column] = timelag_days
columns_donation_history.append(new_column)

In [None]:
# Absolute number of days between the first and the most recent donation.
new_column = 'TIMEDELTA_FIRST_LAST_DONATION'

date_gap = (df['LASTDATE'] - df['FISTDATE']).abs()
timelag_days = date_gap.dt.days.tolist()

df[new_column] = timelag_days
columns_donation_history.append(new_column)

In [None]:
# Remove the raw date columns from the frame and from the history column list,
# keeping the remaining names in their original order.
columns_to_remove = ['MINRDATE', 'MAXRDATE', 'LASTDATE', 'FISTDATE', 'NEXTDATE']
df = df.drop(columns=columns_to_remove)

columns_donation_history = [
    kept for kept in columns_donation_history
    if kept not in columns_to_remove
]

In [None]:
# Average donation value per gift: lifetime amount divided by lifetime number of gifts.
new_column = 'AVG_GIFT_VAL'
df[new_column] = df['RAMNTALL'].div(df['NGIFTALL'])
columns_donation_history.append(new_column)

df[new_column].head()

In [None]:
# Preview the donation-history columns, including the newly derived ones.
df[columns_donation_history].head()

## Merging promotions by year
### Number of gifts per year

In [None]:
# RDATE_* columns grouped by promotion year.
# Each column belongs to exactly one year: RDATE_23/RDATE_24 were previously listed
# under both 2014 and 2015, which counted every 2014 gift twice.
# NOTE(review): the year boundaries follow the original grouping — confirm them
# against the data dictionary.
columns_promotions_2014 = ['RDATE_23', 'RDATE_24']

columns_promotions_2015 = [f'RDATE_{i}' for i in range(13, 23)]

columns_promotions_2016 = [f'RDATE_{i}' for i in range(3, 13)]

In [None]:
# Number of donations per year.
# A donation is counted for every non-missing RDATE_* value in the row, hence notna():
# isna() counted the promotions that did NOT receive a gift.
# NOTE(review): assumes a missing RDATE_* means "no gift for that promotion" — confirm.
nr_donations_2014 = df[columns_promotions_2014].notna().sum(axis=1)
nr_donations_2015 = df[columns_promotions_2015].notna().sum(axis=1)
nr_donations_2016 = df[columns_promotions_2016].notna().sum(axis=1)

# One column per year, in the order 2014, 2015, 2016, sharing df's index.
donations = pd.concat([nr_donations_2014, nr_donations_2015, nr_donations_2016], axis=1)

columns_nr_donations_perYear = ['TOT_DON_14', 'TOT_DON_15', 'TOT_DON_16']

In [None]:
# Store the yearly counts in df (the columns of `donations` are matched to the new
# names by position) and preview them.
df[columns_nr_donations_perYear] = donations
df[columns_nr_donations_perYear].head()

### Total gift amount per year

In [None]:
# RAMNT_* columns grouped by promotion year.
# Each column belongs to exactly one year: RAMNT_23/RAMNT_24 were previously listed
# under both 2014 and 2015, which added every 2014 amount to the 2015 total as well.
# NOTE(review): the year boundaries follow the original grouping — confirm them
# against the data dictionary.
columns_promotions_2014 = ['RAMNT_23', 'RAMNT_24']

columns_promotions_2015 = [f'RAMNT_{i}' for i in range(13, 23)]

columns_promotions_2016 = [f'RAMNT_{i}' for i in range(3, 13)]

In [None]:
# Total donated amount per year: row-wise sum of that year's RAMNT_* columns.
# (The nr_donations_* names are reused from the previous section; here they hold amounts.)
nr_donations_2014, nr_donations_2015, nr_donations_2016 = (
    df[year_columns].sum(axis=1)
    for year_columns in (
        columns_promotions_2014,
        columns_promotions_2015,
        columns_promotions_2016,
    )
)

# One column per year, in the order 2014, 2015, 2016, sharing df's index.
donations = pd.concat([nr_donations_2014, nr_donations_2015, nr_donations_2016], axis=1)

columns_total_donation_amount_perYear = ['TOT_DON_AMNT_14', 'TOT_DON_AMNT_15', 'TOT_DON_AMNT_16']

In [None]:
# Store the yearly amounts in df (the columns of `donations` are matched to the new
# names by position) and preview them.
df[columns_total_donation_amount_perYear] = donations
df[columns_total_donation_amount_perYear].head()

## Adding return rate per donor
- **CARDPROM**  -->   Lifetime number of card promotions received to
                    date. Card promotions are promotion type FS, GK,
                    TK, SK, NK, XK, UF, UU.

- **MAXADATE**  -->   Date of the most recent promotion received (in
                    YYMM, Year/Month format)

- **NUMPROM**   -->   Lifetime number of promotions received to date

- **CARDPM12**  -->   Number of card promotions received in the last
                    12 months (in terms of calendar months translates
                    into 1603-1702)
                    
- **NUMPRM12**  -->   Number of promotions received in the last 12
                    months (in terms of calendar months translates
                    into 1603-1702)

In [None]:
# Promotion-targeting columns described in the markdown above.
columns_overall_promotion_targeting = ['CARDPROM', 'MAXADATE', 'NUMPROM', 'CARDPM12', 'NUMPRM12']

# Names of the derived return-rate columns; each following cell appends its own.
columns_return_rates = list()

In [None]:
# Preview the promotion-targeting columns.
df[columns_overall_promotion_targeting].head()

In [None]:
# Conversion rate: lifetime number of gifts per promotion received.
new_column = 'CONVERSION_RATE'
df[new_column] = df['NGIFTALL'].div(df['NUMPROM'])
columns_return_rates.append(new_column)

df[new_column].head()

In [None]:
# Average amount per gift: lifetime dollar amount divided by lifetime number of gifts.
# This cell used to repeat NGIFTALL / NUMPROM, which made the column an exact copy
# of CONVERSION_RATE.
# NOTE(review): the formula is inferred from the column name — confirm the intended
# definition.
new_column = 'GIFT_AVG_AMNT'
columns_return_rates.append(new_column)

df[new_column] = df['RAMNTALL'] / df['NGIFTALL']
df[new_column].head()

In [None]:
# Average dollars given per promotion received: lifetime amount over lifetime promotions.
new_column = 'GIFT_AVG_AMNT_PER_PROMOTION'
df[new_column] = df['RAMNTALL'].div(df['NUMPROM'])
columns_return_rates.append(new_column)

df[new_column].head()

## Encode categorical columns

In [None]:
# Show which columns are about to be one-hot encoded.
columns_donator_classification

In [None]:
# One-hot encode the RFA classification columns and preview the result.
dummies = pd.get_dummies(df[columns_donator_classification])
dummies.head()

In [None]:
# Swap the raw RFA columns for their one-hot encoded counterparts.
df_without_rfa = df.drop(columns=columns_donator_classification)
df = pd.concat([df_without_rfa, dummies], axis=1)

In [None]:
# From here on the classification list holds the names of the dummy columns.
columns_donator_classification = dummies.columns.tolist()
columns_donator_classification

## Small Recap

In [None]:
# Gather every engineered / re-encoded column name into one flat list, group by group.
modified_columns = [
    *columns_donator_classification,
    *columns_donation_history,
    *columns_nr_donations_perYear,
    *columns_total_donation_amount_perYear,
    *columns_return_rates,
]

df[modified_columns].head()

In [None]:
# Pairwise correlations between the engineered columns, drawn as an annotated heatmap.
corr = df[modified_columns].corr()

fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corr, annot=True, linewidths=0.25, ax=ax)
plt.show()

In [None]:
# NOTE: this cell is entirely commented out and has no effect. If enabled, it would
# remove the listed columns from modified_columns.
# columns_to_drop = [
#     'RAMNTALL',
#     'AVGGIFT',
#     'GIFT_AVG_AMNT',
#     'NGIFTALL',
#     'AVG_GIFT_VAL',
#     'TIMEDELTA_FIRST_LAST_DONATION',
#     'CARDGIFT',
#     'TOT_DON_15',
#     'TOT_DON_14'
# ]

# modified_columns = [col for col in modified_columns if col not in columns_to_drop]

In [None]:
# NOTE: this cell is entirely commented out and has no effect. If enabled, it would
# redraw the correlation heatmap for the current modified_columns.
# corr = df[modified_columns].corr()

# plt.figure(figsize=(20,20))
# sns.heatmap(corr, annot=True, linewidths=0.25)
# plt.show()

## Normalizing Data

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scale every engineered column in place; `scaler` keeps the fitted
# per-column ranges.
scaler = MinMaxScaler()
df[modified_columns] = scaler.fit_transform(df[modified_columns])

In [None]:
# Preview the engineered columns after scaling.
df[modified_columns].head()

## Trying clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Candidate cluster counts (1 through 14) and the list that collects one inertia per k.
k_range = range(1, 15)
inertia = list()

In [None]:
# Fit K-Means once per candidate k and record the resulting inertia.
# - The list is rebuilt from scratch so that re-running this cell does not append a
#   second batch of values (which would no longer line up with k_range in the plot).
# - random_state makes the curve reproducible between runs; n_init is set explicitly
#   because its default differs between scikit-learn versions.
inertia = []
for k in k_range:
    k_means = KMeans(n_clusters=k, n_init=10, random_state=42)
    k_means.fit(df[modified_columns])
    inertia.append(k_means.inertia_)

In [None]:
# Elbow plot: inertia of the fitted model against the number of clusters k.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(k_range, inertia, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

    --> chosen: 7 clusters (matches `n_clusters=7` in the next cell)

In [None]:
new_column = 'cluster'

# Fit on the feature columns only: if this cell is re-run, the label column from the
# previous run must not be fed back in as a feature.
feature_columns = [col for col in modified_columns if col != new_column]

# n_clusters is the k picked from the elbow plot — keep it in sync with the note
# above this cell. random_state makes the cluster labels reproducible; n_init is set
# explicitly because its default differs between scikit-learn versions.
k_means = KMeans(n_clusters=7, n_init=10, random_state=42)

# fit_predict is equivalent to fit(X) followed by predict(X) on the same data.
df[new_column] = k_means.fit_predict(df[feature_columns])

# Register the label column once, so re-running the cell does not duplicate it.
if new_column not in modified_columns:
    modified_columns.append(new_column)

In [None]:
# Preview the column named by new_column (set to 'cluster' in the previous cell).
df[new_column].head()

In [None]:
# Per-cluster maximum of every engineered column (new_column must still be 'cluster').
df[modified_columns].groupby(new_column).max()

## Cluster Analysis

In [None]:
# Total donated amount in 2014 against 2015, coloured by cluster (values are min-max scaled).
sns.scatterplot(data=df, x='TOT_DON_AMNT_14', y='TOT_DON_AMNT_15', hue='cluster')

In [None]:
# Smallest against largest gift amount, coloured by cluster (values are min-max scaled).
sns.scatterplot(data=df, x='MINRAMNT', y='MAXRAMNT', hue='cluster')

In [None]:
# Largest gift amount against conversion rate, coloured by cluster (values are min-max scaled).
sns.scatterplot(data=df, x='MAXRAMNT', y='CONVERSION_RATE', hue='cluster')

In [None]:
# Frequency of each distinct first character of the RFA_3 code.
df['RFA_3'].str[0].value_counts()