In [124]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import pairwise_distances
from  ml_metrics import mapk

In [148]:
donations = pd.read_csv('./donations.csv')
projects = pd.read_csv('./projects.csv')
schools = pd.read_csv('./schools.csv')
donors = pd.read_csv('./donors.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
#donation_id_count_length= len(donation_id_count)
#print('Percentage of donations that are repeated', donation_id_count[donation_id_count['donor_counts']<2].sum()/donation_id_count_length)
#donation_id_count[donation_id_count['donor_counts']<2].sum()
#donation_id_count[donation_id_count['donor_counts']>2].sum()

In [None]:
donations = donations.merge(projects, how='left', on='Project ID')
donations = donations.merge(donation_id_count, how='left',on='Donor ID')

### What percentage of the people who donated more than once, donated to a project tied to the same school?
get donations data only for donors who donated more than once

In [None]:
print('len donations', len(donations))
donations_repeat_donors = donations[donations['donor_counts']>1]
print('len donor counts', len(donations_repeat_donors))
donations_repeat_not_null = donations_repeat_donors.dropna()

In [None]:
donation_id_count = donation_id_count[donation_id_count['donor_counts']<=3]

In [None]:
fig, ax = plt.subplots(1,2, figsize=(30,30))
donations_by_category = sns.countplot(data=projects, x='Project Resource Category', ax=ax[0])
project_category_count = sns.countplot(data=projects, x='Project Grade Level Category', ax=ax[1])
for a in ax:
    for tick in a.get_xticklabels():
        tick.set_rotation(45)
plt.show()
#So books, supplies and technology have the highest amount of donations

In [None]:
plt.figure(figsize=(20,15))
supplies_sub_category = sns.countplot(data=projects[projects['Project Resource Category']=='Supplies'],x='Project Subject Category Tree')
plt.xticks(rotation=45)
plt.show()

In [None]:
fig, ax = plt.subplots(3,2,figsize=(30,20))
donation_amount = sns.boxplot(x=donations['Donation Amount'], ax=ax[0])
project_cost = sns.boxplot(x=projects['Project Cost'], ax=ax[1])


In [None]:
x = np.sort(donation_id_count['donor_counts'])
y = np.arange(0, len(donation_id_count))/len(donation_id_count)
cum_dist_donor_counts = plt.plot(x,y,marker='.', linestyle='none')

## Cleaning

In [126]:
project_columns = ['Project ID', 'School ID', 'Project Resource Category', 'Project Grade Level Category']
school_columns = ['School ID', 'School Metro Type', 'School Percentage Free Lunch', 'School State', 'School District']
unprocessed_donations = donations[['Donation ID','Donor ID','Project ID', 'Donation Received Date']].merge(projects[project_columns], how='left', on='Project ID').merge(schools[school_columns], how='left', on='School ID')
unprocessed_donations['Donation Received Date'] = pd.to_datetime(unprocessed_donations['Donation Received Date'])

In [127]:
columns = project_columns + school_columns
for column in columns:
    print('Percentage of null values for ', column, 'is: ',(unprocessed_donations[column].isnull().sum()/ len(unprocessed_donations[column])*100) , '%')

Percentage of null values for  Project ID is:  0.0 %
Percentage of null values for  School ID is:  1.5737154073892992 %
Percentage of null values for  Project Resource Category is:  1.5765738380639296 %
Percentage of null values for  Project Grade Level Category is:  1.5737154073892992 %
Percentage of null values for  School ID is:  1.5737154073892992 %
Percentage of null values for  School Metro Type is:  1.57437668612746 %
Percentage of null values for  School Percentage Free Lunch is:  2.042668690606237 %
Percentage of null values for  School State is:  1.57437668612746 %
Percentage of null values for  School District is:  1.57437668612746 %


In [128]:
unprocessed_donations = unprocessed_donations.dropna()
unprocessed_donations = unprocessed_donations.reset_index(drop=True)

In [130]:
donation_id_count = donations['Donor ID'].value_counts()
donation_id_count = donation_id_count.to_frame(name='donor_counts')
donation_id_count = donation_id_count.reset_index().rename(columns={'index':'Donor ID'})
unprocessed_donations = unprocessed_donations.merge(donation_id_count, how='left',on='Donor ID')

In [131]:
# Sorting by Donor ID so that the subset of the data that we keep contains all possible donation id recommendations
# This way we can properly evaluate our recommendations 
unprocessed_donations = unprocessed_donations[unprocessed_donations['donor_counts']>1]
unprocessed_donations = unprocessed_donations.sort_values(by='Donor ID')
unprocessed_donations = unprocessed_donations.iloc[:2300,:]
unprocessed_donations = unprocessed_donations.reset_index(drop=True)

In [132]:
enc = OneHotEncoder()
encoded = enc.fit_transform(unprocessed_donations[['Project Resource Category', 'Project Grade Level Category', 'School State']]).toarray()
feature_labels = enc.get_feature_names()
processed_donations = pd.DataFrame(encoded, columns=feature_labels)

In [133]:
processed_donations = processed_donations.apply(lambda x: x.astype('bool'))

## Recommendation

In [134]:
similarity_matrix = pairwise_distances(processed_donations.to_numpy(), processed_donations.to_numpy(),metric='jaccard')
mapping = pd.Series(processed_donations.index, index=unprocessed_donations['Donation ID'])

In [139]:
def get_donation_id(donor_id,transaction_number='first'):
    donations = unprocessed_donations[unprocessed_donations['Donor ID']==donor_id].sort_values(by='Donation Received Date')['Donation ID'].to_numpy()
    if transaction_number=='first':
        return donations[0]
    elif transaction_number=='last':
        return donations[len(donations)-1]    
    
def get_recommendation(donation_id):
    index = mapping[donation_id]
    similarity_scores = list(enumerate(similarity_matrix[index]))
    similarity_scores = sorted(similarity_scores, key=lambda score:score[1], reverse=True)
    donation_indeces = [score[0] for score in similarity_scores[:5]]
    return unprocessed_donations['Project ID'].iloc[donation_indeces].tolist()                        

## Evaluation

In [140]:
unique_donor_ids = unprocessed_donations['Donor ID'].unique()
actual_donations = []
predicted_donations = []
for i in range(len(unique_donor_ids)):
    donor_id = unique_donor_ids[i]
    donation_id = get_donation_id(donor_id, transaction_number='first')
    predicted_donations.append(get_recommendation(donation_id))
    actual_donations.append(unprocessed_donations[unprocessed_donations['Donor ID']==donor_id]['Project ID'].tolist())

In [147]:
mapk(actual_donations, predicted_donations, 5)

0.001