In [10]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import linear_kernel

In [11]:
donations = pd.read_csv('~/Desktop/recommender_system_project/donations.csv')
projects = pd.read_csv('~/Desktop/recommender_system_project/projects.csv')
schools = pd.read_csv('~/Desktop/recommender_system_project/schools.csv')

In [13]:
donation_id_count = donations['Donor ID'].value_counts()
donation_id_count = donation_id_count.to_frame(name='donor_counts')
donation_id_count = donation_id_count.reset_index().rename(columns={'index':'Donor ID'})
donation_id_count.head()

Unnamed: 0,Donor ID,donor_counts
0,39df9399f5384334a42905bcf0acdcbf,18035
1,237db43817f34988f9d543ca518be4ee,14565
2,a0e1d358aa17745ff3d3f4e4909356f3,10515
3,6f74ffb17cbb2b616b1eef06bd4acd0c,9029
4,a1929a1172ad0b3d14bc84f54018c563,6427


In [None]:
#donation_id_count_length= len(donation_id_count)
#print('Percentage of donations that are repeated', donation_id_count[donation_id_count['donor_counts']<2].sum()/donation_id_count_length)
#donation_id_count[donation_id_count['donor_counts']<2].sum()
#donation_id_count[donation_id_count['donor_counts']>2].sum()

In [None]:
donations = donations.merge(projects, how='left', on='Project ID')
donations = donations.merge(donation_id_count, how='left',on='Donor ID')

### What percentage of the people who donated more than once, donated to a project tied to the same school?
get donations data only for donors who donated more than once

In [None]:
print('len donations', len(donations))
donations_repeat_donors = donations[donations['donor_counts']>1]
print('len donor counts', len(donations_repeat_donors))
donations_repeat_not_null = donations_repeat_donors.dropna()

In [None]:
donation_id_count = donation_id_count[donation_id_count['donor_counts']<=3]

In [None]:
fig, ax = plt.subplots(1,2, figsize=(30,30))
donations_by_category = sns.countplot(data=projects, x='Project Resource Category', ax=ax[0])
project_category_count = sns.countplot(data=projects, x='Project Grade Level Category', ax=ax[1])
for a in ax:
    for tick in a.get_xticklabels():
        tick.set_rotation(45)
plt.show()
#So books, supplies and technology have the highest amount of donations

In [None]:
plt.figure(figsize=(20,15))
supplies_sub_category = sns.countplot(data=projects[projects['Project Resource Category']=='Supplies'],x='Project Subject Category Tree')
plt.xticks(rotation=45)
plt.show()

In [None]:
fig, ax = plt.subplots(3,2,figsize=(30,20))
donation_amount = sns.boxplot(x=donations['Donation Amount'], ax=ax[0])
project_cost = sns.boxplot(x=projects['Project Cost'], ax=ax[1])


In [None]:
x = np.sort(donation_id_count['donor_counts'])
y = np.arange(0, len(donation_id_count))/len(donation_id_count)
cum_dist_donor_counts = plt.plot(x,y,marker='.', linestyle='none')

## Cleaning

In [12]:
project_columns = ['Project ID', 'School ID', 'Project Resource Category', 'Project Grade Level Category']
school_columns = ['School ID', 'School Metro Type', 'School Percentage Free Lunch', 'School State', 'School District']
unprocessed_donations = donations[['Donation ID', 'Project ID']].merge(projects[project_columns], how='left', on='Project ID').merge(schools[school_columns], how='left', on='School ID')

In [13]:
unprocessed_donations.head()

Unnamed: 0,Donation ID,Project ID,School ID,Project Resource Category,Project Grade Level Category,School Metro Type,School Percentage Free Lunch,School State,School District
0,688729120858666221208529ee3fc18e,000009891526c0ade7180f8423792063,5aa86a53f658c198fd4e42c541411c76,Other,Grades 6-8,suburban,23.0,Utah,Jordan School District
1,dcf1071da3aa3561f91ac689d1f73dee,000009891526c0ade7180f8423792063,5aa86a53f658c198fd4e42c541411c76,Other,Grades 6-8,suburban,23.0,Utah,Jordan School District
2,18a234b9d1e538c431761d521ea7799d,000009891526c0ade7180f8423792063,5aa86a53f658c198fd4e42c541411c76,Other,Grades 6-8,suburban,23.0,Utah,Jordan School District
3,38d2744bf9138b0b57ed581c76c0e2da,000009891526c0ade7180f8423792063,5aa86a53f658c198fd4e42c541411c76,Other,Grades 6-8,suburban,23.0,Utah,Jordan School District
4,5a032791e31167a70206bfb86fb60035,000009891526c0ade7180f8423792063,5aa86a53f658c198fd4e42c541411c76,Other,Grades 6-8,suburban,23.0,Utah,Jordan School District


In [14]:
columns = project_columns + school_columns
for column in columns:
    print('Percentage of null values', (unprocessed_donations[column].isnull().sum()/ len(unprocessed_donations[column])*100) , '%')

Percentage of null values 0.0 %
Percentage of null values 1.5737154073892992 %
Percentage of null values 1.5765738380639296 %
Percentage of null values 1.5737154073892992 %
Percentage of null values 1.5737154073892992 %
Percentage of null values 1.57437668612746 %
Percentage of null values 2.042668690606237 %
Percentage of null values 1.57437668612746 %
Percentage of null values 1.57437668612746 %


In [15]:
unprocessed_donations = unprocessed_donations.dropna()
unprocessed_donations = unprocessed_donations.reset_index()

In [16]:
enc = OneHotEncoder()
encoded = enc.fit_transform(processed_donations[['Project Resource Category', 'Project Grade Level Category', 'School Metro Type','School State', 'School District']]).toarray()
feature_labels = enc.get_feature_names()
enc_matrix = pd.DataFrame(encoded, columns=feature_labels)

In [17]:
enc_matrix.info()
processed_donations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4591995 entries, 0 to 4591994
Columns: 10453 entries, x0_Art Supplies to x4_[State Run]
dtypes: float64(10453)
memory usage: 357.6 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4591995 entries, 0 to 4591994
Data columns (total 10 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   index                         int64  
 1   Donation ID                   object 
 2   Project ID                    object 
 3   School ID                     object 
 4   Project Resource Category     object 
 5   Project Grade Level Category  object 
 6   School Metro Type             object 
 7   School Percentage Free Lunch  float64
 8   School State                  object 
 9   School District               object 
dtypes: float64(1), int64(1), object(8)
memory usage: 350.3+ MB


In [None]:
processed_donations = unprocessed_donations[['School Percentage Free Lunch']].merge(enc_matrix, how='left', left_index=True, right_index=True)

In [None]:
processed_df = projects_schools[['School Percentage Free Lunch']].merge(enc_matrix, left_index=True, right_index=True)
processed_df.head()

## Recommendation

In [None]:
similarity_matrix = linear_kernel(processed_donations, processed_donations)
mapping = pd.Series(processed_donations.index, index=processed_donations['Donation ID'])

In [None]:
def get_recommendaiton(donation_id):
    index = mapping[donation_id]
    similarity_scores = list(enumerate(similarity_matrix[index]))
    similarity_scores = sorted(similarity_scores, key=lambda score:score[1], reverse=True)
    donation_indeces = [score[0] for score in similarity_scores[:5]]
    return unprocessed_donations['Project ID'].iloc[donation_indeces]                
                             