## Data Cleaning and Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Clean the web-scraped data.

In [2]:
# Load the scraped organization records from the project's data folder.
df = pd.read_csv(
    '../StartingWithToday/data/non-profit-orgs.csv'
)

In [None]:
def _name_text(cell):
    """Return the text between the first '>' and the following '<' of the stringified cell."""
    after_open_tag = str(cell).split('>')[1]
    return after_open_tag.split('<')[0]


# Replace the scraped markup in `name` with just the text it wraps.
df['name'] = [_name_text(cell) for cell in df['name']]

In [None]:
def _org_type_text(cell):
    """Return the text between the first '">' and the following '<' of the stringified cell."""
    after_attributes = str(cell).split('">')[1]
    return after_attributes.split('<')[0]


# Replace the scraped markup in `org_type` with just the text it wraps.
df['org_type'] = [_org_type_text(cell) for cell in df['org_type']]

In [None]:
# Ordered (old, new) substitutions; they are applied top to bottom, so the order matters.
_SERVICE_SUBSTITUTIONS = (
    ('<li class="sc-59ntl3-0 gNRvwb">', ''),
    ('</li>', ''),
    ('&amp;', 'and'),
    (',', ' & '),
    ('[', ''),
    (']', ''),
    ('<em>', ''),
    ('</em>', ''),
)


def _strip_service_markup(raw):
    """Stringify the cell and apply every substitution in sequence."""
    text = str(raw)
    for old, new in _SERVICE_SUBSTITUTIONS:
        text = text.replace(old, new)
    return text


# Remove list markup and turn the comma-separated services into an ' & '-separated string.
df['services'] = [_strip_service_markup(svcs) for svcs in df['services']]

In [None]:
# For every entry, take item 6 of the entry, stringify it, and keep the text between
# its first '>' and the following '<'.
# NOTE(review): the index 6 presumes each entry is an indexable sequence whose seventh
# item is the link markup. If the column holds plain strings (e.g. freshly read from
# CSV), web[6] is a single character and the split('>')[1] lookup will usually raise
# IndexError — TODO confirm the expected input shape.
df['website'] = [str(web[6]).split('>')[1].split('<')[0] for web in df['website'].values]

In [None]:
# For every entry, stringify its first item, delete each '<p>' occurrence, then keep
# the text that follows the fourth '>' up to the next '<'.
# NOTE(review): about[0] and the hard-coded split index 4 presume a specific markup
# layout for each entry (an indexable sequence whose first item is the description
# markup) — TODO confirm; for plain strings about[0] is only the first character.
df['about'] = [''.join(str(about[0]).split('<p>')).split('>')[4].split('<')[0] for about in df['about'].values]

Replace the placeholder strings `No value` and `No coordinates found` with missing values (`NaN`).

In [3]:
# Turn the scraper's placeholder strings into real missing values so that
# isnull()/dropna() and the later float conversion treat them as missing.
# replace() with a list matches whole cell values only (no substring or regex
# matching), and handles every column in one vectorised call instead of a
# Python-level loop over each cell.
df = df.replace(['No value', 'No coordinates found'], np.nan)

Count the missing values in each column.

In [4]:
# Per-column tally of missing entries.
df.isna().sum()

org_url       0
name          0
location      2
website       0
about       290
services     62
org_type      0
lat          14
lng          14
zip           2
dtype: int64

Keep only the organizations whose service areas (the `services` column) are known.

In [5]:
# Keep only rows that have a services value. The explicit .copy() makes the result an
# independent frame rather than a slice of the original, so the column assignments
# made in later cells do not raise SettingWithCopyWarning.
df = df[df['services'].notna()].copy()

In [6]:
# (rows, columns) remaining after dropping organizations without services.
df.shape

(938, 10)

Check the data type of each column.

In [7]:
# Show each column's dtype to spot columns that still need conversion.
df.dtypes

org_url     object
name        object
location    object
website     object
about       object
services    object
org_type    object
lat         object
lng         object
zip         object
dtype: object

Convert the latitude and longitude object type columns to float type columns.

In [8]:
# Both coordinate columns get the same object -> float conversion.
for coord in ('lat', 'lng'):
    df[coord] = df[coord].astype(float)

Keep only non-profit organizations in the data set.

In [9]:
# Number of organizations of each type, used to decide which types to keep.
df['org_type'].value_counts()

Nonprofit     866
Consultant     40
Government     17
Recruiter      15
Name: org_type, dtype: int64

In [10]:
# Restrict the data set to nonprofits. The explicit .copy() makes the result an
# independent frame rather than a slice of the original, so the indicator columns
# added in later cells do not raise SettingWithCopyWarning.
df = df[df['org_type'] == 'Nonprofit'].copy()

Create a count of the number of organizations by service area. First, split each organization's combined `services` string into its individual service areas and collect the sorted list of unique ones.


In [11]:
# Break every distinct services string into its individual ' & '-separated parts.
services = [combo.split(' & ') for combo in df['services'].value_counts().index]

# Flatten the parts, trim stray whitespace, de-duplicate, and sort alphabetically.
unique_services = sorted({label.strip() for parts in services for label in parts})
unique_services[:5]

['Agriculture',
 'Animals',
 'Arts and Music',
 'Children and Youth',
 'Civic Engagement']

Create an indicator column for each type of service, flagging whether the organization offers it.

In [12]:
# Tokenise each organization's services string once into a set of trimmed names,
# using the same ' & ' separator and strip() that produced unique_services.
offered = [
    {part.strip() for part in svcs.split(' & ')}
    for svcs in df['services']
]

# One 0/1 indicator column per service type. Exact set membership replaces
# str.contains(), which interprets the name as a regular expression and matches
# substrings, so a short service name would also flag every longer name containing it.
for service in unique_services:
    df[service] = [int(service in names) for names in offered]

Clean the service types column.

In [23]:
def _sorted_service_string(raw):
    """Collapse double spaces, then re-join the ' & '-separated items in sorted order."""
    items = raw.replace('  ', ' ').split(' & ')
    items.sort()
    return ' & '.join(items)


# Give every row a canonical, alphabetically ordered services string.
df['services'] = [_sorted_service_string(svcs) for svcs in df['services']]

Export final version of data set for map-plotting.

In [24]:
# Write the cleaned frame next to the raw file; the index is not exported.
df.to_csv(
    '../StartingWithToday/data/non-profit-orgs-cleaned.csv',
    index=False,
)