# Schema

https://docs.google.com/document/d/1wDCb29oYVJIF2RPNtrnzxEYs1lH0J4RwYACDIn4kPhA/edit#heading=h.4wl6nl945n7g

In [43]:
import pandas as pd
import validators
import numpy as np
import uuid
!ls

helpyourngo.json lab1.ipynb       toSchema.ipynb
helpyourngo.sql  random.txt


In [37]:
NgoBackground = pd.DataFrame(columns=[
    'ngoId', 
    'hasRegistration', 
    'hasFinance',
    'logo', 
    'yearEstablished', 
    'missionStatement', 
    'vision', 
    'objectives', 
    'legalStatus', 
    'areasOfOperation', 
    'activeStatus'
])
NgoRegistration = pd.DataFrame(columns=[
    'ngoId',
    'isRegistrationFor',
    'ngoType',
    'regNo',
    'regDate',
    'registrar',
    '12ANo',
    '12ARegdate',
    '12AUpload',
    '80GNo',
    '80GRegdate',
    '80GUpload',
    '35ACNo',
    '35ACRegdate',
    '35ACUpload',
    'fCRANo',
    'fCRARegdate',
    'natureListed',
    'fCRAStatus',
    'fCRAExpiration',
    'fCRAUpload',
    'pan',
    'tan',
    'gst',
    'notes'
])

NgoContact = pd.DataFrame(columns=[
    'ngoId',
    'ngoName',
    'orgEmail',
    'officePhone',
    'primaryPoc',
    'primaryPocPhone',
    'secondaryPoc',
    'secondaryPocPhone',
    'mailingAddress',
    'physicalAddress',
    'fieldOffices',
    'orgType',
    'orgWebsite',
    'websiteIsValid',
    'facebook',
    'twitter',
    'iInstagram',
    'youtube',
    'whatsapp',
    'otherSocials',
    'scrapeSource',
    'executiveDirector',
    'technicalSupport',
    'chairmanName',
    'chairmanMobile',
    'chairmanEmail',
    'viceChairmanName',
    'viceChairmanMobile',
    'viceChairmanEmail',
    'secretaryName',
    'secretaryMobile',
    'secretaryEmail',
    'assistantSecretaryName',
    'assistantSecretaryMobile',
    'assistantSecretaryEmail'
])

NgoFinance = pd.DataFrame(columns=[
    'ngoId',
    'fiscalYear',
    'totalIncome',
    'totalGrant',
    'totalExpense',
    'totalAssets',
    'totalLiabilities',
    'totalFundingGap',
    'sourceOfFunds',
    'majorFunders',
    'ngoBudgetAllocation',
    'ngoBudgetUtilization',
    'auditReport',
    'avgMonthlyExpenditure',
    'avgMonthlyOverhead',
    'longTermLoan',
    'fulltimeEmployees',
    'parttimeEmployees',
    'employmentExpenses',
    'fulltimeVolunteers',
    'parttimeVolunteers',
    'volunteerExpenses',
    'consultants',
    'consultantExpenses',
    'occupancyExpenses',
    'fundraisingExpenses'
])

In [23]:
df = pd.read_json("helpyourngo.json")
df["name"] = df["name"].str.title()
df = df.dropna(subset=['name'])
df["ngoId"] = [uuid.uuid3(uuid.NAMESPACE_URL, _) for _ in df["name"]]
df.head()

Unnamed: 0,name,last_updated,address,mobile,email,website,annual_expenditure,description,ngoId
0,Aai Caretaker,2020.0,"Room No. B/4, Ashok Nagar, Near Kris...",+91 22 25530537,info@aaicaretaker.org.in,www.aaicaretaker.in,138990084.0,Aai Caretaker is involved in diverse...,608f2a51-35a5-3002-8229-c078531a7cdd
1,Aakriti,2015.0,"J-159, Sector-10 DLF, Faridabad 1210...",+91 9312263021,aakritischool@yahoo.in,www.aakritingo.org,1023204.0,"A parent-initiative, Association for...",c894bc4b-4c34-38ca-a874-fa9f9bc69677
2,Aakash Maindwal Foundation,2016.0,"107, First Floor, Block - Milano, Ma...",+91 120 4377527,aakashmaindwalfoundation@gmail.com,www.amfindia.org,767980.0,Aakash Maindwal Foundation has been ...,b023c29c-3ae0-3128-881c-b0245892e0aa
3,Aaradhana Sanstha,2013.0,"14, Sulabhpuram, Sikandara Bodla Roa...",+91 9639161612,drhchaudhary@yahoo.com,,,Aaradhana Sanstha was formed for edu...,7c8e8ac2-e381-3a90-bf13-c3786b890a05
4,Action Against Hunger (Fight Hunger ...,2019.0,"201, Sai Prasad Building, Sion Kamga...",+91 022 2611 1275,contact@fighthungerfoundation.org,www.actionagainsthunger.in,86348954.0,Action Against Hunger (AAH) register...,3e5c857a-4e60-36d9-b3a3-d14c02bf373b


In [24]:
df.columns

Index(['name', 'last_updated', 'address', 'mobile', 'email', 'website',
       'annual_expenditure', 'description', 'ngoId'],
      dtype='object')

In [25]:
# Set display option to show full text
pd.set_option('display.max_colwidth', 40)
print(df.description[:1])

0    Aai Caretaker is involved in diverse...
Name: description, dtype: object


In [67]:
# Mapping for NgoContact
NgoContact["ngoId"] = df['ngoId']
NgoContact["ngoName"] = df["name"].str.title()
NgoContact["orgEmail"] = df["email"]
NgoContact["officePhone"] = df["mobile"]
NgoContact["physicalAddress"] = df["address"]
NgoContact["mailingAddress"] = df["address"]
NgoContact["orgWebsite"] = df["website"]

def validate_url(url):
    if pd.isna(url):  # handle NaN values
        return False
    if validators.url(url):
        return True
    else:
        return False

NgoContact['websiteIsValid'] = NgoContact['orgWebsite'].apply(validate_url)
NgoContact['scrapeSource'] = 'helpyourngo'

# Mapping for NgoFinance
NgoFinance["ngoId"] = df['ngoId']
NgoFinance["fiscalYear"] = df['last_updated'].astype(float).astype('Int64')
NgoFinance['totalExpense'] = df['annual_expenditure'].str.replace(r'[,]', '', regex=True).fillna(0).astype(int)
NgoFinance['avgMonthlyExpenditure'] = NgoFinance['totalExpense']/12

# Mapping for NgoBackground
NgoBackground["ngoId"] = df['ngoId']
NgoBackground["objectives"] = df["description"]

In [70]:
# Drop rows where all columns are empty except Id
NgoContact = NgoContact.dropna(how='all', subset=NgoContact.columns.difference(['ngoId']))
NgoBackground = NgoBackground.dropna(how='all', subset=NgoBackground.columns.difference(['ngoId']))
NgoFinance = NgoFinance.dropna(how='all', subset=NgoFinance.columns.difference(['ngoId']))

In [71]:
# Export to CSV
NgoBackground.to_csv('NgoBackground.csv', index=False)
NgoFinance.to_csv('NgoFinance.csv', index=False)
NgoContact.to_csv('NgoContact.csv', index=False)