# Schema

https://docs.google.com/document/d/1wDCb29oYVJIF2RPNtrnzxEYs1lH0J4RwYACDIn4kPhA/edit#heading=h.4wl6nl945n7g

## Imports and Load Data

In [44]:
!pip install validators

Defaulting to user installation because normal site-packages is not writeable


In [45]:
import pandas as pd
import validators
import uuid

In [46]:
df = pd.read_excel("42621 Final_Data_ngodarpan.gov.in.xlsx")
df.head()

Unnamed: 0,Name,ngo url,Mobile,UniqueID,Off phone1,Email,Major Activities1,operational states db,issues working db,operational district db,...,Asisstant Secretary mobile,Board Member name,Board Member email,Board Member mobile,Vice Chairman name,Vice Chairman email,Vice Chairman mobile,Member name,Member email,Member mobile
0,PRAYAS,,9778080000.0,OR/2009/0010000,06858-223440,director_prayas@yahoo.com,1.63 Nos. of SHGs formed,"ORISSA,","Agriculture,Children,Civic Issues,Disaster Man...","ORISSA->Nabarangapur ,",...,,,,,,,,,,
1,PONDICHERRYWOMENSCONFERENCE,,9443253000.0,PY/2016/0100001,0413-2213238,surebe33@gmail.com,Working for Women and Children Obtaining Loan ...,"PUDUCHERRY,","Women's Development & Empowerment,Children,","PUDUCHERRY->Puducherry,",...,,,,,,,,,,
2,SHABRI SAMAJ SEWA SAMITI,http://ssssamitibhind.org,7828394000.0,MP/2016/0100003,0751-1234689,ssssamitibhind@gmail.com,more than one thousand leadership development ...,"MADHYA PRADESH,","Animal Husbandry, Dairying & Fisheries,Agricul...","MADHYA PRADESH->Anuppur, Ashoknagar, Balaghat,...",...,,,,,,,,ALOK,ssssamitibhind@gmail.com,7828498000.0
3,ANAND GANGA SAMAJIK SIKSHA SAMITI,,9450678000.0,UP/2016/0100004,05566-281059,lovelyraivijendra@gmail.com,OUR ORGANISATION HAVE PLANTED MORE THAN 2 LAKH...,"UTTAR PRADESH,","Agriculture,Environment & Forests,Health & Fam...","UTTAR PRADESH->Deoria, Gorakhpur, Sant Kabir N...",...,,,,,,,,,,
4,Himaliyan Gram Vikas Samiti,,9412037000.0,UA/2016/0100009,05964-213271,hgvs1990@gmail.com,Facilitated formation and strengthening of 65C...,"UTTARAKHAND,","Animal Husbandry, Dairying & Fisheries,Agricul...","UTTARAKHAND->Almora , Bageshwar, Champawat, Pi...",...,,Krishna Nand,hgvsgan@yahoo.co.in,7500720000.0,Leela Dhar Joshi,hgvs.jleeladhar.lj@gmail.com,8057816000.0,,,


## Create Empty Dataframes

In [55]:
NgoBackground = pd.DataFrame(columns=[
    'ngoId', 
    'hasRegistration', 
    'hasFinance',
    'logo', 
    'yearEstablished', 
    'missionStatement', 
    'vision', 
    'objectives', 
    'legalStatus', 
    'areasOfOperation', 
    'activeStatus'
])
NgoRegistration = pd.DataFrame(columns=[
    'ngoId',
    'isRegistrationFor',
    'ngoType',
    'regNo',
    'regDate',
    'registrar',
    '12ANo',
    '12ARegdate',
    '12AUpload',
    '80GNo',
    '80GRegdate',
    '80GUpload',
    '35ACNo',
    '35ACRegdate',
    '35ACUpload',
    'fCRANo',
    'fCRARegdate',
    'natureListed',
    'fCRAStatus',
    'fCRAExpiration',
    'fCRAUpload',
    'pan',
    'tan',
    'gst',
    'notes'
])

NgoContact = pd.DataFrame(columns=[
    'ngoId',
    'ngoName',
    'orgEmail',
    'officePhone',
    'primaryPoc',
    'primaryPocPhone',
    'secondaryPoc',
    'secondaryPocPhone',
    'mailingAddress',
    'physicalAddress',
    'fieldOffices',
    'orgType',
    'orgWebsite',
    'websiteIsValid',
    'facebook',
    'twitter',
    'iInstagram',
    'youtube',
    'whatsapp',
    'otherSocials',
    'scrapeSource',
    'executiveDirector',
    'technicalSupport',
    'chairmanName',
    'chairmanMobile',
    'chairmanEmail',
    'viceChairmanName',
    'viceChairmanMobile',
    'viceChairmanEmail',
    'secretaryName',
    'secretaryMobile',
    'secretaryEmail',
    'assistantSecretaryName',
    'assistantSecretaryMobile',
    'assistantSecretaryEmail'
])

## Transform

In [56]:
df.columns

Index(['Name', 'ngo url', 'Mobile', 'UniqueID', 'Off phone1', 'Email',
       'Major Activities1', 'operational states db', 'issues working db',
       'operational district db', 'reg name', 'fcrano', 'nr regNo', 'nr add',
       'nr orgName', 'ngo reg date', 'nr actName', 'nr city',
       'TypeDescription', 'StateName', 'status', 'president name',
       'president email', 'president mobile', 'Chairman name',
       'Chairman email', 'Chairman mobile', 'Secretary name',
       'Secretary email', 'Secretary mobile', 'Asisstant Secretary name',
       'Asisstant Secretary email', 'Asisstant Secretary mobile',
       'Board Member name', 'Board Member email', 'Board Member mobile',
       'Vice Chairman name', 'Vice Chairman email', 'Vice Chairman mobile',
       'Member name', 'Member email', 'Member mobile'],
      dtype='object')

In [57]:
## Generate uuid3
NgoContact["ngoId"] = [uuid.uuid3(uuid.NAMESPACE_URL, _) for _ in df["Name"]]
NgoBackground["ngoId"] = [uuid.uuid3(uuid.NAMESPACE_URL, _) for _ in df["Name"]]
NgoRegistration["ngoId"] = [uuid.uuid3(uuid.NAMESPACE_URL, _) for _ in df["Name"]]

In [58]:
# Use title case
NgoContact["ngoName"] = df["nr orgName"].str.title()

# Website
NgoContact['orgWebsite'] = df['ngo url']
def validate_url(url):
    if pd.isna(url): # handle NaN values
        return False
    if validators.url(url):
        return True
    else:
        return False
NgoContact['websiteIsValid'] = NgoContact['orgWebsite'].apply(validate_url)

# Remove special characters.
NgoBackground["objectives"] = df["Major Activities1"].str.replace('[^A-Za-z0-9 .,!()-_{}\[\]?/;]', '', regex=True)

NgoContact["type"] = df['TypeDescription']
NgoContact['scrapeSource'] = 'NGO Darpan'
# Emails
NgoContact['orgEmail'] = df['Email']
NgoContact['chairmanEmail'] = df['Chairman email']
NgoContact['viceChairmanEmail'] = df['Vice Chairman email']
NgoContact['secretaryEmail'] = df['Secretary email']
NgoContact['assistantSecretaryEmail'] = df['Asisstant Secretary email']

# Names
NgoContact['chairmanName'] = df['Chairman name']
NgoContact['viceChairmanName'] = df['Vice Chairman name']
NgoContact['secretaryName'] = df['Secretary name']
NgoContact['assistantSecretaryName'] = df['Asisstant Secretary name']

# Other information
NgoRegistration['regNo'] = df['nr regNo']
NgoRegistration['registrar'] = df['reg name']
NgoRegistration['fCRANo'] = df['fcrano']
NgoBackground['activeStatus'] = df['status']

NgoContact["physicalAddress"] = df["nr add"]

In [59]:
NgoBackground

Unnamed: 0,ngoId,hasRegistration,hasFinance,logo,yearEstablished,missionStatement,vision,objectives,legalStatus,areasOfOperation,activeStatus
0,375c2cb5-86a7-31ab-8356-e290694368c8,,,,,,,1.63 Nos. of SHGs formed,,,
1,4a832087-dabc-366a-a9b6-a0ddda4ab8f9,,,,,,,Working for Women and Children Obtaining Loan ...,,,
2,d07a31ac-b221-3d97-8b4f-864d5ff1ef6f,,,,,,,more than one thousand leadership development ...,,,
3,6f91f063-e6b9-3a5e-a6f6-a6a78773fab0,,,,,,,OUR ORGANISATION HAVE PLANTED MORE THAN 2 LAKH...,,,
4,882950e6-7ccc-3c79-a6dc-129aabf11c50,,,,,,,Facilitated formation and strengthening of 65C...,,,
...,...,...,...,...,...,...,...,...,...,...,...
111924,0745b1ca-0b4b-33a5-a267-92e4657edb50,,,,,,,We are working for socio economic development ...,,,
111925,b79235e9-20d9-3c76-844a-d19f9a13d373,,,,,,,(IRCA),,,
111926,e8ca818c-7d57-30d3-af7a-8bb65312bd1e,,,,,,,We are working for socio economic development ...,,,
111927,20e812e0-344b-3c81-b033-e271d94572ec,,,,,,,"Our Trust Is Working In Prohibition, Agricultu...",,,


In [60]:
NgoRegistration

Unnamed: 0,ngoId,isRegistrationFor,ngoType,regNo,regDate,registrar,12ANo,12ARegdate,12AUpload,80GNo,...,fCRANo,fCRARegdate,natureListed,fCRAStatus,fCRAExpiration,fCRAUpload,pan,tan,gst,notes
0,375c2cb5-86a7-31ab-8356-e290694368c8,,,8/2000,,Registrar of Societies,,,,,...,105100015,,,,,,,,,
1,4a832087-dabc-366a-a9b6-a0ddda4ab8f9,,,152 OF 1992,,Registrar of Societies,,,,,...,,,,,,,,,,
2,d07a31ac-b221-3d97-8b4f-864d5ff1ef6f,,,4051,,Registrar of Societies,,,,,...,,,,,,,,,,
3,6f91f063-e6b9-3a5e-a6f6-a6a78773fab0,,,1494/2013-2014,,Registrar of Societies,,,,,...,,,,,,,,,,
4,882950e6-7ccc-3c79-a6dc-129aabf11c50,,,62/1992,,Registrar of Societies,,,,,...,347990011,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111924,0745b1ca-0b4b-33a5-a267-92e4657edb50,,,315 1989-90,,Registrar of Societies,,,,,...,,,,,,,,,,
111925,b79235e9-20d9-3c76-844a-d19f9a13d373,,,01/07/01/18571/07,,Registrar of Societies,,,,,...,,,,,,,,,,
111926,e8ca818c-7d57-30d3-af7a-8bb65312bd1e,,,1723,,Registrar of Societies,,,,,...,,,,,,,,,,
111927,20e812e0-344b-3c81-b033-e271d94572ec,,,E/1738/Valsad,,Charity Commissioner,,,,,...,,,,,,,,,,


In [61]:
NgoContact

Unnamed: 0,ngoId,ngoName,orgEmail,officePhone,primaryPoc,primaryPocPhone,secondaryPoc,secondaryPocPhone,mailingAddress,physicalAddress,...,viceChairmanName,viceChairmanMobile,viceChairmanEmail,secretaryName,secretaryMobile,secretaryEmail,assistantSecretaryName,assistantSecretaryMobile,assistantSecretaryEmail,type
0,375c2cb5-86a7-31ab-8356-e290694368c8,Prayas,director_prayas@yahoo.com,,,,,,,"At- Ekamba, Po- Nandahandi, Via- Dangarbheja, ...",...,,,,Santosh Kumar Sadangi,,prayasindia.ngo@gmail.com,,,,Registered Societies (Non-Government)
1,4a832087-dabc-366a-a9b6-a0ddda4ab8f9,Pondicherrywomensconference,surebe33@gmail.com,,,,,,,"C-15 Rose Apartments Venkata Nagar, Pondicherr...",...,,,,Swarnalatha,,surebe33@gmail.com,,,,Registered Societies (Non-Government)
2,d07a31ac-b221-3d97-8b4f-864d5ff1ef6f,Shabri Samaj Sewa Samiti,ssssamitibhind@gmail.com,,,,,,,C/o Rajaram prajapati kalyanpura road gormi bh...,...,,,,PRADEEP DHAKAR,,ssssamitibhind@gmail.com,,,,Registered Societies (Non-Government)
3,6f91f063-e6b9-3a5e-a6f6-a6a78773fab0,Anand Ganga Samajik Siksha Samiti,lovelyraivijendra@gmail.com,,,,,,,VILL - BARGO POST - BARGO BLOCK - HAISAR TEHSI...,...,,,,,,,,,,Registered Societies (Non-Government)
4,882950e6-7ccc-3c79-a6dc-129aabf11c50,Himaliyan Gram Vikas Samiti,hgvs1990@gmail.com,,,,,,,AwalaghatRoad DasaithalGangolihat ( Pithoragar...,...,Leela Dhar Joshi,,hgvs.jleeladhar.lj@gmail.com,KESHAR SINGH,,hgvs.kesher@gmail.com,,,,Registered Societies (Non-Government)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111924,0745b1ca-0b4b-33a5-a267-92e4657edb50,Hariom Samaj Vikas Samiti,homsvs1989@yahoo.co.in,,,,,,,186/126 A Allenganj Allahabad,...,,,,,,,,,,Registered Societies (Non-Government)
111925,b79235e9-20d9-3c76-844a-d19f9a13d373,Narmadanchal Naya Jeevan Jan Kalyaan Seva Sami...,nnjjkss1@gmail.com,,,,,,,"Hanuman Mandir, Sanjay Nagar, Gwaltoli, Hoshan...",...,,,,Kamla Yadav,,yadavvandana03081976@gmail.com,,,,Registered Societies (Non-Government)
111926,e8ca818c-7d57-30d3-af7a-8bb65312bd1e,Mathura Prasad Gramodyog Sansthan,mathuravikas1977@gmail.com,,,,,,,186/126 A Allenganj Allahabad,...,Gulrej,,mathuravikas1977@gmail.com,Farzana,,mathuravikas1977@gmail.com,,,,Registered Societies (Non-Government)
111927,20e812e0-344b-3c81-b033-e271d94572ec,Shree Swaminarayan Education Trust,snsdharampur@gmail.com,,,,,,,"College Road, At. Po. Motapondha Ta. Kaprada, ...",...,,,,Rameshbhai,,snsmotapondha@gmail.com,,,,Trust (Non-Government)


In [63]:
# Export to CSV
NgoBackground.to_csv('NgoBackground.csv', index=False)
NgoRegistration.to_csv('NgoRegistration.csv', index=False)
NgoContact.to_csv('NgoContact.csv', index=False)