# Schema

https://docs.google.com/document/d/1wDCb29oYVJIF2RPNtrnzxEYs1lH0J4RwYACDIn4kPhA/edit#heading=h.4wl6nl945n7g

In [2]:
import pandas as pd
import validators
import numpy as np
import uuid
import re

In [3]:
df = pd.read_csv("Final_IndiaNGO.csv", low_memory=False) 
df = df.loc[:, ~df.columns.str.contains('^Unnamed')] 
df.head()

Unnamed: 0,NGO Name,Unique Id of VO/NGO,President,Secretary,Treasurer,Registered With,Type of NGO,Registration Number,City of Registration,State of Registration,...,Key Issues,Operational States,Operational Districts,Chief Functionary,Chairman,Joint Secretary,Others,Actor,NGO Registration certificate,Telephone 2
0,Smile Foundation Sikkim,SK/2018/0220160,Sumitra Subba,Reema Subba,Khimoo Lepcha,Any Other,Registered Societies (Non-Government),E21/SAM/2805,Gangtok,SIKKIM,...,This Smile Foundation Sikkim NGO charity is wo...,Not Available,Not Available,Brahm Muni,Sant Ganga Ram,Pushpa Mohan suthar,nikita suresh bhalerao,jitendra asshok shirsate,Contact Us for certificate,9832557651
1,Vivek Mission Charitable Trust,PB/2016/0108881,Parvati Jangid,Ram Tirth,Amarjeet Singh,Registrar of Societies,Trust (Non-Government),082,Bathinda,PUNJAB,...,This Vivek Mission Charitable Trust NGO charit...,PUNJAB,PUNJAB->Bathinda,Indra Narayan Pradhan,Mohan Prakash Pradhan,sameena begam,ASHA SUKAYE,KOLATA NARASIMHA REDDY,Contact Us for certificate,09947421050
2,Youth Parliament,RJ/2018/0184767,Kumar Chhetri,Dr Mohan L Suthar,Hari Das Nepal,Sub-Registrar,Trust (Non-Government),2164,JODHPUR,RAJASTHAN,...,This Youth Parliament NGO charity is working o...,Not Available,Not Available,Vimhazo,Visenuo Avibu John,MAN SINGH RAJPUT,DAISY,MAHANTHESH GOWRIHALLI,Contact Us for certificate,09447387538
3,samaj sudhar samiti,MH/2018/0220090,MUMTAZ KHAN MANSURI,Monorath Nepal,Binita Karki,International Organisation,Academic Institutions (Private),F-3334/77,ujjain,LAKSHADWEEP,...,This samaj sudhar samiti NGO charity is workin...,SIKKIM,SIKKIM->East District,JAYARAMAN K,Shatung Phom,SUNITA GUGNANI,BABY K P,MAHANTHESH GOWRIHALLI,Contact Us for certificate,9745884441
4,AADHAR BAHUUDDESHIYA SEVA SANSTHA,SK/2019/0229258,MOHANLAL KHANDELWAL,Radhika Rai,Rusovil John,Any Other,Registered Societies (Non-Government),457/VOL No I,Gangtok,SIKKIM,...,This AADHAR BAHUUDDESHIYA SEVA SANSTHA NGO cha...,MANIPUR,MANIPUR->Churachandpur,MURHAVOYO TUNYI,Vimhazo,Ashok Varma,SEBASTIAN AROUJO,PALLI ESWAR RAO,Contact Us for certificate,080-23283823


In [22]:
NgoBackground = pd.DataFrame(columns=[
    'ngoId', 
    'hasRegistration', 
    'hasFinance',
    'logo', 
    'yearEstablished', 
    'missionStatement', 
    'vision', 
    'objectives', 
    'legalStatus', 
    'areasOfOperation', 
    'activeStatus'
])
NgoRegistration = pd.DataFrame(columns=[
    'ngoId',
    'isRegistrationFor',
    'ngoType',
    'regNo',
    'regDate',
    'registrar',
    '12ANo',
    '12ARegdate',
    '12AUpload',
    '80GNo',
    '80GRegdate',
    '80GUpload',
    '35ACNo',
    '35ACRegdate',
    '35ACUpload',
    'fCRANo',
    'fCRARegdate',
    'natureListed',
    'fCRAStatus',
    'fCRAExpiration',
    'fCRAUpload',
    'pan',
    'tan',
    'gst',
    'notes'
])

NgoContact = pd.DataFrame(columns=[
    'ngoId',
    'ngoName',
    'orgEmail',
    'officePhone',
    'primaryPoc',
    'primaryPocPhone',
    'secondaryPoc',
    'secondaryPocPhone',
    'mailingAddress',
    'physicalAddress',
    'fieldOffices',
    'orgType',
    'orgWebsite',
    'websiteIsValid',
    'facebook',
    'twitter',
    'iInstagram',
    'youtube',
    'whatsapp',
    'otherSocials',
    'scrapeSource',
    'executiveDirector',
    'technicalSupport',
    'chairmanName',
    'chairmanMobile',
    'chairmanEmail',
    'viceChairmanName',
    'viceChairmanMobile',
    'viceChairmanEmail',
    'secretaryName',
    'secretaryMobile',
    'secretaryEmail',
    'assistantSecretaryName',
    'assistantSecretaryMobile',
    'assistantSecretaryEmail'
])

In [23]:
df.columns

Index(['NGO Name', 'Unique Id of VO/NGO', 'President', 'Secretary',
       'Treasurer', 'Registered With', 'Type of NGO', 'Registration Number',
       'City of Registration', 'State of Registration', 'Date of Registration',
       'frca', 'City', 'State', 'Country', 'Telephone', 'Mobile Number',
       'Address', 'Email', 'Website', 'Key Issues', 'Operational States',
       'Operational Districts', 'Chief Functionary', 'Chairman',
       'Joint Secretary', 'Others', 'Actor', 'NGO Registration certificate',
       'Telephone 2'],
      dtype='object')

In [24]:
df.Telephone[df.Telephone == 'Not Available'].count()

35178

In [25]:
df.President

0              Sumitra Subba
1             Parvati Jangid
2              Kumar Chhetri
3        MUMTAZ KHAN MANSURI
4        MOHANLAL KHANDELWAL
                ...         
78692                    NaN
78693                    NaN
78694                    NaN
78695                    NaN
78696                    NaN
Name: President, Length: 78697, dtype: object

In [26]:
df.Chairman

0               Sant Ganga Ram
1        Mohan Prakash Pradhan
2           Visenuo Avibu John
3                 Shatung Phom
4                      Vimhazo
                 ...          
78692                      NaN
78693                      NaN
78694                      NaN
78695                      NaN
78696                      NaN
Name: Chairman, Length: 78697, dtype: object

In [33]:
df["NGO Name"].str.title()

0                  Smile Foundation Sikkim
1           Vivek Mission Charitable Trust
2                         Youth Parliament
3                      Samaj Sudhar Samiti
4        Aadhar Bahuuddeshiya Seva Sanstha
                       ...                
78692              Lok Sewa Sansthan Gonda
78693                    Lok Sewa Sansthan
78694                    Lok Sewa Sansthan
78695             Lok Sewa Sansthan Samiti
78696            Lokkalyans Amitibisal Pur
Name: NGO Name, Length: 78697, dtype: object

In [36]:
## Generate uuid3
df["NGO Name"].dropna()


NgoContact["ngoId"] = []

for name in df["NGO Name"].str.title():
    try:
        uid = uuid.uuid3(uuid.NAMESPACE_URL, name)
    except TypeError:
        uid = "NA"
        NgoContact["ngoId"].append(uid)

NgoBackground["ngoId"] = NgoContact["ngoId"]
NgoRegistration["ngoId"] = NgoContact["ngoId"]

NgoContact["ngoName"] = df["NGO Name"].str.title()
NgoContact['officePhone'] = df["Mobile Number"]
NgoContact["physicalAddress"] = df["Address"]
NgoContact["mailingAddress"] = df["Address"]
NgoContact['orgEmail'] = df['Email']
NgoContact["chairmanName"] = df['Chairman']
NgoContact["secretaryName"] = df['Secretary']
NgoContact['orgWebsite'] = df['Website']
def validate_url(url):
    if pd.isna(url): # handle NaN values
        return False
    if validators.url(url):
        return True
    else:
        return False
NgoContact['websiteIsValid'] = NgoContact['orgWebsite'].apply(validate_url)

NgoRegistration['registrar'] = df['Registered With']
NgoRegistration['ngoType'] = df['Type of NGO']
NgoRegistration['regNo'] = df['Registration Number']
NgoRegistration['regDate'] = df['Date of Registration']
NgoRegistration['fCRANo'] = df['fcra']

NgoBackground["missionStatement"] = df['Description']
NgoBackground['objectives'] = df['Key Issues']

TypeError: cannot concatenate object of type '<class 'str'>'; only Series and DataFrame objs are valid