In [11]:
import pandas as pd
import seaborn as sns

In [141]:
# read in data
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
file_path = 'https://raw.githubusercontent.com/Lambda-School-Labs/Labs25-Bridges_to_Prosperity-TeamC-ds/main/B2P%20Rwanda%20Site%20Assessment%20Data_2020.06.03.csv'
df = pd.read_csv(file_path, encoding='latin-1')
print(f'Dimensions of dataset: {df.shape}')
df.head()

In [142]:
# split the Form: Form Name to get the data from the column and convert date to datetime

df['Assessment Date'] = df['Form: Form Name'].str.replace('Project Assessment - ', "")

df['Assessment Date'] = pd.to_datetime(df['Assessment Date'])
print(f'Dimensions of dataset: {df.shape}')
df.head()

In [14]:
# what are the duplicate project codes?
# it looks like that all the duplicates have two values. One has three.
# a quick and simple way to remove the duplicates would keep the sorting by project code and assesment date
# then drop the duplicates and keep the first occurence. That way we have the most recent date. 

# this was our test to find all the duplicates that we needed to drop
# testing = df[df.duplicated(subset='Project Code', keep=False)].sort_values(by=['Project Code', 'Assessment Date'], ascending=False)
# testing.head()
# testing[testing['Individuals Directly Served'].notna()]

# Indices that we 

In [14]:
# manually went thru above duplicates and selected those that needed to be DROPPED. 

indexes_to_drop = [192,210,216,767,202,1119,908,735,901,272,643,756,
                   738,193,655,642,327,909,1274,1289,135,176,1213,
                   493,178,205,387,397,395,398,399,401,372,371,892,
                   839,776,171,169,170,1117,1096,771,1109,95,89,132,133,134,
                   ]


df = df.drop(index=indexes_to_drop)
print(f'Dimensions of dataset: {df.shape}')

Dimensions of dataset: (1423, 28)


In [15]:
# make sure there are no duplicates 

df.duplicated(subset='Project Code', keep=False).value_counts()

False    1423
dtype: int64

In [94]:
# there are some columns that are all null values, we can drop those.
df.isnull().sum()


In [95]:
# these are columns we can drop
col_drop = ['Community Served 6', 'Community Served 7',
            'Community Served 8', 'Community Served 9',
            'Community Served 10'
           ]

# these are columns that we want as Ids when we melt the table
ID_variable = ['Country','Province', 'District', 'Sector', 'Cell', 
               'Bridge Site Name', 'Project Stage', 
               'Project Sub-Stage', 'Project Code', 'Bridge Type',
               ' Span (m)', ' GPS (Latitude)', 'GPS (Longitude)',
               'Individuals Directly Served', 'Form: Form Name',
               'CaseSafeID Form', 'Bridge Opportunity: Opportunity ID',
               'Assessment Date'
              ]

# these are the columns that we want to melt to values
value_variables = ['Community Served 1', 'Community Served 2',
                   'Community Served 3', 'Community Served 4',
                   'Community Served 5'
                  ]

In [96]:
# melt the dataframe so we get all the communities that a bridge would serve in one column
# this will make duplicate observations of bridges. THis is okay since we want to be able to get counts
# of how many villages a bridge would service.

b2p_df = pd.melt(df, id_vars=ID_variable, value_vars=value_variables, var_name='Original_Community_col', value_name='Community_Served', )
print(f'Dimensions of dataset: {b2p_df.shape}')
b2p_df.head()

Dimensions of dataset: (7115, 20)


Unnamed: 0,Country,Province,District,Sector,Cell,Bridge Site Name,Project Stage,Project Sub-Stage,Project Code,Bridge Type,Span (m),GPS (Latitude),GPS (Longitude),Individuals Directly Served,Form: Form Name,CaseSafeID Form,Bridge Opportunity: Opportunity ID,Assessment Date,Original_Community_col,Community_Served
0,Rwanda,Western Province,Rusizi,Giheke,Gakomeye,Buzi,Rejected,Technical,1014107,Suspended,,-2.42056,28.9662,,Project Assessment - 2018.10.29,a1if1000002e51bAAA,006f100000d1fk1,2018-10-29,Community Served 1,Buzi
1,Rwanda,Western Province,Rusizi,Giheke,Gakomeye,Kamigisha,Rejected,Technical,1014106,Suspended,,-2.42486,28.95726,,Project Assessment - 2018.10.29,a1if1000002e51WAAQ,006f100000d1fjw,2018-10-29,Community Served 1,Kabuga
2,Rwanda,Northern Province,Rulindo,Buyoga,Gahororo-Gipfundo,Gipfundo,Rejected,Technical,1007651,Suspended,8.0,-1.72053,30.08124,,Project Assessment - 2018.8.11,a1if10000025nz8AAA,006f100000a86I3,2018-08-11,Community Served 1,Gapfundo
3,Rwanda,Northern Province,Gicumbi,Kageyo,Kabuga/gatobotobo,Nyarubande,Rejected,,1012493,Other,,-1.65595,30.07884,,Project Assessment - 2018.8.11,a1if10000025nzDAAQ,006f100000cPpL8,2018-08-11,Community Served 1,
4,Rwanda,Southern Province,Kamonyi,Kayenzi,Kirwa,Gisizi,Identified,Requested,1014318,Suspended,,-1.870868,29.877686,,Project Assessment - 2018.11.15,a1if1000002gMwRAAU,006f100000eescb,2018-11-15,Community Served 1,Gisizi


In [97]:
# now that we have our melted dataframe, we want to delete any observation that has a 
# null value in the 'Community_Served' column

b2p_df = b2p_df[b2p_df['Community_Served'].notna()]
print(f'Dimensions of dataset: {b2p_df.shape}')
b2p_df.head()

Dimensions of dataset: (4483, 20)


Unnamed: 0,Country,Province,District,Sector,Cell,Bridge Site Name,Project Stage,Project Sub-Stage,Project Code,Bridge Type,Span (m),GPS (Latitude),GPS (Longitude),Individuals Directly Served,Form: Form Name,CaseSafeID Form,Bridge Opportunity: Opportunity ID,Assessment Date,Original_Community_col,Community_Served
0,Rwanda,Western Province,Rusizi,Giheke,Gakomeye,Buzi,Rejected,Technical,1014107,Suspended,,-2.42056,28.9662,,Project Assessment - 2018.10.29,a1if1000002e51bAAA,006f100000d1fk1,2018-10-29,Community Served 1,Buzi
1,Rwanda,Western Province,Rusizi,Giheke,Gakomeye,Kamigisha,Rejected,Technical,1014106,Suspended,,-2.42486,28.95726,,Project Assessment - 2018.10.29,a1if1000002e51WAAQ,006f100000d1fjw,2018-10-29,Community Served 1,Kabuga
2,Rwanda,Northern Province,Rulindo,Buyoga,Gahororo-Gipfundo,Gipfundo,Rejected,Technical,1007651,Suspended,8.0,-1.72053,30.08124,,Project Assessment - 2018.8.11,a1if10000025nz8AAA,006f100000a86I3,2018-08-11,Community Served 1,Gapfundo
4,Rwanda,Southern Province,Kamonyi,Kayenzi,Kirwa,Gisizi,Identified,Requested,1014318,Suspended,,-1.870868,29.877686,,Project Assessment - 2018.11.15,a1if1000002gMwRAAU,006f100000eescb,2018-11-15,Community Served 1,Gisizi
5,Rwanda,Southern Province,Kamonyi,Kayenzi,Kirwa,Ruheka,Rejected,Technical,1014319,Suspended,,-1.883957,29.850548,,Project Assessment - 2018.11.15,a1if1000002gMwqAAE,006f100000eescl,2018-11-15,Community Served 1,Ruheka


In [98]:
# lets check to see if there are still null values that we care about in the dataset

b2p_df.isnull().sum()


In [100]:
b2p_df.sort_values(by='Project Code').head()

In [23]:
print(b2p_df['Province'].unique())

print(b2p_df['District'].unique())

['Western Province' 'Northern Province' 'Southern Province' 'Kigali'
 'Eastern Province']
['Rusizi' 'Rulindo' 'Kamonyi' 'Gasabo' 'Gisagara' 'Kirehe' 'Rwamagana'
 'Gatsibo' 'Bugesera' 'Rubavu' 'Muhanga' 'Gakenke' 'Ngoma' 'Nyaruguru'
 'Nyagatare' 'Nyarugenge' 'Kicukiro' 'Ngororero' 'Nyanza' 'Karongi'
 'Gicumbi' 'Rutsiro' 'Ruhango' 'Huye' 'Musanze' 'Burera' 'Kayonza'
 'Nyamasheke' 'Nyabihu' 'Nyamagabe']


In [245]:
# Read in the Data of Government Entities with Government IDs

gov_file_path = 'https://raw.githubusercontent.com/tmbern/Labs25-Bridges_to_Prosperity-TeamC-ds/main/Rwanda%20Administrative%20Levels%20and%20Codes_Province%20through%20Village_2019.02.28.csv'
gov_df = pd.read_csv(gov_file_path, encoding='latin-1')
print(f'Dimensions of dataset: {gov_df.shape}')
gov_df.head()


In [102]:
# District ID dict 
district_ID_dict = gov_df['Dist_ID'].groupby(gov_df['District']).unique().apply(pd.Series).to_dict()[0]
district_ID_dict

{'Bugesera': 57,
 'Burera': 44,
 'Gakenke': 42,
 'Gasabo': 12,
 'Gatsibo': 53,
 'Gicumbi': 45,
 'Gisagara': 22,
 'Huye': 24,
 'Kamonyi': 28,
 'Karongi': 31,
 'Kayonza': 54,
 'Kicukiro': 13,
 'Kirehe': 55,
 'Muhanga': 27,
 'Musanze': 43,
 'Ngoma': 56,
 'Ngororero': 35,
 'Nyabihu': 34,
 'Nyagatare': 52,
 'Nyamagabe': 25,
 'Nyamasheke': 37,
 'Nyanza': 21,
 'Nyarugenge': 11,
 'Nyaruguru': 23,
 'Rubavu': 33,
 'Ruhango': 26,
 'Rulindo': 41,
 'Rusizi': 36,
 'Rutsiro': 32,
 'Rwamagana': 51}

In [104]:
# There are the same amount of government Districts as there is in B2P districts. 
len(b2p_df['District'].unique()) == len(gov_df['District'].unique())
# check length of "district" strings on both dfs
sorted(b2p_df['District'].unique()) == sorted(gov_df['District'].unique())

True

In [28]:
b2p_df.columns

Index(['Country', 'Province', 'District', 'Sector', 'Cell', 'Bridge Site Name',
       'Project Stage', 'Project Sub-Stage', 'Project Code', 'Bridge Type',
       ' Span (m)', ' GPS (Latitude)', 'GPS (Longitude)',
       'Individuals Directly Served', 'Form: Form Name', 'CaseSafeID Form',
       'Bridge Opportunity: Opportunity ID', 'Assessment Date',
       'Original_Community_col', 'Community_Served'],
      dtype='object')

In [29]:
gov_df.columns

Index(['Prov_ID', 'Province', 'Dist_ID', 'District', 'Sect_ID', 'Sector',
       'Cell_ID', 'Cell', 'Vill_ID', 'Village', 'Status', 'FID'],
      dtype='object')

In [105]:
# Sector ID 
# check that df['sector'] == gov_df['sector']
len(b2p_df['Sector'].unique()) == len(gov_df['Sector'].unique())

False

In [106]:
# check df['sector'] not in gov_df['sector']
x = [x for x in set(b2p_df['Sector']) if x not in set(gov_df['Sector'])]
len(x)

348

In [107]:
# Strip sectors down to single sector names 
# with regular expressions 
import re

b2p_df['new_sector'] = [re.split("-|~|_|(sector)|(Sector)|(cell)|\(|~", str(i))[0] for i in b2p_df['Sector']]
b2p_df['new_sector'] = b2p_df['new_sector'].str.replace(" ","")
b2p_df['new_sector'].head()

0     Giheke
1     Giheke
2     Buyoga
4    Kayenzi
5    Kayenzi
Name: new_sector, dtype: object

In [108]:
# confirm that df sector list is less than gov list of sectors - important 
b2p_df['new_sector'].nunique() < gov_df['Sector'].nunique()

True

In [61]:
# we group df by province district and sector. 
# get the count of the number of timex that a particular village is in that grouping. 

gov_df.head()

In [109]:
b2p_df['test_cell'] = b2p_df['Cell'].astype(str)

In [81]:
# test_list = [re.split("-|~|_|/|\(|\)", i) for i in b2p_df['test_cell']]

In [110]:
# helper function to clean characters in Cell string
def replace_char(x):
    """
    args: Cell
    returns: Cell value where chr replaced with whitespace
    """
    if type(x) == str:
         char = ['-', '~', '(', ')', '/', '_', 'between', 'villages', 
                'village', ' both', 'Both',' in ' , ' are ', ' village', 
                'all are in', 'village ', ' Village', 'Village ', '&', 'Between', ' Between ', 'Between ', 
                ' and ', ' of ', 'Village', 'Sector', 'sector']
         for i in char:
             if i in x:
                 x = x.replace(i, ' ')
    return x

In [111]:
# apply helper function to Cell column
b2p_df['test_cell'] = b2p_df['Cell'].apply(replace_char)

In [140]:
# Create list of Cells in govt data 
list_of_gov_cells = list(gov_df['Cell'].unique())
sorted(list_of_gov_cells)[600:]

In [242]:
# b2p_df[b2p_df['test_cell'].str.contains('cell',na=False)]

In [221]:
# Fill NaN with "Unknown"
b2p_df['test_cell'] = b2p_df['test_cell'].fillna("Unknown")

In [236]:
# Helper function to
# Complete cleaning of Cell
# return "Unknown for values not in govt data"
def cell_cleaner(x):
    """
    args: Cell
    return: clean value for Cell or "Unknown"
    """
    if type(x) == str:
        for i in list_of_gov_cells:
            if i in x.title():
                return i
            elif "Congo" in x:
                return "Congo-nil"
    return "Unknown"

In [237]:
# Apply helper function 
b2p_df['test_cell'] = b2p_df['test_cell'].apply(cell_cleaner)
b2p_df['test_cell'].nunique()

537

In [238]:
# confirm if helper function worked
x = [x for x in b2p_df['test_cell'] if (x not in list_of_gov_cells) and (x != "Unknown")]
len(x)

0

In [249]:
# Province dictionary. Key is B2P Province name, Value is government province ID
province_dict = {'Northern Province': 4,
                 'Southern Province': 2,
                 'Eastern Province': 5,
                 'Western Province': 3,
                 'Kigali': 1
                 }
# Map Province colummn to Province code
b2p_df['Prov_ID'] = b2p_df.Province.map(province_dict)

In [259]:
# # District ID dict 
district_ID_dict = gov_df['Dist_ID'].groupby(gov_df['District']).unique().apply(pd.Series).to_dict()[0]
# sorted(district_ID_dict)
# map District ID to District 
b2p_df['District_ID'] = b2p_df.District.map(district_ID_dict)

In [302]:
# Change gov_df.Sector to title case 
gov_df['Sector'] = gov_df['Sector'].apply(lambda x: x.title())

In [282]:
gov_sectors = list(gov_df['Sector'].unique())

In [307]:
# fill unknown sectors 
b2p_df['new_sector'] = b2p_df['new_sector'].fillna("Unknown")
b2p_df['new_sector'].isnull().sum()

0

In [308]:
b2p_df['new_sector'] = b2p_df['new_sector'].apply(lambda x: x.title())

In [310]:
def replace_sectors(x):
    for i in gov_sectors:
        if i in x:
            return i
    return x

In [311]:
b2p_df['new_sector'] = b2p_df['new_sector'].apply(replace_sectors)

In [380]:
corrected_dict = {
                    "Betweenmuhororo": "Muhororo",
                    "Rwanamiro": "Rwaniro",
                    "Rukuzo": "Rukozo",
                    "Rusheshe": "Rusasa",
                    "Gashali": "Gashari",
                    "Bwisìge": "Bwisige",
                    "Gasakka": "Gasaka",
                    "Giko": "Gikondo",
                    "Kabagari": "Kabagali",
                    "Kabengera": "Karengera",
                    "Katabagema": "Katabagemu",
                    "Kibingo": "Kibungo",
                    "Koza": "Kazo",
                    "Mushikir": "Mushikiri",
                    "Mwili": "Mwiri",
                    "Mwumba": "Mamba",
                    "Nkaka": "Nkanka",
                    "Nyagihinga": "Nyagihanga",
                    "Nyakariro": "Nyakaliro",
                    "Rerenge": "Rurenge",
                    "Rugalika": "Rugarika",
                    "Rugamba": "Ruramba",
                    "Buruhukioro": "Buruhukiro",
                    "Bushonyi": "Mushonyi",
                    "Cyingwa": "Cyungo",
                    "Gikheke": "Gikonko",
                    "Mulinga": "Muringa",
                    "Nan": "Unknown",
                    "Nyirangarama": "Ngarama",
                    "Bambiro": "Nyange",
                    "Cyimpindu": "Muhororo",
                    "Mutongo": "Muhororo",
                    "Rubirizi": "Kanombe",
                    "Buheta": "Gatebe",
                    "Gushali": "Nyagatare",

}

# Map correct sector to sector 
b2p_df['new_sector'].replace(corrected_dict,inplace=True)

In [381]:
# find incorrect values 
g = [x for x in b2p_df['new_sector'] if x not in gov_sectors]
len(g)

18

In [382]:
# Unknow sectors - resolve this 
sorted(set(g))

['Gahurizo', 'Kabukuba', 'Kamuragi', 'Nyakabingo', 'Pera', 'Unknown']

In [386]:
b2p_df[b2p_df['new_sector']=='Pera']

Unnamed: 0,Country,Province,District,Sector,Cell,Bridge Site Name,Project Stage,Project Sub-Stage,Project Code,Bridge Type,Span (m),GPS (Latitude),GPS (Longitude),Individuals Directly Served,Form: Form Name,CaseSafeID Form,Bridge Opportunity: Opportunity ID,Assessment Date,Original_Community_col,Community_Served,new_sector,test_cell,Prov_ID,District_ID
63,Rwanda,Western Province,Rusizi,Pera,,Rusayo,Rejected,Technical,1007661,Suspension,25.0,-2.697308,29.029517,17270.0,Project Assessment - 2018.12.13,a1if1000002qz1JAAQ,006f100000a86ID,2018-12-13,Community Served 1,Pera cell,Pera,Unknown,3,36
1486,Rwanda,Western Province,Rusizi,Pera,,Rusayo,Rejected,Technical,1007661,Suspension,25.0,-2.697308,29.029517,17270.0,Project Assessment - 2018.12.13,a1if1000002qz1JAAQ,006f100000a86ID,2018-12-13,Community Served 2,Kizura cell,Pera,Unknown,3,36


In [387]:
b2p_df.head()

Unnamed: 0,Country,Province,District,Sector,Cell,Bridge Site Name,Project Stage,Project Sub-Stage,Project Code,Bridge Type,Span (m),GPS (Latitude),GPS (Longitude),Individuals Directly Served,Form: Form Name,CaseSafeID Form,Bridge Opportunity: Opportunity ID,Assessment Date,Original_Community_col,Community_Served,new_sector,test_cell,Prov_ID,District_ID
0,Rwanda,Western Province,Rusizi,Giheke,Gakomeye,Buzi,Rejected,Technical,1014107,Suspended,,-2.42056,28.9662,,Project Assessment - 2018.10.29,a1if1000002e51bAAA,006f100000d1fk1,2018-10-29,Community Served 1,Buzi,Giheke,Gako,3,36
1,Rwanda,Western Province,Rusizi,Giheke,Gakomeye,Kamigisha,Rejected,Technical,1014106,Suspended,,-2.42486,28.95726,,Project Assessment - 2018.10.29,a1if1000002e51WAAQ,006f100000d1fjw,2018-10-29,Community Served 1,Kabuga,Giheke,Gako,3,36
2,Rwanda,Northern Province,Rulindo,Buyoga,Gahororo-Gipfundo,Gipfundo,Rejected,Technical,1007651,Suspended,8.0,-1.72053,30.08124,,Project Assessment - 2018.8.11,a1if10000025nz8AAA,006f100000a86I3,2018-08-11,Community Served 1,Gapfundo,Buyoga,Gahororo,4,41
4,Rwanda,Southern Province,Kamonyi,Kayenzi,Kirwa,Gisizi,Identified,Requested,1014318,Suspended,,-1.870868,29.877686,,Project Assessment - 2018.11.15,a1if1000002gMwRAAU,006f100000eescb,2018-11-15,Community Served 1,Gisizi,Kayenzi,Kirwa,2,28
5,Rwanda,Southern Province,Kamonyi,Kayenzi,Kirwa,Ruheka,Rejected,Technical,1014319,Suspended,,-1.883957,29.850548,,Project Assessment - 2018.11.15,a1if1000002gMwqAAE,006f100000eescl,2018-11-15,Community Served 1,Ruheka,Kayenzi,Kirwa,2,28
