In [1]:
import re

import pandas as pd
import seaborn as sns

In [2]:
# Display settings: never truncate columns or rows when a frame is shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Read the B2P Rwanda site-assessment CSV (latin-1 encoded) directly from GitHub.
file_path = 'https://raw.githubusercontent.com/Lambda-School-Labs/Labs25-Bridges_to_Prosperity-TeamC-ds/main/B2P%20Rwanda%20Site%20Assessment%20Data_2020.06.03.csv'
df = pd.read_csv(filepath_or_buffer=file_path, encoding='latin-1')

# Report the size of the raw frame and preview its first rows.
print(f'Dimensions of dataset: {df.shape}')
df.head()

Dimensions of dataset: (1472, 27)


Unnamed: 0,Country,Province,District,Sector,Cell,Bridge Site Name,Project Stage,Project Sub-Stage,Project Code,Bridge Type,Span (m),GPS (Latitude),GPS (Longitude),Individuals Directly Served,Form: Form Name,Community Served 1,Community Served 2,Community Served 3,Community Served 4,Community Served 5,Community Served 6,Community Served 7,Community Served 8,Community Served 9,Community Served 10,CaseSafeID Form,Bridge Opportunity: Opportunity ID
0,Rwanda,Western Province,Rusizi,Giheke,Gakomeye,Buzi,Rejected,Technical,1014107,Suspended,,-2.42056,28.9662,,Project Assessment - 2018.10.29,Buzi,Kabuga,Kagarama,Gacyamo,Gasheke,,,,,,a1if1000002e51bAAA,006f100000d1fk1
1,Rwanda,Western Province,Rusizi,Giheke,Gakomeye,Kamigisha,Rejected,Technical,1014106,Suspended,,-2.42486,28.95726,,Project Assessment - 2018.10.29,Kabuga,Buzi,Gacyamo - Gasheke,Kagarama,Kanoga,,,,,,a1if1000002e51WAAQ,006f100000d1fjw
2,Rwanda,Northern Province,Rulindo,Buyoga,Gahororo-Gipfundo,Gipfundo,Rejected,Technical,1007651,Suspended,8.0,-1.72053,30.08124,,Project Assessment - 2018.8.11,Gapfundo,Rusabira ( Gicumbi),Merezo,Minanire,Karutongo,,,,,,a1if10000025nz8AAA,006f100000a86I3
3,Rwanda,Northern Province,Gicumbi,Kageyo,Kabuga/gatobotobo,Nyarubande,Rejected,,1012493,Other,,-1.65595,30.07884,,Project Assessment - 2018.8.11,,,,,,,,,,,a1if10000025nzDAAQ,006f100000cPpL8
4,Rwanda,Southern Province,Kamonyi,Kayenzi,Kirwa,Gisizi,Identified,Requested,1014318,Suspended,,-1.870868,29.877686,,Project Assessment - 2018.11.15,Gisizi,Nyabitare,,,,,,,,,a1if1000002gMwRAAU,006f100000eescb


In [18]:
# Build 'Assessment Date' from 'Form: Form Name': remove the
# 'Project Assessment - ' prefix, then parse what is left as a datetime.
assessment_date_text = df['Form: Form Name'].str.replace('Project Assessment - ', "")
df['Assessment Date'] = pd.to_datetime(assessment_date_text)

print(f'Dimensions of dataset: {df.shape}')
df.head()

In [55]:
# What are the duplicate project codes?
# It looks like almost all of the duplicated project codes appear twice; one appears three times.
# A quick and simple way to remove the duplicates would be to sort by project code and assessment date (descending),
# then drop the duplicates and keep the first occurrence. That way we keep the most recent assessment.

# this was our test to find all the duplicates that we needed to drop
# testing = df[df.duplicated(subset='Project Code', keep=False)].sort_values(by=['Project Code', 'Assessment Date'], ascending=False)
# testing.head()
# testing[testing['Individuals Directly Served'].notna()]

# The indices that we decided to drop are listed in the next cell.

In [4]:
# Row labels (index values of `df`) that were selected by hand, after reviewing
# the duplicated project codes, as the rows that need to be DROPPED.
indexes_to_drop = [192,210,216,767,202,1119,908,735,901,272,643,756,
                   738,193,655,642,327,909,1274,1289,135,176,1213,
                   493,178,205,387,397,395,398,399,401,372,371,892,
                   839,776,171,169,170,1117,1096,771,1109,95,89,132,133,134,
                   ]

# errors='ignore' skips labels that are no longer present, so this cell can be
# re-run on the already-cleaned frame without raising a KeyError.
df = df.drop(index=indexes_to_drop, errors='ignore')
print(f'Dimensions of dataset: {df.shape}')

Dimensions of dataset: (1423, 28)


In [67]:
# Sanity check: flag every row whose 'Project Code' occurs more than once
# (keep=False marks all occurrences) and count the flags. Only False is expected.
repeated_project_code = df.duplicated(subset=['Project Code'], keep=False)
repeated_project_code.value_counts()

False    1423
dtype: int64

In [17]:
# Count the missing values in every column; columns that are entirely null
# are candidates for dropping.
df.isna().sum()


In [5]:
# Community columns 6 through 10: the columns we can drop.
col_drop = [f'Community Served {number}' for number in range(6, 11)]

# Columns used as identifier variables (id_vars) when the table is melted.
# The leading space in ' Span (m)' and ' GPS (Latitude)' is part of the
# column name used here -- do not strip it.
ID_variable = [
    'Country',
    'Province',
    'District',
    'Sector',
    'Cell',
    'Bridge Site Name',
    'Project Stage',
    'Project Sub-Stage',
    'Project Code',
    'Bridge Type',
    ' Span (m)',
    ' GPS (Latitude)',
    'GPS (Longitude)',
    'Individuals Directly Served',
    'Form: Form Name',
    'CaseSafeID Form',
    'Bridge Opportunity: Opportunity ID',
    'Assessment Date',
]

# Community columns 1 through 5: the columns that get melted into values.
value_variables = [f'Community Served {number}' for number in range(1, 6)]

In [16]:
# Reshape wide -> long so that every community a bridge would serve sits in a
# single 'Community_Served' column. Bridges are repeated once per community
# column on purpose: that lets us count how many villages each bridge serves.
melted_df = df.melt(
    id_vars=ID_variable,
    value_vars=value_variables,
    var_name='Original_Community_col',
    value_name='Community_Served',
)
print(f'Dimensions of dataset: {melted_df.shape}')
melted_df.head()

In [15]:
# Now that we have the melted dataframe, remove every observation that has a
# null value in the 'Community_Served' column.
# .copy() makes the result an independent frame, so adding columns to
# melted_df later does not trigger pandas' SettingWithCopyWarning.
has_community = melted_df['Community_Served'].notna()
melted_df = melted_df[has_community].copy()
print(f'Dimensions of dataset: {melted_df.shape}')
melted_df.head()

In [14]:
# Check whether the melted dataset still has null values in columns we care about.
melted_df.isna().sum()


In [13]:
# Preview the first five rows of the melted frame ordered by 'Project Code'.
melted_df.sort_values('Project Code', ascending=True).head(n=5)

In [125]:
# Distinct province names present in the B2P data.
print(melted_df['Province'].unique())

# Distinct district names present in the B2P data.
# The column is called 'District'; melted_df has no 'Dist' column, so the
# previous lookup raised a KeyError.
print(melted_df['District'].unique())

array(['Western Province', 'Northern Province', 'Southern Province',
       'Kigali', 'Eastern Province'], dtype=object)

In [19]:
# Load the table of government entities together with their government IDs
# (latin-1 encoded CSV, read directly from GitHub).
gov_file_path = 'https://raw.githubusercontent.com/tmbern/Labs25-Bridges_to_Prosperity-TeamC-ds/main/Rwanda%20Administrative%20Levels%20and%20Codes_Province%20through%20Village_2019.02.28.csv'
gov_df = pd.read_csv(filepath_or_buffer=gov_file_path, encoding='latin-1')

# Report the size of the government frame and preview its first rows.
print(f'Dimensions of dataset: {gov_df.shape}')
gov_df.head()


Dimensions of dataset: (14816, 12)


Unnamed: 0,Prov_ID,Province,Dist_ID,District,Sect_ID,Sector,Cell_ID,Cell,Vill_ID,Village,Status,FID
0,2,Amajyepfo,22,Gisagara,2205,Kigembe,220501,Agahabwa,22050101,Agahehe,Rural,1842
1,2,Amajyepfo,22,Gisagara,2205,Kigembe,220501,Agahabwa,22050102,Kabacuzi,Rural,1846
2,2,Amajyepfo,22,Gisagara,2205,Kigembe,220501,Agahabwa,22050103,Kamutozo,Rural,1854
3,2,Amajyepfo,22,Gisagara,2205,Kigembe,220501,Agahabwa,22050104,Kamweko,Rural,1866
4,2,Amajyepfo,22,Gisagara,2205,Kigembe,220501,Agahabwa,22050105,Nyamabuye,Rural,1873


In [26]:
# District ID dict: government district name -> government district ID (Dist_ID).
dist_id_by_district = gov_df.groupby('District')['Dist_ID']

# Every district name must map to exactly one ID; otherwise taking the first
# ID below would silently hide an inconsistency in the government data.
assert (dist_id_by_district.nunique() == 1).all(), 'each District must map to exactly one Dist_ID'

# groupby sorts its keys, so the dict is ordered alphabetically by district.
district_ID_dict = dist_id_by_district.first().to_dict()
district_ID_dict

{'Bugesera': 57,
 'Burera': 44,
 'Gakenke': 42,
 'Gasabo': 12,
 'Gatsibo': 53,
 'Gicumbi': 45,
 'Gisagara': 22,
 'Huye': 24,
 'Kamonyi': 28,
 'Karongi': 31,
 'Kayonza': 54,
 'Kicukiro': 13,
 'Kirehe': 55,
 'Muhanga': 27,
 'Musanze': 43,
 'Ngoma': 56,
 'Ngororero': 35,
 'Nyabihu': 34,
 'Nyagatare': 52,
 'Nyamagabe': 25,
 'Nyamasheke': 37,
 'Nyanza': 21,
 'Nyarugenge': 11,
 'Nyaruguru': 23,
 'Rubavu': 33,
 'Ruhango': 26,
 'Rulindo': 41,
 'Rusizi': 36,
 'Rutsiro': 32,
 'Rwamagana': 51}

In [126]:
# Province lookup: B2P province name (key) -> government province ID (value).
province_dict = dict([
    ('Northern Province', 4),
    ('Southern Province', 2),
    ('Eastern Province', 5),
    ('Western Province', 3),
    ('Kigali', 1),
])

In [140]:
# The B2P data and the government data should contain the same number of
# distinct districts. This used to be a bare expression whose result was
# discarded (only a cell's last expression is displayed), so it is now asserted.
b2p_districts = sorted(melted_df['District'].unique())
gov_districts = sorted(gov_df['District'].unique())
assert len(b2p_districts) == len(gov_districts), 'B2P and government data have a different number of districts'

# Check that the district names themselves match exactly in both frames.
b2p_districts == gov_districts

True

In [27]:
# Show the column labels of the melted B2P frame.
melted_df.columns

Index(['Country', 'Province', 'District', 'Sector', 'Cell', 'Bridge Site Name',
       'Project Stage', 'Project Sub-Stage', 'Project Code', 'Bridge Type',
       ' Span (m)', ' GPS (Latitude)', 'GPS (Longitude)',
       'Individuals Directly Served', 'Form: Form Name', 'CaseSafeID Form',
       'Bridge Opportunity: Opportunity ID', 'Assessment Date',
       'Original_Community_col', 'Community_Served'],
      dtype='object')

In [28]:
# Show the column labels of the government administrative-levels frame.
gov_df.columns

Index(['Prov_ID', 'Province', 'Dist_ID', 'District', 'Sect_ID', 'Sector',
       'Cell_ID', 'Cell', 'Vill_ID', 'Village', 'Status', 'FID'],
      dtype='object')

In [29]:
# Sector ID
# Compare the number of distinct 'Sector' values in the B2P frame with the
# number of distinct 'Sector' values in the government frame.
b2p_sector_count = len(melted_df['Sector'].unique())
gov_sector_count = len(gov_df['Sector'].unique())
b2p_sector_count == gov_sector_count

False

In [44]:
# Find the B2P 'Sector' values that do not appear in the government 'Sector' column.
# The government set is built once up front; previously it was rebuilt for
# every B2P sector inside the comprehension.
gov_sectors = set(gov_df['Sector'])
x = [sector for sector in set(melted_df['Sector']) if sector not in gov_sectors]
len(x)

348

In [196]:
# Strip sectors down to single sector names with regular expressions:
# keep only the text before the first separator ('-', '~', '_', '(') or the
# first occurrence of the words 'sector', 'Sector' or 'cell'.
# Raw string: '\(' in a normal string literal is an invalid escape sequence.
# The capture groups and the duplicated '~' of the old pattern are dropped;
# neither affects the first piece returned by split.
sector_delimiters = re.compile(r"-|~|_|sector|Sector|cell|\(")
melted_df['new_sector'] = [
    sector_delimiters.split(sector, maxsplit=1)[0]  # only the leading piece is needed
    for sector in melted_df['Sector']
]

# Remove all spaces (literal replacement, not a regex) from the cleaned names.
melted_df['new_sector'] = melted_df['new_sector'].str.replace(" ", "", regex=False)

# Preview only: display.max_rows is unlimited in this notebook, so showing the
# whole Series would print every row.
melted_df['new_sector'].head()

In [199]:
# Important: confirm that the cleaned B2P data has fewer distinct sectors than
# the government list of sectors.
gov_sector_total = gov_df['Sector'].nunique()
b2p_sector_total = melted_df['new_sector'].nunique()
b2p_sector_total < gov_sector_total

True