In [2]:
import os

import numpy as np
import pandas as pd
# Load dataset.
# The CSV location can be overridden through the DIGITAL_SERVICE_USAGE_CSV
# environment variable so the notebook also runs on machines where the file
# does not live under the original author's Downloads folder. When the
# variable is not set, the original path is used unchanged.
DATA_PATH = os.environ.get(
    "DIGITAL_SERVICE_USAGE_CSV",
    "C:/Users/SMRC/Downloads/DigitalServiceUsage_Rwanda.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Let's understand the nature of the data: column names, non-null counts and dtypes.
# DataFrame.info() writes its report directly and returns None, so it is called
# on its own; wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   District                1025 non-null   object 
 1   Service_Name            1025 non-null   object 
 2   Department              1025 non-null   object 
 3   Users_Reported          501 non-null    float64
 4   Satisfaction_Score_(%)  510 non-null    float64
 5   Year                    1025 non-null   int64  
 6   Month                   937 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 56.2+ KB
None


In [4]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,District,Service_Name,Department,Users_Reported,Satisfaction_Score_(%),Year,Month
0,Musanze,eTax Portal,Ministry of Health,910.0,,2022,Unknown
1,Gasabo,Land Registration Portal,Immigration Department,3822.0,93.6,2023,Dec
2,Rusizi,eTax Portal,Rwanda Revenue Authority,516.0,82.9,2024,Mar
3,Nyagatare,E-Visa Service,Ministry of ICT,4476.0,,2024,Jun
4,Nyagatare,Digital Health Records,Rwanda Revenue Authority,,,2023,May


In [5]:
# Preview the last five rows of the freshly loaded data.
df.tail(n=5)

Unnamed: 0,District,Service_Name,Department,Users_Reported,Satisfaction_Score_(%),Year,Month
1020,Kayonza,Land Registration Portal,Rwanda Revenue Authority,1115.0,,2022,Sep
1021,Nyagatare,E-Visa Service,Ministry of ICT,1009.0,84.8,2023,Jan
1022,Rwamagana,Irembo Services,Rwanda Revenue Authority,,78.9,2024,May
1023,Nyarugenge,Irembo Services,Ministry of Health,370.0,,2023,Oct
1024,Kicukiro,eTax Portal,Ministry of Health,,77.0,2021,Nov


In [6]:
# Summary statistics for the numeric columns (quartiles spelled out explicitly).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Users_Reported,Satisfaction_Score_(%),Year
count,501.0,510.0,1025.0
mean,2524.217565,75.161569,2022.437073
std,1431.196236,14.497803,1.119863
min,51.0,50.1,2021.0
25%,1268.0,62.3,2021.0
50%,2447.0,75.45,2022.0
75%,3798.0,87.7,2023.0
max,4983.0,100.0,2024.0


In [7]:
# Count the missing entries in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

District                    0
Service_Name                0
Department                  0
Users_Reported            524
Satisfaction_Score_(%)    515
Year                        0
Month                      88
dtype: int64


We observe that Users_Reported is missing 524 values, while Satisfaction_Score_(%) is missing 515. Since these attributes are important to the analysis, we fill the missing values with each column's median, as both attributes are continuous in nature.

In [8]:
# Fill the gaps in both numeric columns with that column's own median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the df[...] selection: that form is a chained
# in-place update, which pandas 2.x deprecates and which does not modify df
# once Copy-on-Write is enabled.
for numeric_column in ('Users_Reported', 'Satisfaction_Score_(%)'):
    column_median = df[numeric_column].median()
    df[numeric_column] = df[numeric_column].fillna(column_median)

In [10]:
# Re-count missing entries after the numeric imputation.
missing_per_column = df.isna().sum()
print(missing_per_column)

District                   0
Service_Name               0
Department                 0
Users_Reported             0
Satisfaction_Score_(%)     0
Year                       0
Month                     88
dtype: int64


We observe that the Month attribute still has missing values, so it needs to be handled independently. We start by checking its format.

In [11]:
# List the distinct Month labels, including any missing marker.
month_labels = df['Month'].unique()
month_labels

array(['Unknown', 'Dec', 'Mar', 'Jun', 'May', 'Aug', nan, 'Apr', 'Sep',
       'Feb', 'Jan', 'Nov', 'Jul', 'Oct'], dtype=object)

Let's handle inconsistencies first, and then fill the missing months by related group (the most frequent month within each district).

In [12]:
# Normalise Month labels: trim surrounding whitespace, then title-case them.
trimmed_month = df['Month'].str.strip()
df['Month'] = trimmed_month.str.title()

In [13]:
def _fill_with_group_mode(month_values):
    """Fill missing entries of one group with that group's most frequent value.

    Falls back to the label 'Unknown' when the group has no non-missing value
    to take a mode from.
    """
    group_modes = month_values.mode()
    if group_modes.empty:
        replacement = 'Unknown'
    else:
        replacement = group_modes[0]
    return month_values.fillna(replacement)


# NOTE(review): the groups are formed on the District labels as they are at
# this point in the notebook, i.e. before the misspelled variants
# (e.g. 'Musanze-', 'Huyex') are corrected further down.
month_by_district = df.groupby('District')['Month']
df['Month'] = month_by_district.transform(_fill_with_group_mode)

In [14]:
# Confirm that no column has missing entries left.
missing_per_column = df.isna().sum()
print(missing_per_column)

District                  0
Service_Name              0
Department                0
Users_Reported            0
Satisfaction_Score_(%)    0
Year                      0
Month                     0
dtype: int64


The above output shows that all the missing values have been handled. Note that some rows still carry the placeholder month 'Unknown', which is not counted as missing.

In [15]:
# Let's check for duplicated rows (every repeat after the first occurrence).
duplicate_count = df.duplicated(keep='first').sum()
print(duplicate_count)

25


We find that there are 25 duplicate rows in this dataset.

In [17]:
# Display the rows flagged as repeats of an earlier row.
is_repeat = df.duplicated()
df.loc[is_repeat]

Unnamed: 0,District,Service_Name,Department,Users_Reported,Satisfaction_Score_(%),Year,Month
1000,Rubavu,Land Registration Portal,Ministry of Lands,2447.0,68.2,2021,Mar
1001,Kayonza,Irembo Services,Ministry of Health,1290.0,99.7,2022,Jun
1002,Gakenke,E-Visa Service,Ministry of Health,2447.0,69.5,2022,Unknown
1003,Rusizi,eTax Portal,Rwanda Revenue Authority,4458.0,75.45,2024,Mar
1004,Nyanza,E-Visa Service,Ministry of ICT,2447.0,75.45,2023,Jul
1005,Nyarugenge,Online Business Registration,Ministry of Health,2447.0,54.0,2021,Feb
1006,Karongi,E-Visa Service,Ministry of Health,4855.0,80.4,2022,Oct
1007,Nyarugenge,Digital Health Records,Immigration Department,2447.0,67.8,2021,Jul
1008,Rulindo,Irembo Services,Ministry of Health,3980.0,75.45,2023,Mar
1009,Bugesera,eTax Portal,Ministry of Lands,2447.0,75.45,2022,Mar


In [18]:
# Remove duplicated rows, keeping the first occurrence of each.
# The frame is modified in place; the original index labels are retained.
df.drop_duplicates(keep='first', inplace=True)

In [19]:
# Verify that no duplicated rows remain.
duplicate_count = df.duplicated(keep='first').sum()
print(duplicate_count)

0


We observe that the duplicate rows have been removed.

In [20]:
# Print the distinct values of every object-typed (categorical) column.
text_columns = df.select_dtypes(include='object').columns
for column_name in text_columns:
    distinct_values = df[column_name].unique()
    print(f"\n{column_name} unique values:\n", distinct_values)

# Then print the dtype of every column.
column_types = df.dtypes
print(column_types)


District unique values:
 ['Musanze' 'Gasabo' 'Rusizi' 'Nyagatare' 'Rubavu' 'Burera' 'Nyarugenge'
 'Nyamagabe' 'Kicukiro' 'Huye' 'Nyanza' 'Rulindo' 'Rwamagana' 'Karongi'
 'Musanze-' 'Gakenke' 'Bugesera' 'Ngoma' 'Kayonza' 'Nyamagabe-' 'Ngoma-'
 'Huyex' 'Karongi-' 'Karongix' 'Bugeserax' 'Musanzex' 'Kicukirox'
 'Nyanzax']

Service_Name unique values:
 ['eTax Portal' 'Land Registration Portal' 'E-Visa Service'
 'Digital Health Records' 'Online Business Registration' 'Irembo Services']

Department unique values:
 ['Ministry of Health' 'Immigration Department' 'Rwanda Revenue Authority'
 'Ministry of ICT' 'Ministry of Lands']

Month unique values:
 ['Unknown' 'Dec' 'Mar' 'Jun' 'May' 'Aug' 'Apr' 'Sep' 'Feb' 'Jan' 'Nov'
 'Jul' 'Oct']
District                   object
Service_Name               object
Department                 object
Users_Reported            float64
Satisfaction_Score_(%)    float64
Year                        int64
Month                      object
dtype: object


We observe that the District column contains some misspelled district names. In the Department column, 'Ministry of Lands' does not exist in Rwanda; instead, there is the Ministry of Environment and the National Land Authority, which is in charge of land registration. Finally, in the Month column, I decided to spell out the full month names so that anyone can easily understand them.

In [21]:
# List the distinct District labels before correcting the misspellings.
district_labels = df['District'].unique()
district_labels

array(['Musanze', 'Gasabo', 'Rusizi', 'Nyagatare', 'Rubavu', 'Burera',
       'Nyarugenge', 'Nyamagabe', 'Kicukiro', 'Huye', 'Nyanza', 'Rulindo',
       'Rwamagana', 'Karongi', 'Musanze-', 'Gakenke', 'Bugesera', 'Ngoma',
       'Kayonza', 'Nyamagabe-', 'Ngoma-', 'Huyex', 'Karongi-', 'Karongix',
       'Bugeserax', 'Musanzex', 'Kicukirox', 'Nyanzax'], dtype=object)

In [22]:
# Correct spelling for each district, paired with the misspelled variants
# (stray '-' or 'x' suffix) that appear in the data.
district_variants = {
    'Musanze': ['Musanze-', 'Musanzex'],
    'Nyamagabe': ['Nyamagabe-'],
    'Ngoma': ['Ngoma-'],
    'Huye': ['Huyex'],
    'Karongi': ['Karongi-', 'Karongix'],
    'Bugesera': ['Bugeserax'],
    'Kicukiro': ['Kicukirox'],
    'Nyanza': ['Nyanzax'],
}

# Flatten into a {misspelled: correct} lookup and apply it to the column.
district_corrections = {
    misspelled: correct
    for correct, misspellings in district_variants.items()
    for misspelled in misspellings
}
df.loc[:, 'District'] = df['District'].replace(district_corrections)

In [23]:
# List the distinct District labels after correcting the misspellings.
district_labels = df['District'].unique()
district_labels

array(['Musanze', 'Gasabo', 'Rusizi', 'Nyagatare', 'Rubavu', 'Burera',
       'Nyarugenge', 'Nyamagabe', 'Kicukiro', 'Huye', 'Nyanza', 'Rulindo',
       'Rwamagana', 'Karongi', 'Gakenke', 'Bugesera', 'Ngoma', 'Kayonza'],
      dtype=object)

In [24]:
# Rename the 'Ministry of Lands' department label to 'National Land Authority'.
department_renames = {'Ministry of Lands': 'National Land Authority'}
df.loc[:, 'Department'] = df['Department'].replace(department_renames)

In [25]:
# List the distinct Department labels after the rename.
department_labels = df['Department'].unique()
department_labels

array(['Ministry of Health', 'Immigration Department',
       'Rwanda Revenue Authority', 'Ministry of ICT',
       'National Land Authority'], dtype=object)

In [26]:
# Expand three-letter month abbreviations to full month names.
# 'Unknown' and 'May' are already in their final form, so they need no entry.
month_full_names = {
    'Jan': 'January',
    'Feb': 'February',
    'Mar': 'March',
    'Apr': 'April',
    'Jun': 'June',
    'Jul': 'July',
    'Aug': 'August',
    'Sep': 'September',
    'Oct': 'October',
    'Nov': 'November',
    'Dec': 'December',
}

# replace() leaves any value without an entry untouched, whereas map() with a
# dict turns every unmapped value into NaN. This keeps the cell safe to re-run
# (already expanded names stay as they are) and matches how the District and
# Department columns are corrected.
df.loc[:, 'Month'] = df['Month'].replace(month_full_names)

In [28]:
# List the distinct Month labels after expanding the abbreviations.
month_labels = df['Month'].unique()
month_labels

array(['Unknown', 'December', 'March', 'June', 'May', 'August', 'April',
       'September', 'February', 'January', 'November', 'July', 'October'],
      dtype=object)

In [29]:
# Preview the first five rows of the cleaned data.
df.head(n=5)

Unnamed: 0,District,Service_Name,Department,Users_Reported,Satisfaction_Score_(%),Year,Month
0,Musanze,eTax Portal,Ministry of Health,910.0,75.45,2022,Unknown
1,Gasabo,Land Registration Portal,Immigration Department,3822.0,93.6,2023,December
2,Rusizi,eTax Portal,Rwanda Revenue Authority,516.0,82.9,2024,March
3,Nyagatare,E-Visa Service,Ministry of ICT,4476.0,75.45,2024,June
4,Nyagatare,Digital Health Records,Rwanda Revenue Authority,2447.0,75.45,2023,May


In [30]:
# Write the cleaned frame to a CSV file (relative path, so it lands in the
# notebook's working directory); the index is left out of the file.
cleaned_csv_name = "DigitalServiceUsage_Rwanda_cleaned_Dataset.csv"
df.to_csv(cleaned_csv_name, index=False)