# Load the dataset

In [None]:
import pandas as pd

# Read the "data" sheet of the raw IRENA off-grid statistics workbook.
raw_df = pd.read_excel(
    io="../1_datasets/raw_data/IRENA_OFGStats.raw.xlsx",
    sheet_name="data",
)

# Work on a deep copy so the frame that was read from disk stays untouched.
df = raw_df.copy(deep=True)
df.head()

Unnamed: 0,Region,UN Sub-region,Country,IRENA Label,ISO Code,Flow,Group Technology,Sub-Technology,Technology,Product Code,DataType,Value,Unit,Year,Ptype,Publication
0,Africa,Sub-Saharan Africa,Angola,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,24129.0,ELECCAP,0.0,MW,2000,220,2024 OFG RE Statistics
1,Africa,Sub-Saharan Africa,Angola,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,24129.0,ELECCAP,0.0,MW,2001,220,2024 OFG RE Statistics
2,Africa,Sub-Saharan Africa,Angola,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,24129.0,ELECCAP,0.0,MW,2002,220,2024 OFG RE Statistics
3,Africa,Sub-Saharan Africa,Angola,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,24129.0,ELECCAP,0.0,MW,2003,220,2024 OFG RE Statistics
4,Africa,Sub-Saharan Africa,Angola,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,24129.0,ELECCAP,0.0,MW,2004,220,2024 OFG RE Statistics


In [2]:
# Print each column's dtype and non-null count, then display (rows, columns).
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52343 entries, 0 to 52342
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Region            52343 non-null  object 
 1   UN Sub-region     52343 non-null  object 
 2   Country           52343 non-null  object 
 3   IRENA Label       52343 non-null  object 
 4   ISO Code          52343 non-null  object 
 5   Flow              52343 non-null  object 
 6   Group Technology  52343 non-null  object 
 7   Sub-Technology    52343 non-null  object 
 8   Technology        52343 non-null  object 
 9   Product Code      51205 non-null  float64
 10  DataType          52343 non-null  object 
 11  Value             52321 non-null  float64
 12  Unit              52343 non-null  object 
 13  Year              52343 non-null  int64  
 14  Ptype             52343 non-null  int64  
 15  Publication       52343 non-null  object 
dtypes: float64(2), int64(2), object(12)
memo

(52343, 16)

In [3]:
# Number of missing entries in every column.
df.isna().sum()

Region                 0
UN Sub-region          0
Country                0
IRENA Label            0
ISO Code               0
Flow                   0
Group Technology       0
Sub-Technology         0
Technology             0
Product Code        1138
DataType               0
Value                 22
Unit                   0
Year                   0
Ptype                  0
Publication            0
dtype: int64

- **`Product Code` has many null values (1,138), but the column is not relevant to our study, so it will be dropped along with `UN Sub-region`, `IRENA Label`, `Ptype`, `Publication`, and `DataType`.**

In [4]:
# Columns that are not needed for this study.
columns_to_drop = [
    "UN Sub-region",
    "IRENA Label",
    "Ptype",
    "Publication",
    "Product Code",
    "DataType",
]

# Build `df` from the untouched `raw_df` instead of dropping in place: with
# `inplace=True` a second run of this cell raised KeyError because the columns
# were already gone. This form gives the same result on every run.
df = raw_df.drop(columns=columns_to_drop)

In [5]:
# Preview the first rows to confirm which columns remain after the drop.
df.head()

Unnamed: 0,Region,Country,ISO Code,Flow,Group Technology,Sub-Technology,Technology,Value,Unit,Year
0,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0,MW,2000
1,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0,MW,2001
2,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0,MW,2002
3,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0,MW,2003
4,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0,MW,2004


- **Now, we should handle the 22 missing values in the "Value" Column**

In [6]:
# Collect every record whose "Value" is missing so it can be inspected.
value_is_missing = df["Value"].isna()
missing_rows = df.loc[value_is_missing]
missing_rows

Unnamed: 0,Region,Country,ISO Code,Flow,Group Technology,Sub-Technology,Technology,Value,Unit,Year
39940,Central America and the Caribbean,Bahamas (the),BHS,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
39946,Central America and the Caribbean,Belize,BLZ,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
39957,Central America and the Caribbean,Costa Rica,CRI,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
39971,Central America and the Caribbean,Dominica,DMA,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40021,Central America and the Caribbean,Nicaragua,NIC,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40035,Central America and the Caribbean,Puerto Rico,PRI,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40082,South America,Brazil,BRA,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40089,South America,Chile,CHL,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40141,Africa,Egypt,EGY,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023
40169,Africa,Sudan (the),SDN,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),,MW,2023


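- **Most of the rows with missing values belong to countries outside our scope, so we simply fill the nulls with zeros. Note that Sudan, one of the conflict countries studied below, is among them; its 2023 zero therefore means "not reported" rather than a measured value.**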

# Filling the missing values with zeros

In [7]:
# Zero-fill only the "Value" column. A blanket fillna(0) would also write a
# numeric 0 into any text column that happened to contain nulls.
df2 = df.fillna({"Value": 0})

# Total number of nulls left anywhere in the frame.
df2.isnull().sum().sum()

0

- Now, let's check the year range; our research study covers 2010 onward

In [8]:
# Distinct years present in the data, in order of first appearance.
df2["Year"].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021,
       2022, 2023], dtype=int64)

In [9]:
# Keep only the study period (2010 onward). The explicit .copy() makes df2 an
# independent frame, so adding a column to it in a later cell does not trigger
# SettingWithCopyWarning.
df2 = df2[df2["Year"] >= 2010].copy()
df2["Year"].unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
       2021, 2022, 2023], dtype=int64)

- **Now, we can add a conflict status column to the dataset to present whether a country experienced conflicts in the predefined period or not.**

In [10]:
# Conflict-affected countries keyed by their ISO 3166-1 alpha-3 code.
#
# Matching is done on the "ISO Code" column rather than on "Country" because
# the country labels in this dataset carry suffixes (e.g. "Sudan (the)",
# "Iran (Islamic Republic of)", "Philippines (the)"). Plain names such as
# "Syria" or "Palestine" can therefore fail to match and be silently labelled
# "No Conflict"; the ISO codes do not have that problem.
conflict_iso_codes = {
    "SYR": "Syria",
    "IRQ": "Iraq",
    "SDN": "Sudan (the)",
    "SSD": "South Sudan",
    "PSE": "Palestine",
    "MLI": "Mali",
    "ETH": "Ethiopia",
    "UKR": "Ukraine",
    "YEM": "Yemen",
    "LBY": "Libya",
    "AFG": "Afghanistan",
    "NGA": "Nigeria",
    "CAF": "Central African Republic",
    "SOM": "Somalia",
    "PAK": "Pakistan",
    "MOZ": "Mozambique",
    "MMR": "Myanmar",
    "TCD": "Chad",
    "COD": "Democratic Republic of the Congo",
}

# Same names, same order as before; kept so any cell that still refers to
# `conflict_countries` continues to work.
conflict_countries = list(conflict_iso_codes.values())

# Vectorised membership test instead of a row-by-row apply.
in_conflict = df2["ISO Code"].isin(list(conflict_iso_codes))

# .assign returns a new frame, so this cell never writes into a slice of an
# earlier frame and can be re-run safely.
df2 = df2.assign(
    **{
        "Conflict Status": in_conflict.map(
            {True: "Conflict", False: "No Conflict"}
        )
    }
)

# Fixed seed so the preview is identical on every Restart & Run All.
df2.sample(10, random_state=42)

Unnamed: 0,Region,Country,ISO Code,Flow,Group Technology,Sub-Technology,Technology,Value,Unit,Year,Conflict Status
50824,Africa,Djibouti,DJI,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,0.0245,MW,2018,No Conflict
34562,Africa,Burundi,BDI,Off-grid biogas production,Bioenergy,Biogas production,Total biogas production,2.44,"1,000 m3",2020,No Conflict
42675,Africa,Chad,TCD,Off-grid capacity,Solar energy,Solar lights,Solar lights (<11 W),0.06208,MW,2021,Conflict
49955,Africa,Senegal,SEN,Off-grid capacity,Solar energy,Solar PV (Public lighting),Other off-grid solar PV (of which public light...,0.0425,MW,2022,No Conflict
41546,Oceania,Vanuatu,VUT,Off-grid capacity,Solar energy,Solar home systems,Solar home systems (SHS 11-50 W),0.47239,MW,2022,No Conflict
35737,Africa,Zimbabwe,ZWE,Off-grid energy access,Solar energy,Solar lights,Number of people using solar lights (<11 W),596.330586,Thousands,2019,No Conflict
50111,Africa,Burundi,BDI,Off-grid capacity,Solar energy,Solar PV (Commercial/Public),Other off-grid solar PV (of which commercial/p...,0.003,MW,2015,No Conflict
47027,Middle East,Iran (Islamic Republic of),IRN,Off-grid capacity,Solar energy,Solar PV (Not specified),Other off-grid solar PV (of which not specified),0.378,MW,2013,No Conflict
41786,Asia,Thailand,THA,Off-grid capacity,Solar energy,Solar home systems,Solar home systems (SHS 11-50 W),0.02328,MW,2017,No Conflict
50094,Africa,Somalia,SOM,Off-grid capacity,Solar energy,Solar PV (Multi-purpose),Other off-grid solar PV (of which multi-purpose),3.5,MW,2023,Conflict


- **Let's check whether all the units in the `Unit` column are the same**

In [11]:
# Distinct measurement units used in the "Unit" column.
df2["Unit"].unique()

array(['MW', 'Thousands', '1,000 m3'], dtype=object)

- Hmm, that's interesting, we have to somehow standardize the dataset

In [12]:
# Show the records whose unit is "1,000 m3".
uses_cubic_metres = df2["Unit"].eq("1,000 m3")
df2.loc[uses_cubic_metres]

Unnamed: 0,Region,Country,ISO Code,Flow,Group Technology,Sub-Technology,Technology,Value,Unit,Year,Conflict Status
11281,Africa,Burundi,BDI,Off-grid biogas production,Bioenergy,Biogas production,Production of biogas for industry,0.0000,"1,000 m3",2010,No Conflict
11282,Africa,Burundi,BDI,Off-grid biogas production,Bioenergy,Biogas production,Production of biogas for industry,0.0000,"1,000 m3",2011,No Conflict
11283,Africa,Burundi,BDI,Off-grid biogas production,Bioenergy,Biogas production,Production of biogas for industry,0.0000,"1,000 m3",2012,No Conflict
11284,Africa,Burundi,BDI,Off-grid biogas production,Bioenergy,Biogas production,Production of biogas for industry,0.0000,"1,000 m3",2013,No Conflict
11285,Africa,Burundi,BDI,Off-grid biogas production,Bioenergy,Biogas production,Production of biogas for industry,0.0000,"1,000 m3",2014,No Conflict
...,...,...,...,...,...,...,...,...,...,...,...
34763,Africa,Zimbabwe,ZWE,Off-grid biogas production,Bioenergy,Biogas production,Total biogas production,1148.0810,"1,000 m3",2019,No Conflict
34764,Africa,Zimbabwe,ZWE,Off-grid biogas production,Bioenergy,Biogas production,Total biogas production,1142.4995,"1,000 m3",2020,No Conflict
34765,Africa,Zimbabwe,ZWE,Off-grid biogas production,Bioenergy,Biogas production,Total biogas production,1135.5455,"1,000 m3",2021,No Conflict
34766,Africa,Zimbabwe,ZWE,Off-grid biogas production,Bioenergy,Biogas production,Total biogas production,1055.3000,"1,000 m3",2022,No Conflict


- The `1,000 m3` unit appears to be used primarily for biogas production, which is irrelevant to our study

In [13]:
# Count rows for every (Flow, Unit) pair to see which unit each flow uses.
df2.groupby(["Flow", "Unit"])["Technology"].count()

Flow                        Unit     
Off-grid biogas production  1,000 m3      2702
Off-grid capacity           MW           19658
Off-grid energy access      Thousands     8526
Name: Technology, dtype: int64

In [14]:
# Remove bioenergy records: any row whose sub-technology mentions "biogas"
# (case-insensitive; missing sub-technologies count as not matching) and any
# row in the "Bioenergy" technology group.
mentions_biogas = df2["Sub-Technology"].str.contains("biogas", case=False, na=False)
is_bioenergy = df2["Group Technology"] == "Bioenergy"
df2 = df2[~(mentions_biogas | is_bioenergy)]

In [15]:
# Re-count rows per (Flow, Unit) pair to verify the bioenergy rows are gone.
df2.groupby(["Flow", "Unit"])["Technology"].count()

Flow                    Unit     
Off-grid capacity       MW           18762
Off-grid energy access  Thousands     7714
Name: Technology, dtype: int64

In [16]:
# Random preview of the cleaned data; the fixed seed keeps the displayed rows
# identical on every Restart & Run All.
df2.sample(10, random_state=42)

Unnamed: 0,Region,Country,ISO Code,Flow,Group Technology,Sub-Technology,Technology,Value,Unit,Year,Conflict Status
39965,Central America and the Caribbean,Cuba,CUB,Off-grid capacity,Solar energy,Solar lights and SHS,Solar lights and solar home systems (SHS),0.858615,MW,2021,No Conflict
51766,Middle East,Iraq,IRQ,Off-grid capacity,Solar energy,Solar PV (Others),Other off-grid solar PV,37.6,MW,2021,Conflict
47667,Asia,Afghanistan,AFG,Off-grid capacity,Solar energy,Solar PV (Public lighting),Other off-grid solar PV (of which public light...,0.09321,MW,2022,Conflict
50292,Africa,Angola,AGO,Off-grid capacity,Solar energy,Solar PV (Health),Other off-grid solar PV (of which health),0.021125,MW,2015,No Conflict
44923,Asia,Philippines (the),PHL,Off-grid capacity,Solar energy,Solar pumps,Solar pumps (public water supply),0.0076,MW,2016,No Conflict
27824,Africa,Morocco,MAR,Off-grid capacity,Solar energy,Solar cookers,Solar cookers,0.09294,MW,2019,No Conflict
2728,Africa,Benin,BEN,Off-grid capacity,Solar energy,Solar mini-grids,Solar mini-grids,0.0,MW,2010,No Conflict
39673,Africa,Liberia,LBR,Off-grid capacity,Hydropower (excl. Pumped Storage),Hydropower,Hydropower,4.86,MW,2013,No Conflict
47798,Asia,Bangladesh,BGD,Off-grid capacity,Solar energy,Solar PV (Commercial/Public),Other off-grid solar PV (of which commercial/p...,2.719755,MW,2016,No Conflict
49168,South America,Bolivia (Plurinational State of),BOL,Off-grid capacity,Solar energy,Solar PV (Commercial/Public),Other off-grid solar PV (of which commercial/p...,8e-05,MW,2013,No Conflict


In [17]:
# Distinct sub-technologies remaining after the bioenergy rows were removed.
df2["Sub-Technology"].unique()

array(['Solar PV (Others)', 'Solar PV (Health)',
       'Solar PV (Commercial/Public)', 'Solar PV (Education)',
       'Solar PV (Not specified)', 'Solar PV (Industry)',
       'Solar PV (Tourism)', 'Solar PV (Public lighting)',
       'Solar PV (Communication)', 'Solar PV (Residential)',
       'Solar PV (Agriculture)', 'Solar PV (Multi-purpose)',
       'Solar mini-grids', 'Solar lights', 'Solar pumps',
       'Solar home systems', 'Onshore wind energy', 'Hydropower',
       'Solar cookers', 'Solar lights and SHS'], dtype=object)

- **Now, the resulting dataset will be saved to the "cleaned_data" sub-folder in the "1_datasets" folder**

In [18]:
# Persist the cleaned frame to a single-sheet workbook, leaving out the row
# index so the file contains only the data columns.
df2.to_excel(
    excel_writer="../1_datasets/cleaned_data/IRENA_OFGStats.cleaned.xlsx",
    index=False,
    sheet_name="Cleaned_data",
)