# Nigeria Inflation Data Cleaning
## cleaning and prepare the dataset for analysis.

### Import necessary Libraries

In [1]:
# Import Libraries
import calendar
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import zscore

### Loading Dataset

In [2]:
# Load Dataset
df = pd.read_csv("NigeriaInflationRates.csv")

# View first 5 rows
print(df.head(5))

   Year  Month  Inflation_Rate  Crude Oil Price  Production  Crude Oil Export  \
0  2008      1             8.6            94.26        2.17              1.72   
1  2008      2             8.0            98.15        2.08              1.63   
2  2008      3             7.8           103.73        2.06              1.61   
3  2008      4             8.2           116.73        1.96              1.51   
4  2008      5             9.7           126.57        2.05              1.60   

    CPI_Food  CPI_Energy  CPI_Health  CPI_Transport  CPI_Communication  \
0  75.534431   84.612846   83.863139      86.530038          83.129440   
1  75.154185   85.231632   83.365891      88.355236          82.889814   
2  78.242523   83.251516   86.082993      84.376304          81.893471   
3  79.434268   84.348830   89.445332      88.600334          93.937865   
4  80.783677   85.574027   90.540461      87.932833          97.553285   

   CPI_Education  
0      75.284466  
1      75.457619  
2      74.8

### Checking Missing Values

In [3]:
# Checking for missing total values
df.isnull().sum()

Year                 0
Month                0
Inflation_Rate       0
Crude Oil Price      1
Production           1
Crude Oil Export     1
CPI_Food             0
CPI_Energy           0
CPI_Health           0
CPI_Transport        0
CPI_Communication    0
CPI_Education        0
dtype: int64

### Handling Missing Values

In [4]:
# Filter rows with any missing values to display specific columns
df[df.isna().any(axis=1)][['Year', 'Month', 'Crude Oil Price', 'Production', 'Crude Oil Export']]

Unnamed: 0,Year,Month,Crude Oil Price,Production,Crude Oil Export
183,2023,4,,,


### Apr 2023 has missing values for 'Crude Oil Price', 'Production', 'Crude Oil Export'

In [5]:
# Fill missing April 2023 values with official CBN data
df.loc[(df['Year'] == 2023) & (df['Month'] == 4), ['Crude Oil Price', 'Production', 'Crude Oil Export']] = [86.57, 1.0, 0.55]

# Confirm update
df.loc[(df['Year'] == 2023) & (df['Month'] == 4)]

Unnamed: 0,Year,Month,Inflation_Rate,Crude Oil Price,Production,Crude Oil Export,CPI_Food,CPI_Energy,CPI_Health,CPI_Transport,CPI_Communication,CPI_Education
183,2023,4,22.22,86.57,1.0,0.55,640.043131,444.157193,409.360422,472.364875,226.482519,410.271828


### Checking The Overall Information Of The Data

In [6]:
# Get overall info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               198 non-null    int64  
 1   Month              198 non-null    int64  
 2   Inflation_Rate     198 non-null    float64
 3   Crude Oil Price    198 non-null    float64
 4   Production         198 non-null    float64
 5   Crude Oil Export   198 non-null    float64
 6   CPI_Food           198 non-null    float64
 7   CPI_Energy         198 non-null    float64
 8   CPI_Health         198 non-null    float64
 9   CPI_Transport      198 non-null    float64
 10  CPI_Communication  198 non-null    float64
 11  CPI_Education      198 non-null    float64
dtypes: float64(10), int64(2)
memory usage: 18.7 KB


### Checking For Duplicates 

In [7]:
#Check if there are duplicate rows
df.duplicated().sum()

np.int64(0)

### Checking The Data Shape

In [8]:
# checking the number of rows and columns 
df.shape

(198, 12)

### Confirming If The Missing Value Was Handled Perfectly 

In [9]:
# rechecking after handling the missing value 
df.isnull().sum()

Year                 0
Month                0
Inflation_Rate       0
Crude Oil Price      0
Production           0
Crude Oil Export     0
CPI_Food             0
CPI_Energy           0
CPI_Health           0
CPI_Transport        0
CPI_Communication    0
CPI_Education        0
dtype: int64

### Checking For  data Columns Data Type

In [10]:
# checking the data type of each column
df.dtypes

Year                   int64
Month                  int64
Inflation_Rate       float64
Crude Oil Price      float64
Production           float64
Crude Oil Export     float64
CPI_Food             float64
CPI_Energy           float64
CPI_Health           float64
CPI_Transport        float64
CPI_Communication    float64
CPI_Education        float64
dtype: object

### Creating  Datetime Object By Combining Year and Month Columns

In [11]:
# combining Year and Month into a single Date column
# The assign(Day=1) adds a Day column with value 1 to create complete dates
df['Date'] = pd.to_datetime(df[['Year', 'Month']].assign(Day=1))

### Cross Checking If Date Column Was Added

In [12]:
# checking again for the new added column which is Date
df.head(5)

Unnamed: 0,Year,Month,Inflation_Rate,Crude Oil Price,Production,Crude Oil Export,CPI_Food,CPI_Energy,CPI_Health,CPI_Transport,CPI_Communication,CPI_Education,Date
0,2008,1,8.6,94.26,2.17,1.72,75.534431,84.612846,83.863139,86.530038,83.12944,75.284466,2008-01-01
1,2008,2,8.0,98.15,2.08,1.63,75.154185,85.231632,83.365891,88.355236,82.889814,75.457619,2008-02-01
2,2008,3,7.8,103.73,2.06,1.61,78.242523,83.251516,86.082993,84.376304,81.893471,74.826847,2008-03-01
3,2008,4,8.2,116.73,1.96,1.51,79.434268,84.34883,89.445332,88.600334,93.937865,73.664248,2008-04-01
4,2008,5,9.7,126.57,2.05,1.6,80.783677,85.574027,90.540461,87.932833,97.553285,74.880442,2008-05-01


### Filtering The Main Focus Of The Analysis 

In [13]:
# Filtering 2015–2025 period for consistency
df_filtered = df[(df['Date'] >= '2015') & (df['Date'] < '2024')]

### Checking The Shape Of The Filtered Data

In [14]:
# checking the number of rows and columns after again Filtering 2015–2025
df_filtered.shape

(108, 13)

### Feature Engineering
Created new features:

Inflation_Change: Month-to-month difference  
Trend: Rising, Falling, or Stable inflation  
Date: Combined Year and Month for time-series analysis  
Converted numeric month to month abbreviations (`Jan`, `Feb`, …, `Dec`)

In [15]:
# Sort the dataframe by Year and Month, then reset the index
df_filtered = df_filtered.sort_values(['Year', 'Month']).reset_index(drop=True)

# Calculate the change in inflation rate from the previous month and fill NaN values with 0
df_filtered['Inflation_Change'] = df_filtered['Inflation_Rate'].diff().fillna(0)

# Determine the trend of inflation (Rising, Falling, or Stable) based on the change
df_filtered['Trend'] = df_filtered['Inflation_Change'].apply(lambda x: 'Rising' if x > 0 else ('Falling' if x < 0 else 'Stable'))

# Convert month numbers to abbreviated month names (e.g., 1 -> 'Jan')
df_filtered['Month'] = df_filtered['Month'].apply(lambda x: calendar.month_abbr[x])

# Display the first 5 rows of the dataframe with selected columns
df_filtered[['Year', 'Month', 'Inflation_Rate', 'Inflation_Change', 'Trend']].head(15)

Unnamed: 0,Year,Month,Inflation_Rate,Inflation_Change,Trend
0,2015,Jan,8.2,0.0,Stable
1,2015,Feb,8.4,0.2,Rising
2,2015,Mar,8.5,0.1,Rising
3,2015,Apr,8.7,0.2,Rising
4,2015,May,9.0,0.3,Rising
5,2015,Jun,9.2,0.2,Rising
6,2015,Jul,9.2,0.0,Stable
7,2015,Aug,9.3,0.1,Rising
8,2015,Sep,9.4,0.1,Rising
9,2015,Oct,9.3,-0.1,Falling


# Exploratory Data Analysis (EDA):

### Maximum Inflation
The highest recorded inflation rate was 28.92% in the year 2023, indicating a peak in price increases during that period.

In [16]:
# Find the maximum inflation value and its year
max_inflation_value = df_filtered['Inflation_Rate'].max()
max_inflation_year = df_filtered[df_filtered['Inflation_Rate'] == max_inflation_value]['Year'].values[0]

max_inflation_value, max_inflation_year

(28.92, np.int64(2023))

### Minimum Inflation
The lowest recorded inflation rate was 8.2 in the year 2015, indicating the period of least price increase.

In [17]:
# Find the minimum inflation value
min_inflation = df_filtered['Inflation_Rate'].min()
min_inflation_year = df_filtered[df_filtered['Inflation_Rate'] == min_inflation]['Year'].values[0]

min_inflation, min_inflation_year

(8.2, np.int64(2015))

### Average Inflation
The overall average inflation rate was 15.36, with the year 2017 being closest to this average.

In [18]:
# Overall average inflation
avg_inflation = df_filtered['Inflation_Rate'].mean().round(2)

# Find the year closest to the average inflation
avg_inflation_year = df_filtered.iloc[(df_filtered['Inflation_Rate'] - avg_inflation).abs().argmin()]['Year']

avg_inflation, avg_inflation_year

(np.float64(15.36), np.int64(2017))

### Standard deviation = 4.68%
Indicates moderate volatility: inflation fluctuates noticeably with occasional spikes (28.92%) and periods of relative calm.

In [19]:
# Calculate the standard deviation of the Inflation_Rate column 
# This measures how much the inflation rates vary from the mean value
df_filtered['Inflation_Rate'].std()

4.686537375485675

### Inflation Descriptive Statistics

In [20]:
# Calculating summary statistics for the 'Inflation_Rate'
# to understand count, mean, std, min, 25%, 50%, 75%, and max values
df_filtered['Inflation_Rate'].describe()

count    108.000000
mean      15.355370
std        4.686537
min        8.200000
25%       11.377500
50%       15.590000
75%       17.797500
max       28.920000
Name: Inflation_Rate, dtype: float64

### Monthly Inflation Averages
The table shows the average inflation rate for each year and month 
helping identify seasonal trends and patterns in inflation over time.

In [21]:
# Calculate the average inflation rate for each year-month combination
average_inflation_rate_by_year_and_month = df_filtered.groupby(['Year', 'Month'])['Inflation_Rate'].mean().reset_index()
average_inflation_rate_by_year_and_month

Unnamed: 0,Year,Month,Inflation_Rate
0,2015,Apr,8.70
1,2015,Aug,9.30
2,2015,Dec,9.55
3,2015,Feb,8.40
4,2015,Jan,8.20
...,...,...,...
103,2023,Mar,22.04
104,2023,May,22.41
105,2023,Nov,28.20
106,2023,Oct,27.33


### Correlation Analysis Summary:
Inflation rises with CPI components, declines with Production/Exports.

CPI components move together strongly.

Oil price moderately influences inflation.

Production & Exports are negatively linked to Inflation.

In [22]:
corr = df_filtered.corr(numeric_only=True).round(2)
corr

Unnamed: 0,Year,Inflation_Rate,Crude Oil Price,Production,Crude Oil Export,CPI_Food,CPI_Energy,CPI_Health,CPI_Transport,CPI_Communication,CPI_Education,Inflation_Change
Year,1.0,0.68,0.64,-0.81,-0.81,0.95,0.96,0.95,0.95,0.97,0.97,0.19
Inflation_Rate,0.68,1.0,0.47,-0.78,-0.78,0.8,0.81,0.79,0.8,0.76,0.79,0.31
Crude Oil Price,0.64,0.47,1.0,-0.64,-0.64,0.66,0.66,0.67,0.66,0.68,0.66,0.08
Production,-0.81,-0.78,-0.64,1.0,1.0,-0.83,-0.82,-0.83,-0.82,-0.87,-0.82,-0.29
Crude Oil Export,-0.81,-0.78,-0.64,1.0,1.0,-0.83,-0.82,-0.83,-0.82,-0.87,-0.82,-0.29
CPI_Food,0.95,0.8,0.66,-0.83,-0.83,1.0,0.99,1.0,1.0,0.99,1.0,0.29
CPI_Energy,0.96,0.81,0.66,-0.82,-0.82,0.99,1.0,0.99,1.0,0.98,1.0,0.26
CPI_Health,0.95,0.79,0.67,-0.83,-0.83,1.0,0.99,1.0,1.0,0.99,1.0,0.28
CPI_Transport,0.95,0.8,0.66,-0.82,-0.82,1.0,1.0,1.0,1.0,0.99,1.0,0.28
CPI_Communication,0.97,0.76,0.68,-0.87,-0.87,0.99,0.98,0.99,0.99,1.0,0.99,0.28


### Year-over-Year Inflation Changes
This table shows the average inflation per year and the percentage change compared to the previous year
highlighting trends and spikes in inflation over time.

In [23]:
# Compare year-over-year (%) inflation changes
year_over_year_inflation_changes = df_filtered.groupby('Year')['Inflation_Rate'].mean().reset_index()
year_over_year_inflation_changes['year-over-year (%)'] = year_over_year_inflation_changes['Inflation_Rate'].pct_change().fillna(0) * 100
year_over_year_inflation_changes = year_over_year_inflation_changes.round(2)

year_over_year_inflation_changes

Unnamed: 0,Year,Inflation_Rate,year-over-year (%)
0,2015,9.01,0.0
1,2016,15.62,73.42
2,2017,16.55,5.9
3,2018,12.15,-26.6
4,2019,11.39,-6.21
5,2020,13.21,15.95
6,2021,16.98,28.58
7,2022,18.76,10.49
8,2023,24.52,30.67


### Average Inflation by Month
This table shows the average inflation rate for each month
revealing seasonal patterns and which months typically experience higher or lower inflation.


In [24]:
# Average inflation per month across all years


# Define month order
month_order = list(calendar.month_abbr)[1:] 

# Convert Month to Categorical with proper order
df_filtered['Month'] = pd.Categorical(df_filtered['Month'], categories=month_order, ordered=True)

# Group by Month
Average_inflation_per_month = df_filtered.groupby('Month')['Inflation_Rate'].mean().reset_index().round(2)
Average_inflation_per_month

Unnamed: 0,Month,Inflation_Rate
0,Jan,14.34
1,Feb,14.48
2,Mar,14.61
3,Apr,14.78
4,May,14.92
5,Jun,15.1
6,Jul,15.39
7,Aug,15.75
8,Sep,15.95
9,Oct,16.12


### Average CPI Components
This shows the average contribution of each CPI category to overall inflation 
highlighting which sectors have the largest impact.

In [25]:
# Calculate the average value for each CPI category
avg_cpi = df_filtered[['CPI_Food','CPI_Energy','CPI_Health','CPI_Transport','CPI_Communication',
                       'CPI_Education']].mean().sort_values(ascending=False)
avg_cpi

CPI_Food             363.151562
CPI_Energy           299.859618
CPI_Transport        292.677695
CPI_Education        264.128844
CPI_Health           260.314910
CPI_Communication    167.201390
dtype: float64

### Average Inflation by Year
This table shows the average inflation rate for each year
providing a clear view of annual inflation trends over the period.

In [26]:
# Calculate the average inflation rate for each year by:
average_inflation_rate_by_year  = df_filtered.groupby('Year')['Inflation_Rate'].mean().reset_index().round(2)
average_inflation_rate_by_year

Unnamed: 0,Year,Inflation_Rate
0,2015,9.01
1,2016,15.62
2,2017,16.55
3,2018,12.15
4,2019,11.39
5,2020,13.21
6,2021,16.98
7,2022,18.76
8,2023,24.52


### Top and Bottom Inflation Years
Identifing years with the highest and years with the lowest average inflation rates
Illustrating the peaks and troughs of inflation over time.

In [27]:
# Sort inflation data to find years with highest inflation rates and select top 5
top_years = average_inflation_rate_by_year.sort_values(by='Inflation_Rate', ascending=False).head(5)

# Sort inflation data to find years with lowest inflation rates and select top 5
lowest_years = average_inflation_rate_by_year.sort_values(by='Inflation_Rate', ascending=True).head(5)

# Display the results
print("Top 5 highest inflation years:\n", top_years)
print("\nTop 5 lowest inflation years:\n", lowest_years)

Top 5 highest inflation years:
    Year  Inflation_Rate
8  2023           24.52
7  2022           18.76
6  2021           16.98
2  2017           16.55
1  2016           15.62

Top 5 lowest inflation years:
    Year  Inflation_Rate
0  2015            9.01
4  2019           11.39
3  2018           12.15
5  2020           13.21
1  2016           15.62


### Inflation Outliers
Using the Z-score method, to detect months and years where inflation deviated significantly 
from the average, highlighting major inflation spikes.

In [28]:
# Calculate z-scores for inflation rates to identify statistical outliers
df_filtered['z_score'] = zscore(df_filtered['Inflation_Rate'])

# Identify outliers where absolute z-score exceeds 2 (roughly 5% of normal distribution)
outliers = df_filtered[df_filtered['z_score'].abs() > 2]  # inflation spikes

# Display years, months and inflation rates for these significant inflation anomalies
print("\nHigh inflation spikes:\n", outliers[['Year','Month','Inflation_Rate']])


High inflation spikes:
      Year Month  Inflation_Rate
103  2023   Aug           25.80
104  2023   Sep           26.72
105  2023   Oct           27.33
106  2023   Nov           28.20
107  2023   Dec           28.92


### Inflation vs Crude Oil Price Changes
Inflation and oil prices exhibit a weak correlation with frequent deviations
suggesting that domestic economic factors have a stronger influence on inflation than oil price movements

In [29]:
#Calculate the difference between percentage change in inflation rate and percentage change in crude oil price

yearly_comparison = df_filtered.groupby('Year')[['Inflation_Rate', 'Crude Oil Price']].mean().reset_index().round(2)


yearly_comparison['Inflation_vs_Oil_Change'] = yearly_comparison['Inflation_Rate'].pct_change().fillna(0) - yearly_comparison['Crude Oil Price'].pct_change().fillna(0)
yearly_comparison

Unnamed: 0,Year,Inflation_Rate,Crude Oil Price,Inflation_vs_Oil_Change
0,2015,9.01,52.65,0.0
1,2016,15.62,43.81,0.901531
2,2017,16.55,54.09,-0.175111
3,2018,12.15,72.66,-0.609178
4,2019,11.39,65.85,0.031173
5,2020,13.21,41.89,0.523647
6,2021,16.98,70.12,-0.388518
7,2022,18.76,104.62,-0.387184
8,2023,24.52,85.16,0.493043


In [30]:
df_filtered.to_csv('NigeriaInflationRates_2015_to_2024.csv', index=False)