### Basic exploratory data analysis
Looking at data types, missing value counts and to ensure format consistency with values like dates## Petroineos Summer Internship 2025 - Data Analysis Coding Challenge
Author: Chun (Johnny) Chan

Exploratory data analysis

In [132]:
# Import necessary libraries
import pandas as pd
import datetime as dt

In [133]:
# Import csv files
database = pd.read_csv("database.csv")
gas_fr_plants = pd.read_csv("gas_fr_plants.csv")
gas_plants = pd.read_csv("gas_plants.csv")
wind_plants = pd.read_csv("wind_plants.csv")

In [134]:
database.head()

Unnamed: 0,Date,Country,Technology,SiteName,Volume,country,date,updatedby,updatetime
0,01/01/2024,FR,Gas,Blenod-5,6753.0,France,2024-01-01,petroineos,2025-05-29 10:38:42.401709
1,02/01/2024,FR,Gas,Blenod-5,3896.0,France,2024-01-02,petroineos,2025-05-29 10:38:42.401709
2,03/01/2024,FR,Gas,Blenod-5,3636.0,France,2024-01-03,petroineos,2025-05-29 10:38:42.401709
3,04/01/2024,FR,Gas,Blenod-5,5138.0,France,2024-01-04,petroineos,2025-05-29 10:38:42.401709
4,05/01/2024,FR,Gas,Blenod-5,5265.0,France,2024-01-05,petroineos,2025-05-29 10:38:42.401709


Expected structure of database should be [date, country, SiteName, Technology, updatedby, updatetime, volume] but we seem to have duplicate columns.
We can drop the extra columns at the end.

In [135]:
gas_fr_plants.head()

Unnamed: 0,Date,Country,Technology,SiteName,Volume
0,01/01/2024,FR,Gas,Blenod-5,6753.0
1,02/01/2024,FR,Gas,Blenod-5,3896.0
2,03/01/2024,FR,Gas,Blenod-5,3636.0
3,04/01/2024,FR,Gas,Blenod-5,5138.0
4,05/01/2024,FR,Gas,Blenod-5,5265.0


In [136]:
gas_plants.head()

Unnamed: 0,Date,Country,Technology,SiteName,Volume
0,01/01/2024,GB,Gas,Pembroke-1,6570
1,02/01/2024,GB,Gas,Pembroke-1,8068
2,03/01/2024,GB,Gas,Pembroke-1,7225
3,04/01/2024,GB,Gas,Pembroke-1,5390
4,05/01/2024,GB,Gas,Pembroke-1,6720


In [137]:
wind_plants.head()

Unnamed: 0,Date,Country,Technology,SiteName,Volume
0,01/01/2024,GB,Wind,Hornsea-1,260.166079
1,02/01/2024,GB,Wind,Hornsea-1,709.48082
2,03/01/2024,GB,Wind,Hornsea-1,431.52768
3,04/01/2024,GB,Wind,Hornsea-1,223.868472
4,05/01/2024,GB,Wind,Hornsea-1,686.985009


In [138]:
# Filling missing values with 0 as instructed by the PDF
gas_fr_plants.fillna(0, inplace=True, axis=1)
gas_plants.fillna(0, inplace=True, axis=1)
wind_plants.fillna(0, inplace=True, axis=1)

  gas_fr_plants.fillna(0, inplace=True, axis=1)


In [139]:
# Checking that there are no missing values
gas_fr_plants.isna().sum(), gas_plants.isna().sum(), wind_plants.isna().sum()

(Date          0
 Country       0
 Technology    0
 SiteName      0
 Volume        0
 dtype: int64,
 Date          0
 Country       0
 Technology    0
 SiteName      0
 Volume        0
 dtype: int64,
 Date          0
 Country       0
 Technology    0
 SiteName      0
 Volume        0
 dtype: int64)

In [140]:
gas_fr_plants.dtypes, gas_plants.dtypes, wind_plants.dtypes

(Date          object
 Country       object
 Technology    object
 SiteName      object
 Volume        object
 dtype: object,
 Date          object
 Country       object
 Technology    object
 SiteName      object
 Volume        object
 dtype: object,
 Date          object
 Country       object
 Technology    object
 SiteName      object
 Volume        object
 dtype: object)

In [141]:
# Checking how many unique countries there are
gas_fr_plants['Volume'] = gas_fr_plants['Volume'].astype(float)

In [142]:
country_map = {'FR': 'France',
                      'GB': 'Great Britain',
                      '0': '0'}
gas_fr_plants['country'] = gas_fr_plants['Country '].map(country_map)
gas_fr_plants

Unnamed: 0,Date,Country,Technology,SiteName,Volume,country
0,01/01/2024,FR,Gas,Blenod-5,6753.0,France
1,02/01/2024,FR,Gas,Blenod-5,3896.0,France
2,03/01/2024,FR,Gas,Blenod-5,3636.0,France
3,04/01/2024,FR,Gas,Blenod-5,5138.0,France
4,05/01/2024,FR,Gas,Blenod-5,5265.0,France
...,...,...,...,...,...,...
957,0,0,0,0,0.0,
958,0,0,0,0,0.0,
959,0,0,0,0,0.0,
960,0,0,0,0,0.0,


In [143]:
gas_fr_plants['Date'] = pd.to_datetime(gas_fr_plants['Date'], format='%d/%m/%Y', errors='coerce')  # convert Date column to recognisable data type
gas_fr_plants['date'] = gas_fr_plants['Date'].dt.strftime('%Y-%m-%d')  # convert to correct date time format as in database.csv
gas_fr_plants  # have a look at the result

Unnamed: 0,Date,Country,Technology,SiteName,Volume,country,date
0,2024-01-01,FR,Gas,Blenod-5,6753.0,France,2024-01-01
1,2024-01-02,FR,Gas,Blenod-5,3896.0,France,2024-01-02
2,2024-01-03,FR,Gas,Blenod-5,3636.0,France,2024-01-03
3,2024-01-04,FR,Gas,Blenod-5,5138.0,France,2024-01-04
4,2024-01-05,FR,Gas,Blenod-5,5265.0,France,2024-01-05
...,...,...,...,...,...,...,...
957,NaT,0,0,0,0.0,,
958,NaT,0,0,0,0.0,,
959,NaT,0,0,0,0.0,,
960,NaT,0,0,0,0.0,,
