In [1]:
import numpy as np
import pandas as pd

# Main table

In [2]:
# Reading in the gzipped tsv file
mainDF = pd.read_table('https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1/data/FISH_LD_MAIN/?format=TSV&compressed=true&i', compression='gzip')
mainDF

Unnamed: 0,"freq,pres,species,natvessr,dest_use,unit,geo\TIME_PERIOD",2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,"A,CLA,BSS,TOTAL,HCN,EUR,EU27_2020",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,19.396 e,:,:,:
1,"A,CLA,BSS,TOTAL,HCN,EUR,NL",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,19.396 e,:,:,:
2,"A,CLA,BSS,TOTAL,HCN,EUR_T,EU27_2020",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,9698 e,:,:,:
3,"A,CLA,BSS,TOTAL,HCN,EUR_T,NL",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,9698 e,:,:,:
4,"A,CLA,BSS,TOTAL,HCN,TPW,EU27_2020",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,0.002 e,:,:,:
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382231,"A,WHL,WIT,TOTAL,IND,EUR_T,NO",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,327.4467,:,:
382232,"A,WHL,WIT,TOTAL,IND,TPW,NO",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,0.4,:,:
382233,"A,WHL,WIT,TOTAL,TOTAL,EUR,NO",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,130.9787,:,:
382234,"A,WHL,WIT,TOTAL,TOTAL,EUR_T,NO",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,327.4467,:,:


The first 7 columns were separated by commas and the rest with tabs. It's also obvious that there is a white space at the end of every numerical column name, so I removed these white spaces with ```rename``` and ```str.strip()``` methods

In [3]:
# Showing all columns names
list(mainDF.keys())

['freq,pres,species,natvessr,dest_use,unit,geo\\TIME_PERIOD',
 '2000 ',
 '2001 ',
 '2002 ',
 '2003 ',
 '2004 ',
 '2005 ',
 '2006 ',
 '2007 ',
 '2008 ',
 '2009 ',
 '2010 ',
 '2011 ',
 '2012 ',
 '2013 ',
 '2014 ',
 '2015 ',
 '2016 ',
 '2017 ',
 '2018 ',
 '2019 ',
 '2020 ',
 '2021 ']

In [4]:
# removing white spaces from column names with rename and str.strip() methods
mainDF = mainDF.rename(columns=lambda x: x.strip()) # inplace not suitable here!
mainDF

Unnamed: 0,"freq,pres,species,natvessr,dest_use,unit,geo\TIME_PERIOD",2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,"A,CLA,BSS,TOTAL,HCN,EUR,EU27_2020",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,19.396 e,:,:,:
1,"A,CLA,BSS,TOTAL,HCN,EUR,NL",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,19.396 e,:,:,:
2,"A,CLA,BSS,TOTAL,HCN,EUR_T,EU27_2020",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,9698 e,:,:,:
3,"A,CLA,BSS,TOTAL,HCN,EUR_T,NL",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,9698 e,:,:,:
4,"A,CLA,BSS,TOTAL,HCN,TPW,EU27_2020",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,0.002 e,:,:,:
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382231,"A,WHL,WIT,TOTAL,IND,EUR_T,NO",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,327.4467,:,:
382232,"A,WHL,WIT,TOTAL,IND,TPW,NO",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,0.4,:,:
382233,"A,WHL,WIT,TOTAL,TOTAL,EUR,NO",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,130.9787,:,:
382234,"A,WHL,WIT,TOTAL,TOTAL,EUR_T,NO",:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,327.4467,:,:


In [5]:
list(mainDF.keys())

['freq,pres,species,natvessr,dest_use,unit,geo\\TIME_PERIOD',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021']

Showing first cell of the table to make sure the first columns separated by commas are grouped

In [6]:
mainDF.loc[0][0]

'A,CLA,BSS,TOTAL,HCN,EUR,EU27_2020'

In [7]:
# Second cell from first row
mainDF.loc[0][1]

': '

In the preparation of the data frame I created and merged two intermediary data frames containing:
1) The categorical columns (first seven)  
2) The columns with years and weights  
  
-> It also took extra actions for the column "geo", since it was between comma and tab separated values

### Handling the categorical colums

In [8]:
# Reading in the gzipped tsv file
categoricalDF = pd.read_table('https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1/data/FISH_LD_MAIN/?format=TSV&compressed=true&i', compression='gzip', usecols = [0,1,2,3,4,5,6], delimiter=',')
categoricalDF

Unnamed: 0,freq,pres,species,natvessr,dest_use,unit,geo\TIME_PERIOD\t2000 \t2001 \t2002 \t2003 \t2004 \t2005 \t2006 \t2007 \t2008 \t2009 \t2010 \t2011 \t2012 \t2013 \t2014 \t2015 \t2016 \t2017 \t2018 \t2019 \t2020 \t2021
0,A,CLA,BSS,TOTAL,HCN,EUR,EU27_2020\t: \t: \t: \t: \t: \t: \t: \t: \t: \...
1,A,CLA,BSS,TOTAL,HCN,EUR,NL\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...
2,A,CLA,BSS,TOTAL,HCN,EUR_T,EU27_2020\t: \t: \t: \t: \t: \t: \t: \t: \t: \...
3,A,CLA,BSS,TOTAL,HCN,EUR_T,NL\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...
4,A,CLA,BSS,TOTAL,HCN,TPW,EU27_2020\t: \t: \t: \t: \t: \t: \t: \t: \t: \...
...,...,...,...,...,...,...,...
382231,A,WHL,WIT,TOTAL,IND,EUR_T,NO\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...
382232,A,WHL,WIT,TOTAL,IND,TPW,NO\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...
382233,A,WHL,WIT,TOTAL,TOTAL,EUR,NO\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...
382234,A,WHL,WIT,TOTAL,TOTAL,EUR_T,NO\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...


In [9]:
# Renaming the last column as just "geo"
categoricalDF.rename(columns = {'geo\TIME_PERIOD\t2000 \t2001 \t2002 \t2003 \t2004 \t2005 \t2006 \t2007 \t2008 \t2009 \t2010 \t2011 \t2012 \t2013 \t2014 \t2015 \t2016 \t2017 \t2018 \t2019 \t2020 \t2021 ':'geo'}, inplace = True)
categoricalDF

Unnamed: 0,freq,pres,species,natvessr,dest_use,unit,geo
0,A,CLA,BSS,TOTAL,HCN,EUR,EU27_2020\t: \t: \t: \t: \t: \t: \t: \t: \t: \...
1,A,CLA,BSS,TOTAL,HCN,EUR,NL\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...
2,A,CLA,BSS,TOTAL,HCN,EUR_T,EU27_2020\t: \t: \t: \t: \t: \t: \t: \t: \t: \...
3,A,CLA,BSS,TOTAL,HCN,EUR_T,NL\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...
4,A,CLA,BSS,TOTAL,HCN,TPW,EU27_2020\t: \t: \t: \t: \t: \t: \t: \t: \t: \...
...,...,...,...,...,...,...,...
382231,A,WHL,WIT,TOTAL,IND,EUR_T,NO\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...
382232,A,WHL,WIT,TOTAL,IND,TPW,NO\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...
382233,A,WHL,WIT,TOTAL,TOTAL,EUR,NO\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...
382234,A,WHL,WIT,TOTAL,TOTAL,EUR_T,NO\t: \t: \t: \t: \t: \t: \t: \t: \t: \t: \t: ...


Checking the values of the column "geo", selected the first string of each cell and counted occurrences of all unique values

In [10]:
categoricalDF['geo'].str.split().str.get(0).value_counts()

EU           61867
EU27_2020    60476
EU28         53024
ES           49270
PT           21234
FR           16844
UK           12163
NO           11757
NL           11276
IE           10147
DK            8891
SE            8423
IT            8302
DE            6617
MT            5418
IS            4479
CY            4455
BE            3596
EL            3240
SI            3148
HR            3120
LT            2900
TR            2295
PL            2208
EE            2015
BG            1927
LV            1302
RO             936
FI             906
Name: geo, dtype: int64

In [11]:
# Updating the values of the column "geo" with the first string only 
categoricalDF['geo'] = categoricalDF['geo'].str.split().str.get(0)
categoricalDF

Unnamed: 0,freq,pres,species,natvessr,dest_use,unit,geo
0,A,CLA,BSS,TOTAL,HCN,EUR,EU27_2020
1,A,CLA,BSS,TOTAL,HCN,EUR,NL
2,A,CLA,BSS,TOTAL,HCN,EUR_T,EU27_2020
3,A,CLA,BSS,TOTAL,HCN,EUR_T,NL
4,A,CLA,BSS,TOTAL,HCN,TPW,EU27_2020
...,...,...,...,...,...,...,...
382231,A,WHL,WIT,TOTAL,IND,EUR_T,NO
382232,A,WHL,WIT,TOTAL,IND,TPW,NO
382233,A,WHL,WIT,TOTAL,TOTAL,EUR,NO
382234,A,WHL,WIT,TOTAL,TOTAL,EUR_T,NO


 -> Selecting cuantitative columns on another data frame

In [12]:
cuantitativeColumns = mainDF.drop('freq,pres,species,natvessr,dest_use,unit,geo\TIME_PERIOD', axis=1)
cuantitativeColumns

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,19.396 e,:,:,:
1,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,19.396 e,:,:,:
2,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,9698 e,:,:,:
3,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,9698 e,:,:,:
4,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,0.002 e,:,:,:
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382231,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,327.4467,:,:
382232,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,0.4,:,:
382233,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,130.9787,:,:
382234,:,:,:,:,:,:,:,:,:,:,...,:,:,:,:,:,:,:,327.4467,:,:


Before merging both data frames I checked the number of (c), (p) and (e) observations

In [13]:
print('year\tconfidential\tprovisional\testimated\n-------------------------------------------------') # header of the table. /t represents a TAB and /n is the new line character
for column in cuantitativeColumns:
    print(column, '\t', f"{cuantitativeColumns[column].str.contains('c').sum():>5}", '\t\t', f"{cuantitativeColumns[column].str.contains('p').sum():>5}", '\t\t', f"{cuantitativeColumns[column].str.contains('e').sum():>5}")
    # added format to align numbers to the right

year	confidential	provisional	estimated
-------------------------------------------------
2000 	     0 		     0 		     0
2001 	     0 		     0 		     0
2002 	     0 		     0 		     0
2003 	     0 		     0 		     0
2004 	     0 		     0 		     0
2005 	     0 		     0 		     0
2006 	     0 		     0 		     0
2007 	     0 		     0 		     0
2008 	     0 		     0 		     0
2009 	     0 		     0 		     0
2010 	     0 		     0 		     0
2011 	     0 		     0 		     0
2012 	  2304 		     0 		     0
2013 	  2664 		     0 		     0
2014 	  2664 		     0 		     0
2015 	  3888 		     0 		     0
2016 	  5760 		     0 		     0
2017 	 18603 		     0 		 14571
2018 	 10729 		 23346 		  9178
2019 	 10908 		 27032 		 11573
2020 	  6326 		 24904 		 10113
2021 	  4577 		  9558 		  7362


The count of "c" and ": c" matches, therefore no confidential data is provided

In [14]:
# Checking the count of ": c" per column
for column in cuantitativeColumns:
    print(column, '\t', f"{cuantitativeColumns[column].str.contains(': c').sum():>5}")

2000 	     0
2001 	     0
2002 	     0
2003 	     0
2004 	     0
2005 	     0
2006 	     0
2007 	     0
2008 	     0
2009 	     0
2010 	     0
2011 	     0
2012 	  2304
2013 	  2664
2014 	  2664
2015 	  3888
2016 	  5760
2017 	 18603
2018 	 10729
2019 	 10908
2020 	  6326
2021 	  4577


Removing flags from the cells so that they can be converted to floats

In [15]:
# Selecting the first string of each cell which holds the number that will be turned into a float.This for loop iterates through the entire data frame and updates each cell with its corresponding first string only
for column in cuantitativeColumns:
    cuantitativeColumns[column] = cuantitativeColumns[column].str.split().str.get(0)

Before converting to floats I checked the number of colons in the data frame, which actually is the most common value in the df

In [16]:
colonCount = cuantitativeColumns.stack().value_counts().head(1)

In [17]:
# another similar for loop for the conversion to floats. Cohercing errors returns NaN instead of erroring out (when the input is a colon)
for column in cuantitativeColumns:
    cuantitativeColumns[column] = pd.to_numeric(cuantitativeColumns[column], errors='coerce')

Checking the data types

In [34]:
cuantitativeColumns.dtypes

2000    float64
2001    float64
2002    float64
2003    float64
2004    float64
2005    float64
2006    float64
2007    float64
2008    float64
2009    float64
2010    float64
2011    float64
2012    float64
2013    float64
2014    float64
2015    float64
2016    float64
2017    float64
2018    float64
2019    float64
2020    float64
2021    float64
dtype: object

Counting the occurrences of NaN's

In [19]:
nanCount = cuantitativeColumns.isnull().sum().sum()

All the numbers were converted to float data type

In [20]:
colonCount == nanCount

:    True
dtype: bool

-> Merging cuantitative and qualitative data frames

In [21]:
mainDF = pd.concat([categoricalDF, cuantitativeColumns], axis=1)
mainDF

Unnamed: 0,freq,pres,species,natvessr,dest_use,unit,geo,2000,2001,2002,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,A,CLA,BSS,TOTAL,HCN,EUR,EU27_2020,,,,...,,,,,,,19.396,,,
1,A,CLA,BSS,TOTAL,HCN,EUR,NL,,,,...,,,,,,,19.396,,,
2,A,CLA,BSS,TOTAL,HCN,EUR_T,EU27_2020,,,,...,,,,,,,9698.000,,,
3,A,CLA,BSS,TOTAL,HCN,EUR_T,NL,,,,...,,,,,,,9698.000,,,
4,A,CLA,BSS,TOTAL,HCN,TPW,EU27_2020,,,,...,,,,,,,0.002,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382231,A,WHL,WIT,TOTAL,IND,EUR_T,NO,,,,...,,,,,,,,327.4467,,
382232,A,WHL,WIT,TOTAL,IND,TPW,NO,,,,...,,,,,,,,0.4000,,
382233,A,WHL,WIT,TOTAL,TOTAL,EUR,NO,,,,...,,,,,,,,130.9787,,
382234,A,WHL,WIT,TOTAL,TOTAL,EUR_T,NO,,,,...,,,,,,,,327.4467,,


Since the columns "freq" (frequency) and "natvessr" (Nationality of registration of vessel) have one unique value (A and TOTAL respecitvely), I dropped them:

In [22]:
print(mainDF['freq'].value_counts(),'\n\n',mainDF['natvessr'].value_counts())

A    382236
Name: freq, dtype: int64 

 TOTAL    382236
Name: natvessr, dtype: int64


In [23]:
mainDF.drop(['freq','natvessr'], axis=1, inplace = True) # inplace to save memory
mainDF

Unnamed: 0,pres,species,dest_use,unit,geo,2000,2001,2002,2003,2004,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,CLA,BSS,HCN,EUR,EU27_2020,,,,,,...,,,,,,,19.396,,,
1,CLA,BSS,HCN,EUR,NL,,,,,,...,,,,,,,19.396,,,
2,CLA,BSS,HCN,EUR_T,EU27_2020,,,,,,...,,,,,,,9698.000,,,
3,CLA,BSS,HCN,EUR_T,NL,,,,,,...,,,,,,,9698.000,,,
4,CLA,BSS,HCN,TPW,EU27_2020,,,,,,...,,,,,,,0.002,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382231,WHL,WIT,IND,EUR_T,NO,,,,,,...,,,,,,,,327.4467,,
382232,WHL,WIT,IND,TPW,NO,,,,,,...,,,,,,,,0.4000,,
382233,WHL,WIT,TOTAL,EUR,NO,,,,,,...,,,,,,,,130.9787,,
382234,WHL,WIT,TOTAL,EUR_T,NO,,,,,,...,,,,,,,,327.4467,,


### General check

In [24]:
mainDF.shape

(382236, 27)

In [36]:
mainDF.describe()

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
count,28263.0,37435.0,55068.0,54527.0,68049.0,74758.0,74891.0,90962.0,99801.0,96974.0,...,164259.0,191468.0,185226.0,203277.0,207981.0,196778.0,130215.0,137245.0,129836.0,89201.0
mean,2181900.0,2406947.0,9653639.0,10993730.0,9029300.0,7927699.0,8896657.0,7597905.0,6321732.0,5822545.0,...,4225646.0,3530596.0,4119175.0,3489556.0,3159049.0,2516176.0,3269761.0,3250035.0,3009861.0,2633736.0
std,16739330.0,19037350.0,103572600.0,125154300.0,114725300.0,102225200.0,117303200.0,107674900.0,94496410.0,87914180.0,...,44616500.0,41739520.0,50503040.0,43233390.0,39282710.0,34724980.0,41566370.0,41674330.0,38664410.0,32702790.0
min,-1080.196,-5726.34,-619.0809,-33749.11,-22831.59,-29234.85,-18510.69,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,121.2,118.4,204.0,197.4,118.2,101.9504,171.61,146.025,97.1,79.1,...,80.7096,47.798,35.8,41.41,36.1665,23.23,41.06,35.2,33.5721,36.955
50%,1661.893,1787.235,2212.724,2337.3,2000.0,2052.8,2306.0,2080.133,1975.0,1884.663,...,1940.352,1733.53,1750.953,1551.744,1504.76,1484.343,1755.034,1613.111,1515.171,1738.291
75%,20845.51,19295.76,38307.8,40229.3,23981.8,23541.85,25362.82,19538.24,14708.41,13139.7,...,11244.71,9126.409,9772.927,8577.3,8580.18,8250.0,10103.2,9320.101,9066.222,10447.18
max,654052400.0,1047244000.0,5112131000.0,6379519000.0,6529695000.0,5943221000.0,6795872000.0,7322345000.0,6686818000.0,6248084000.0,...,1958458000.0,2139882000.0,2696236000.0,2127582000.0,2113555000.0,2152023000.0,2135951000.0,2194340000.0,2190319000.0,1779608000.0


In [35]:
mainDF.describe(include = object)

Unnamed: 0,pres,species,dest_use,unit,geo
count,382236,382236,382236,382236,382236
unique,41,1529,3,3,29
top,TOTAL,F00,TOTAL,TPW,EU
freq,63718,3448,182720,132709,61867


-> They are all strings:

IRISH DATA FRAME

In [28]:
irishDF = mainDF[mainDF['geo'] == 'IE']
irishDF

Unnamed: 0,pres,species,dest_use,unit,geo,2000,2001,2002,2003,2004,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
92,CLA,CRE,HCN,EUR,IE,,,,,,...,,,,2279.2000,4625.9132,8006.8901,22849.4278,17183.6710,,114224.3198
105,CLA,CRE,HCN,EUR_T,IE,,,,,,...,,,,468.9712,627.6680,600.4417,756.1166,853.0289,,4785.1065
118,CLA,CRE,HCN,TPW,IE,,,,,,...,,,,4.8600,7.3700,13.3350,30.2195,20.1443,,23.8708
143,CLA,CRE,TOTAL,EUR,IE,,,,,,...,,,,2279.2000,4625.9132,8006.8901,22849.4278,17183.6710,,
156,CLA,CRE,TOTAL,EUR_T,IE,,,,,,...,,,,468.9712,627.6680,600.4417,756.1166,853.0289,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381432,UNK,WRF,TOTAL,EUR_T,IE,,,,,,...,2000.000,,,,2000.0000,2000.0000,,,,
381436,UNK,WRF,TOTAL,TPW,IE,,,,,,...,1.099,,,,0.1620,0.3730,,,,
381444,UNK,YRS,TOTAL,EUR,IE,,,,,,...,,,,,,,,,,
381446,UNK,YRS,TOTAL,EUR_T,IE,,,,,,...,,,,,,,,,,


Reindexing the Irish data frame

In [50]:
# inplace to save memory and dropping the old index columns which would remain otherwise
irishDF.reset_index(inplace = True, drop = True)
irishDF = irishDF.drop('geo', axis=1)
irishDF

Unnamed: 0,pres,species,dest_use,unit,2000,2001,2002,2003,2004,2005,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,CLA,CRE,HCN,EUR,,,,,,,...,,,,2279.2000,4625.9132,8006.8901,22849.4278,17183.6710,,114224.3198
1,CLA,CRE,HCN,EUR_T,,,,,,,...,,,,468.9712,627.6680,600.4417,756.1166,853.0289,,4785.1065
2,CLA,CRE,HCN,TPW,,,,,,,...,,,,4.8600,7.3700,13.3350,30.2195,20.1443,,23.8708
3,CLA,CRE,TOTAL,EUR,,,,,,,...,,,,2279.2000,4625.9132,8006.8901,22849.4278,17183.6710,,
4,CLA,CRE,TOTAL,EUR_T,,,,,,,...,,,,468.9712,627.6680,600.4417,756.1166,853.0289,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10142,UNK,WRF,TOTAL,EUR_T,,,,,,,...,2000.000,,,,2000.0000,2000.0000,,,,
10143,UNK,WRF,TOTAL,TPW,,,,,,,...,1.099,,,,0.1620,0.3730,,,,
10144,UNK,YRS,TOTAL,EUR,,,,,,,...,,,,,,,,,,
10145,UNK,YRS,TOTAL,EUR_T,,,,,,,...,,,,,,,,,,


In [51]:
irishDF.describe()

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
count,2893.0,2095.0,3032.0,2008.0,2014.0,1910.0,1878.0,1968.0,1866.0,2064.0,...,3126.0,2034.0,1980.0,3846.0,3828.0,3372.0,903.0,1464.0,641.0,867.0
mean,384375.2,1015908.0,2369275.0,5299102.0,4056583.0,3282618.0,7046261.0,7778967.0,5625251.0,4809590.0,...,4894405.0,6169077.0,10721560.0,3378662.0,4911704.0,2743489.0,487307.0,896382.9,494782.0,110217.9
std,1960634.0,4831585.0,13661710.0,25182500.0,18827540.0,15516530.0,32322520.0,35636610.0,25235170.0,22984920.0,...,29607290.0,31619660.0,57774330.0,23368740.0,33917870.0,17648160.0,3156235.0,6711495.0,3094199.0,692759.0
min,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.0,0.1,0.0,...,0.007,0.02,0.06,0.0017,0.002,0.002,0.102,0.0054,0.0,0.0
25%,57.1,195.35,380.5,760.2,573.3626,600.0,688.7,643.0,776.0877,639.6,...,340.7352,756.06,947.0,110.478,220.9186,155.1362,236.5821,267.7004,73.0569,23.2094
50%,1142.764,2095.906,1803.0,3155.735,2558.332,3014.8,3120.164,3574.456,3200.0,2952.865,...,2000.0,2424.454,3393.475,1065.487,1500.0,2000.0,2376.927,2231.978,1876.877,1394.353
75%,6513.756,15558.47,26144.9,102310.9,59844.83,64616.26,84792.0,91827.78,105766.5,53004.88,...,20888.16,86159.91,52439.35,6956.148,10755.0,9486.401,20361.55,19421.78,15758.74,10000.0
max,21136870.0,65475420.0,170811600.0,253012700.0,193897000.0,149047900.0,314810000.0,364209800.0,249668500.0,236062900.0,...,396736200.0,299492100.0,507926100.0,308953200.0,485937300.0,292143900.0,44554830.0,152276100.0,61623260.0,10521250.0


In [52]:
irishDF.describe(include = object)

Unnamed: 0,pres,species,dest_use,unit
count,10147,10147,10147,10147
unique,25,327,3,3
top,TOTAL,F01,TOTAL,TPW
freq,2062,152,5120,3439


In [31]:
irishDF.isnull().sum().sum()

175253

In [53]:
irishDF.head(50)

Unnamed: 0,pres,species,dest_use,unit,2000,2001,2002,2003,2004,2005,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,CLA,CRE,HCN,EUR,,,,,,,...,,,,2279.2,4625.9132,8006.8901,22849.4278,17183.671,,114224.3198
1,CLA,CRE,HCN,EUR_T,,,,,,,...,,,,468.9712,627.668,600.4417,756.1166,853.0289,,4785.1065
2,CLA,CRE,HCN,TPW,,,,,,,...,,,,4.86,7.37,13.335,30.2195,20.1443,,23.8708
3,CLA,CRE,TOTAL,EUR,,,,,,,...,,,,2279.2,4625.9132,8006.8901,22849.4278,17183.671,,
4,CLA,CRE,TOTAL,EUR_T,,,,,,,...,,,,468.9712,627.668,600.4417,756.1166,853.0289,,
5,CLA,CRE,TOTAL,TPW,,,,,,,...,,,,4.86,7.37,13.335,30.2195,20.1443,,
6,CLA,F00,HCN,EUR,,,,,,,...,,,,36416.2,4625.9132,8006.8901,22849.4278,67403.8528,,
7,CLA,F00,HCN,EUR_T,,,,,,,...,,,,933.8206,627.668,600.4417,756.1166,624.186,,
8,CLA,F00,HCN,TPW,,,,,,,...,,,,38.997,7.37,13.335,30.2195,107.9868,,
9,CLA,F00,TOTAL,EUR,,,,,,,...,,,,36416.2,4625.9132,8006.8901,22849.4278,67403.8528,,
