# Task 14: Data cleaning and preprocessing with Pandas

In [44]:
import pandas as pd 
# Read the emissions CSV into the notebook-wide frame, then show the dtype
# pandas inferred for every column (the bare last expression is the output).
emissions_csv_path = "emissions_EU28.csv"
df = pd.read_csv(emissions_csv_path)
df.dtypes

airpol       object
cpa08        object
induse       object
origin       object
unit         object
geo\time     object
2018        float64
2017        float64
2016        float64
2015        float64
2014        float64
2013        float64
2012        float64
2011        float64
2010        float64
2009        float64
2008        float64
dtype: object

### Import Pandas and Load the Dataset

In [76]:
import pandas as pd
# (Re)load the raw CSV and preview its first five rows.
raw_csv_path = 'emissions_EU28.csv'
df = pd.read_csv(raw_csv_path)
print("Original DataFrame:", df.head(), sep="\n")

Original DataFrame:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0        3.965        3.955       

### 1. Identify Missing Values

In [46]:
# Count the null entries per column so gaps are visible before any cleaning.
missing_per_column = df.isnull().sum()
print("\nMissing values in each column:", missing_per_column, sep="\n")


Missing values in each column:
airpol      0
cpa08       0
induse      0
origin      0
unit        0
geo\time    0
2018        0
2017        0
2016        0
2015        0
2014        0
2013        0
2012        0
2011        0
2010        0
2009        0
2008        0
dtype: int64


### 2. Drop Rows with Any Missing Values

In [47]:
# Build a copy that keeps only the rows with no missing value in any column;
# the original `df` is left untouched.
df_no_missing_rows = df.dropna(axis=0, how='any')
print("\nDataFrame after dropping rows with missing values:", df_no_missing_rows.head(), sep="\n")



DataFrame after dropping rows with missing values:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0 

### 3. Drop Columns with Any Missing Values

In [48]:
# Build a copy that keeps only the columns with no missing value in any row;
# the original `df` is left untouched.
df_no_missing_cols = df.dropna(axis='columns', how='any')
print("\nDataFrame after dropping columns with missing values:", df_no_missing_cols.head(), sep="\n")



DataFrame after dropping columns with missing values:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  

### 4. Fill Missing Values with a Specific Value

In [49]:
# Replace every missing entry with a constant placeholder (0) in a new frame.
placeholder_value = 0
df_filled_value = df.fillna(value=placeholder_value)
print("\nDataFrame after filling missing values with 0:", df_filled_value.head(), sep="\n")



DataFrame after filling missing values with 0:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0     

### 5. Fill Missing Values Using Forward Fill and Backward Fill Methods

In [50]:
# Forward fill: carry the last valid observation down each column.
# DataFrame.ffill()/bfill() are used instead of fillna(method=...), whose
# `method` argument is deprecated and emits a FutureWarning in recent pandas.
df_forward_fill = df.ffill()
print("\nDataFrame after forward fill:")
print(df_forward_fill.head())

# Backward fill: pull the next valid observation up each column.
df_backward_fill = df.bfill()
print("\nDataFrame after backward fill:")
print(df_backward_fill.head())


  df_forward_fill = df.fillna(method='ffill')
  df_backward_fill = df.fillna(method='bfill')



DataFrame after forward fill:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0        3.965        3

### 6. Interpolate Missing Values

In [51]:
# Interpolate missing values.
# Only the numeric columns are interpolated: calling DataFrame.interpolate()
# on a frame that still holds object (string) columns is deprecated in recent
# pandas and triggers a FutureWarning. The non-numeric columns are copied over
# unchanged, so the resulting frame has the same columns in the same order.
numeric_columns = df.select_dtypes(include='number').columns
df_interpolated = df.copy()
df_interpolated[numeric_columns] = df[numeric_columns].interpolate()
print("\nDataFrame after interpolating missing values:")
print(df_interpolated.head())



DataFrame after interpolating missing values:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0      

  df_interpolated = df.interpolate()


### 7. Convert a Column to a Different Data Type

In [52]:
# Cast the '2018' column from float to 64-bit integers, overwriting it in `df`.
# Note: the cast discards the fractional part (e.g. 3.952 is stored as 3).
column_2018_as_int = df['2018'].astype('int64')
df['2018'] = column_2018_as_int
print("\nDataFrame after converting '2018' column to integer type:", df.dtypes, sep="\n")



DataFrame after converting '2018' column to integer type:
airpol       object
cpa08        object
induse       object
origin       object
unit         object
geo\time     object
2018          int64
2017        float64
2016        float64
2015        float64
2014        float64
2013        float64
2012        float64
2011        float64
2010        float64
2009        float64
2008        float64
dtype: object


### 8. Apply a Function to Transform the Values of a Column

In [53]:
# Transform the values of a column: double every entry of '2018'.
# A vectorized multiplication replaces apply(lambda x: x * 2); it yields the
# same values without calling a Python function once per row.
# NOTE: this overwrites '2018' in place, so re-running the cell doubles the
# column again.
df['2018'] = df['2018'] * 2
print("\nDataFrame after applying a function to '2018' column:")
print(df.head())



DataFrame after applying a function to '2018' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0        3.965        3

### 9. Normalize a Column Using Min-Max Scaling

In [54]:
# Min-Max scaling of '2018': (value - min) / (max - min), stored in a new
# '2018_normalized' column.
values_2018 = df['2018']
lowest_2018, highest_2018 = values_2018.min(), values_2018.max()
df['2018_normalized'] = (values_2018 - lowest_2018) / (highest_2018 - lowest_2018)
print("\nDataFrame after Min-Max scaling of '2018' column:", df.head(), sep="\n")



DataFrame after Min-Max scaling of '2018' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normalized  
0        3

### 10. Standardize a Column (Z-Score Normalization)

In [55]:
# Z-score standardization of '2018': (value - mean) / standard deviation,
# stored in a new '2018_standardized' column.
series_2018 = df['2018']
mean_2018 = series_2018.mean()
std_2018 = series_2018.std()
df['2018_standardized'] = (series_2018 - mean_2018) / std_2018
print("\nDataFrame after z-score normalization of '2018' column:", df.head(), sep="\n")



DataFrame after z-score normalization of '2018' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normalized  \
0  

### 11. Identify Duplicate Rows in the DataFrame

In [56]:
# Flag rows that repeat an earlier row across all columns, then show them.
duplicates = df.duplicated()
duplicate_rows = df.loc[duplicates]
print("\nDuplicate rows in the DataFrame:", duplicate_rows, sep="\n")



Duplicate rows in the DataFrame:
Empty DataFrame
Columns: [airpol, cpa08, induse, origin, unit, geo\time, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2018_normalized, 2018_standardized]
Index: []


### 12. Drop Duplicate Rows

In [57]:
# Build a copy without fully duplicated rows, keeping the first occurrence.
df_no_duplicates = df.drop_duplicates(keep='first')
print("\nDataFrame after dropping duplicate rows:", df_no_duplicates.head(), sep="\n")



DataFrame after dropping duplicate rows:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normalized  \
0        3.965    

### 13. Drop Duplicate Rows Based on Specific Columns

In [58]:
# Keep only the first row for each distinct ('airpol', 'cpa08') pair.
dedup_key_columns = ['airpol', 'cpa08']
df_no_dup_specific = df.drop_duplicates(subset=dedup_key_columns)
print("\nDataFrame after dropping duplicate rows based on 'airpol' and 'cpa08':", df_no_dup_specific.head(), sep="\n")



DataFrame after dropping duplicate rows based on 'airpol' and 'cpa08':
    airpol       cpa08 induse origin    unit geo\time  2018   2017   2016  \
0      ACG     CPA_A01     P3    DOM  KG_HAB     EU28     6  4.034  3.952   
108    ACG     CPA_A02     P3    DOM  KG_HAB     EU28     0  0.019  0.018   
216    ACG     CPA_A03     P3    DOM  KG_HAB     EU28     0  0.104  0.111   
324    ACG       CPA_B     P3    DOM  KG_HAB     EU28     0  0.019  0.017   
432    ACG  CPA_C10-12     P3    DOM  KG_HAB     EU28    10  5.429  6.062   

      2015   2014   2013   2012   2011   2010   2009   2008  2018_normalized  \
0    3.901  3.824  4.182  4.260  3.965  3.955  4.025  4.082         0.002689   
108  0.021  0.017  0.017  0.017  0.018  0.020  0.021  0.022         0.002689   
216  0.088  0.097  0.108  0.108  0.111  0.130  0.137  0.130         0.002689   
324  0.023  0.016  0.020  0.017  0.022  0.032  0.035  0.039         0.002689   
432  5.752  5.873  6.135  6.249  6.170  6.442  6.808  6.603      

### 14. Convert All String Values in a Column to Lowercase

In [59]:
# Lower-case every string in 'airpol', overwriting the column in `df`.
airpol_lowercased = df['airpol'].str.lower()
df['airpol'] = airpol_lowercased
print("\nDataFrame after converting 'airpol' column to lowercase:", df.head(), sep="\n")



DataFrame after converting 'airpol' column to lowercase:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normalized  \
0 

### 15. Remove Leading and Trailing Spaces from String Values in a Column

In [60]:
# Trim leading and trailing whitespace from every 'airpol' string, in place.
airpol_trimmed = df['airpol'].str.strip()
df['airpol'] = airpol_trimmed
print("\nDataFrame after removing leading and trailing spaces from 'airpol' column:", df.head(), sep="\n")



DataFrame after removing leading and trailing spaces from 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  201

### 16. Replace a Specific Substring in a Column with Another Substring

In [61]:
# Rewrite every occurrence of the substring 'co2' in 'airpol' as 'CO2'.
# This is a substring replacement, so values that merely contain 'co2' are
# changed as well.
airpol_with_co2_upper = df['airpol'].str.replace('co2', 'CO2')
df['airpol'] = airpol_with_co2_upper
print("\nDataFrame after replacing 'co2' with 'CO2' in 'airpol' column:", df.head(), sep="\n")



DataFrame after replacing 'co2' with 'CO2' in 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normalized

### 17. Extract a Substring from Each Value in a Column

In [62]:
# Store the first three characters of each 'airpol' value in a new column.
prefix_length = 3
df['airpol_substring'] = df['airpol'].str.slice(0, prefix_length)
print("\nDataFrame after extracting first 3 characters of 'airpol' column:", df.head(), sep="\n")



DataFrame after extracting first 3 characters of 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normali

### 18. Convert a Column to Datetime Format

In [63]:
# Demonstration only: parse one date string and broadcast it to every row,
# giving the frame a datetime column to work with.
demo_timestamp = pd.to_datetime('2020-01-01')
df['date'] = demo_timestamp
print("\nDataFrame after adding 'date' column:", df.head(), sep="\n")



DataFrame after adding 'date' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...         2013         2012         2011  \
0        3.952        3.901  ...        4.182        4.260        3.965   
1  2018821.000  1986991.000  ...  2116310.000  2149511.000  1996301.000   
2     2018.821     1986.991  ...     2116.310     2149.511     1996.301   
3        0.862        0.898  ...        0.706        0.762        0.857   
4   440390.000   457340.000  ...   357358.000   384481.000   431637.000   

          2010         2009         2008  2018_normalized  2018_s

### 19. Extract Year, Month, and Day from a Datetime Column

In [64]:
# Split the datetime column into separate 'year', 'month' and 'day' columns
# using the .dt accessor.
date_accessor = df['date'].dt
df['year'] = date_accessor.year
df['month'] = date_accessor.month
df['day'] = date_accessor.day
print("\nDataFrame after extracting year, month, and day from 'date' column:", df.head(), sep="\n")



DataFrame after extracting year, month, and day from 'date' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...         2010         2009         2008  \
0        3.952        3.901  ...        3.955        4.025        4.082   
1  2018821.000  1986991.000  ...  1992765.000  2023324.000  2046045.000   
2     2018.821     1986.991  ...     1992.765     2023.324     2046.045   
3        0.862        0.898  ...        0.853        0.774        0.991   
4   440390.000   457340.000  ...   429703.000   389072.000   496690.000   

   2018_normalized  2018_standardiz

### 20. Filter Rows Based on a Date Range

In [66]:
# Keep the rows whose 'date' lies in [2010-01-01, 2018-12-31], both ends
# inclusive. Every row was given the date 2020-01-01 when the column was
# created, so with this data the filter selects nothing.
in_date_range = df['date'].between('2010-01-01', '2018-12-31')
df_date_filtered = df[in_date_range]
print("\nDataFrame after filtering rows based on date range 2010-2018:", df_date_filtered.head(), sep="\n")



DataFrame after filtering rows based on date range 2010-2018:
Empty DataFrame
Columns: [airpol, cpa08, induse, origin, unit, geo\time, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2018_normalized, 2018_standardized, airpol_substring, date, year, month, day]
Index: []

[0 rows x 24 columns]


### 21. Convert a Categorical Column to Numerical Using One-Hot Encoding

In [67]:
# One-hot encode 'airpol': the column is replaced by one indicator column
# per distinct value (prefixed 'airpol_') in a new frame.
columns_to_encode = ['airpol']
df_one_hot_encoded = pd.get_dummies(df, columns=columns_to_encode)
print("\nDataFrame after one-hot encoding 'airpol' column:", df_one_hot_encoded.head(), sep="\n")



DataFrame after one-hot encoding 'airpol' column:
     cpa08 induse origin    unit geo\time     2018         2017         2016  \
0  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034        3.952   
1  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000  2018821.000   
2  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165     2018.821   
3  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930        0.862   
4  CPA_A01     P3    ROW       T     EU28   955988   476142.000   440390.000   

          2015         2014  ...  airpol_nh3_so2e  airpol_nmvoc  airpol_nox  \
0        3.901        3.824  ...            False         False       False   
1  1986991.000  1942285.000  ...            False         False       False   
2     1986.991     1942.285  ...            False         False       False   
3        0.898        0.732  ...            False         False       False   
4   457340.000   371967.000  ...            False         False       Fal

### 22. Convert a Categorical Column to Numerical Using Label Encoding

In [68]:
# Label encoding: view 'airpol' as a categorical and store each value's
# integer category code in a new 'airpol_encoded' column.
airpol_as_category = df['airpol'].astype('category')
df['airpol_encoded'] = airpol_as_category.cat.codes
print("\nDataFrame after label encoding 'airpol' column:", df.head(), sep="\n")



DataFrame after label encoding 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...         2009         2008  2018_normalized  \
0        3.952        3.901  ...        4.025        4.082         0.002689   
1  2018821.000  1986991.000  ...  2023324.000  2046045.000         0.003056   
2     2018.821     1986.991  ...     2023.324     2046.045         0.002690   
3        0.862        0.898  ...        0.774        0.991         0.002689   
4   440390.000   457340.000  ...   389072.000   496690.000         0.002776   

   2018_standardized  airpol_su

### 23. Group Values in a Categorical Column and Create a New Column with Grouped Categories

In [69]:
# Group values in a categorical column and create a new column with grouped categories
def group_airpol(x):
    """Map a pollutant code to a coarse group label.

    Returns 'group1' for the codes 'co2' and 'ch4' and 'group2' for anything
    else. The comparison ignores case: an earlier cell rewrites 'co2' as
    'CO2' in the 'airpol' column, so a case-sensitive lookup would never
    match those rows. Non-string values (e.g. NaN) fall into 'group2'.
    """
    if isinstance(x, str) and x.lower() in ('co2', 'ch4'):
        return 'group1'
    return 'group2'

# Label every row with its pollutant group in a new 'airpol_grouped' column.
airpol_group_labels = df['airpol'].apply(group_airpol)
df['airpol_grouped'] = airpol_group_labels
print("\nDataFrame after grouping 'airpol' column:", df.head(), sep="\n")



DataFrame after grouping 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...         2008  2018_normalized  \
0        3.952        3.901  ...        4.082         0.002689   
1  2018821.000  1986991.000  ...  2046045.000         0.003056   
2     2018.821     1986.991  ...     2046.045         0.002690   
3        0.862        0.898  ...        0.991         0.002689   
4   440390.000   457340.000  ...   496690.000         0.002776   

   2018_standardized  airpol_substring       date  year  month  day  \
0          -0.033285               acg 2020-

### 24. Merge Two DataFrames Based on a Common Column

In [70]:
# Create another DataFrame to merge: a small lookup table of pollutant codes
# and their human-readable descriptions.
df2 = pd.DataFrame({
    'airpol': ['co2', 'ch4', 'n2o', 'co', 'nh3'],
    'description': ['Carbon Dioxide', 'Methane', 'Nitrous Oxide', 'Carbon Monoxide', 'Ammonia']
})

# Merge two DataFrames based on a common column.
# The lookup codes are lower-case, but an earlier cell rewrites 'co2' as 'CO2'
# in df['airpol'], so an exact-match inner join would silently drop those rows.
# Join on a temporary lower-cased key instead, then discard it; the result
# keeps df's columns (with its original 'airpol' values) plus 'description'.
df_merged = (
    df.assign(_airpol_key=df['airpol'].str.lower())
    .merge(
        df2.rename(columns={'airpol': '_airpol_key'}),
        on='_airpol_key',
        how='inner',
    )
    .drop(columns='_airpol_key')
)
print("\nMerged DataFrame based on 'airpol' column:")
print(df_merged.head())



Merged DataFrame based on 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    ch4  CPA_A01     P3    DOM  KG_HAB     EU28        8        5.053   
1    ch4  CPA_A01     P3    DOM       T     EU28  5113510  2586624.000   
2    ch4  CPA_A01     P3    DOM   THS_T     EU28     5112     2586.624   
3    ch4  CPA_A01     P3    ROW  KG_HAB     EU28        2        1.200   
4    ch4  CPA_A01     P3    ROW       T     EU28  1238778   614328.000   

          2016         2015  ...  2018_normalized  2018_standardized  \
0        4.948        4.888  ...         0.002689          -0.033285   
1  2527350.000  2489581.000  ...         0.003151           0.010388   
2     2527.350     2489.581  ...         0.002690          -0.033242   
3        1.110        1.173  ...         0.002689          -0.033285   
4   566745.000   597375.000  ...         0.002801          -0.022705   

   airpol_substring       date  year  month  day  airpol_encoded  \
0         

### 25. Concatenate Two DataFrames Vertically

In [71]:
# Stack `df` on top of itself (row-wise concatenation). The original index
# labels are kept, so each label appears twice in the result.
frames_to_stack = (df, df)
df_concat_vert = pd.concat(frames_to_stack, axis='index')
print("\nDataFrame after vertical concatenation:", df_concat_vert.head(), sep="\n")



DataFrame after vertical concatenation:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...         2008  2018_normalized  \
0        3.952        3.901  ...        4.082         0.002689   
1  2018821.000  1986991.000  ...  2046045.000         0.003056   
2     2018.821     1986.991  ...     2046.045         0.002690   
3        0.862        0.898  ...        0.991         0.002689   
4   440390.000   457340.000  ...   496690.000         0.002776   

   2018_standardized  airpol_substring       date  year  month  day  \
0          -0.033285               acg 2020-01

### 26. Concatenate Two DataFrames Horizontally

In [72]:
# Place `df2` beside `df` (column-wise concatenation), aligning rows on the
# index. Both frames have an 'airpol' column, so that name appears twice.
frames_side_by_side = (df, df2)
df_concat_horiz = pd.concat(frames_side_by_side, axis='columns')
print("\nDataFrame after horizontal concatenation:", df_concat_horiz.head(), sep="\n")


DataFrame after horizontal concatenation:
   airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0     acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1     acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2     acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3     acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4     acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...  2018_standardized  airpol_substring  \
0        3.952        3.901  ...          -0.033285               acg   
1  2018821.000  1986991.000  ...           0.001351               acg   
2     2018.821     1986.991  ...          -0.033251               acg   
3        0.862        0.898  ...          -0.033285               acg   
4   440390.000   457340.000  ...          -0.025120               acg   

        date  year  month  day  airpol_encoded  airpol_grouped  air

### 27. Create a New Column Based on Existing Columns

In [73]:
# Derive a new column as the row-wise sum of the 2014-2018 yearly columns.
summed_year_columns = ['2018', '2017', '2016', '2015', '2014']
df['total_emissions'] = df[summed_year_columns].sum(axis='columns')
print("\nDataFrame after creating 'total_emissions' column:", df.head(), sep="\n")


DataFrame after creating 'total_emissions' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...  2018_normalized  2018_standardized  \
0        3.952        3.901  ...         0.002689          -0.033285   
1  2018821.000  1986991.000  ...         0.003056           0.001351   
2     2018.821     1986.991  ...         0.002690          -0.033251   
3        0.862        0.898  ...         0.002689          -0.033285   
4   440390.000   457340.000  ...         0.002776          -0.025120   

   airpol_substring       date  year  month  day  airpol_encoded  \
0 

### 28. Discretize a Continuous Column into Bins

In [74]:
# Split the range of '2018' into three equal-width intervals and label each
# row with the interval it falls into.
bin_labels = ['Low', 'Medium', 'High']
df['emission_bins'] = pd.cut(df['2018'], bins=len(bin_labels), labels=bin_labels)
print("\nDataFrame after discretizing '2018' column into bins:", df.head(), sep="\n")


DataFrame after discretizing '2018' column into bins:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...  2018_standardized  airpol_substring  \
0        3.952        3.901  ...          -0.033285               acg   
1  2018821.000  1986991.000  ...           0.001351               acg   
2     2018.821     1986.991  ...          -0.033251               acg   
3        0.862        0.898  ...          -0.033285               acg   
4   440390.000   457340.000  ...          -0.025120               acg   

        date  year  month  day  airpol_encoded  airpol_groupe

### 29. Create Polynomial Features from Existing Numerical Columns

In [75]:
from sklearn.preprocessing import PolynomialFeatures

# Expand '2018' and '2017' into all polynomial terms up to degree 2:
# the bias term, both originals, both squares and their product.
poly_input_columns = ['2018', '2017']
poly_output_columns = ['1', '2018', '2017', '2018^2', '2018*2017', '2017^2']

poly = PolynomialFeatures(degree=2)
poly_features = poly.fit_transform(df[poly_input_columns])

df_poly = pd.DataFrame(poly_features, columns=poly_output_columns)
print("\nDataFrame with polynomial features:", df_poly.head(), sep="\n")


DataFrame with polynomial features:
     1       2018         2017        2018^2     2018*2017        2017^2
0  1.0        6.0        4.034  3.600000e+01  2.420400e+01  1.627316e+01
1  1.0  4055418.0  2065165.000  1.644642e+13  8.375107e+12  4.264906e+12
2  1.0     4054.0     2065.165  1.643492e+07  8.372179e+06  4.264906e+06
3  1.0        0.0        0.930  0.000000e+00  0.000000e+00  8.649000e-01
4  1.0   955988.0   476142.000  9.139131e+11  4.551860e+11  2.267112e+11
