# Task 14: Data cleaning and preprocessing with Pandas

# Dataset 1 --> emissions_EU28.csv

### Import Pandas and Load the Dataset

In [1]:
import pandas as pd
df = pd.read_csv('emissions_EU28.csv')
print("Original DataFrame:")
print(df.head())
print(df.dtypes)

Original DataFrame:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0        3.965        3.955       

### 1. Identify Missing Values

In [2]:
print("\nMissing values in each column:")
print(df.isnull().sum())


Missing values in each column:
airpol      0
cpa08       0
induse      0
origin      0
unit        0
geo\time    0
2018        0
2017        0
2016        0
2015        0
2014        0
2013        0
2012        0
2011        0
2010        0
2009        0
2008        0
dtype: int64


### 2. Drop Rows with Any Missing Values

In [3]:
df_no_missing_rows = df.dropna()
print("\nDataFrame after dropping rows with missing values:")
print(df_no_missing_rows.head())


DataFrame after dropping rows with missing values:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0 

### 3. Drop Columns with Any Missing Values

In [4]:
df_no_missing_cols = df.dropna(axis=1)
print("\nDataFrame after dropping columns with missing values:")
print(df_no_missing_cols.head())


DataFrame after dropping columns with missing values:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  

### 4. Fill Missing Values with a Specific Value

In [5]:
df_filled_value = df.fillna(0)
print("\nDataFrame after filling missing values with 0:")
print(df_filled_value.head())


DataFrame after filling missing values with 0:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0     

### 5. Fill Missing Values Using Forward Fill and Backward Fill Methods

In [6]:
df_forward_fill = df.fillna(method='ffill')
print("\nDataFrame after forward fill:")
print(df_forward_fill.head())

# Backward fill missing values
df_backward_fill = df.fillna(method='bfill')
print("\nDataFrame after backward fill:")
print(df_backward_fill.head())

  df_forward_fill = df.fillna(method='ffill')



DataFrame after forward fill:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0        3.965        3

  df_backward_fill = df.fillna(method='bfill')



DataFrame after backward fill:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0        3.965        

### 6. Interpolate Missing Values

In [7]:
df_interpolated = df.interpolate()
print("\nDataFrame after interpolating missing values:")
print(df_interpolated.head())


DataFrame after interpolating missing values:
  airpol    cpa08 induse origin    unit geo\time         2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        3.952        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  2027709.000  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     2027.709     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0.932        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   477994.000   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0      

  df_interpolated = df.interpolate()


### 7. Convert a Column to a Different Data Type

In [8]:
df['2018'] = df['2018'].astype('int64')
print("\nDataFrame after converting '2018' column to integer type:")
print(df.dtypes)


DataFrame after converting '2018' column to integer type:
airpol       object
cpa08        object
induse       object
origin       object
unit         object
geo\time     object
2018          int64
2017        float64
2016        float64
2015        float64
2014        float64
2013        float64
2012        float64
2011        float64
2010        float64
2009        float64
2008        float64
dtype: object


### 8. Apply a Function to Transform the Values of a Column

In [9]:
df['2018'] = df['2018'].apply(lambda x: x * 2)
print("\nDataFrame after applying a function to '2018' column:")
print(df.head())


DataFrame after applying a function to '2018' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  
0        3.965        3

### 9. Normalize a Column Using Min-Max Scaling

In [10]:
df['2018_normalized'] = (df['2018'] - df['2018'].min()) / (df['2018'].max() - df['2018'].min())
print("\nDataFrame after Min-Max scaling of '2018' column:")
print(df.head())


DataFrame after Min-Max scaling of '2018' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normalized  
0        3

### 10. Standardize a Column (Z-Score Normalization)

In [11]:
df['2018_standardized'] = (df['2018'] - df['2018'].mean()) / df['2018'].std()
print("\nDataFrame after z-score normalization of '2018' column:")
print(df.head())


DataFrame after z-score normalization of '2018' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normalized  \
0  

### 11. Identify Duplicate Rows in the DataFrame

In [12]:
duplicates = df.duplicated()
print("\nDuplicate rows in the DataFrame:")
print(df[duplicates])


Duplicate rows in the DataFrame:
Empty DataFrame
Columns: [airpol, cpa08, induse, origin, unit, geo\time, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2018_normalized, 2018_standardized]
Index: []


### 12. Drop Duplicate Rows

In [13]:
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame after dropping duplicate rows:")
print(df_no_duplicates.head())


DataFrame after dropping duplicate rows:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    ACG  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    ACG  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    ACG  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    ACG  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    ACG  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normalized  \
0        3.965    

### 13. Drop Duplicate Rows Based on Specific Columns

In [14]:
df_no_dup_specific = df.drop_duplicates(subset=['airpol', 'cpa08'])
print("\nDataFrame after dropping duplicate rows based on 'airpol' and 'cpa08':")
print(df_no_dup_specific.head())


DataFrame after dropping duplicate rows based on 'airpol' and 'cpa08':
    airpol       cpa08 induse origin    unit geo\time  2018   2017   2016  \
0      ACG     CPA_A01     P3    DOM  KG_HAB     EU28     6  4.034  3.952   
108    ACG     CPA_A02     P3    DOM  KG_HAB     EU28     0  0.019  0.018   
216    ACG     CPA_A03     P3    DOM  KG_HAB     EU28     0  0.104  0.111   
324    ACG       CPA_B     P3    DOM  KG_HAB     EU28     0  0.019  0.017   
432    ACG  CPA_C10-12     P3    DOM  KG_HAB     EU28    10  5.429  6.062   

      2015   2014   2013   2012   2011   2010   2009   2008  2018_normalized  \
0    3.901  3.824  4.182  4.260  3.965  3.955  4.025  4.082         0.002689   
108  0.021  0.017  0.017  0.017  0.018  0.020  0.021  0.022         0.002689   
216  0.088  0.097  0.108  0.108  0.111  0.130  0.137  0.130         0.002689   
324  0.023  0.016  0.020  0.017  0.022  0.032  0.035  0.039         0.002689   
432  5.752  5.873  6.135  6.249  6.170  6.442  6.808  6.603      

### 14. Convert All String Values in a Column to Lowercase

In [15]:
df['airpol'] = df['airpol'].str.lower()
print("\nDataFrame after converting 'airpol' column to lowercase:")
print(df.head())


DataFrame after converting 'airpol' column to lowercase:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normalized  \
0 

### 15. Remove Leading and Trailing Spaces from String Values in a Column

In [16]:
df['airpol'] = df['airpol'].str.strip()
print("\nDataFrame after removing leading and trailing spaces from 'airpol' column:")
print(df.head())


DataFrame after removing leading and trailing spaces from 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  201

### 16. Replace a Specific Substring in a Column with Another Substring

In [17]:
df['airpol'] = df['airpol'].str.replace('co2', 'CO2')
print("\nDataFrame after replacing 'co2' with 'CO2' in 'airpol' column:")
print(df.head())


DataFrame after replacing 'co2' with 'CO2' in 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normalized

### 17. Extract a Substring from Each Value in a Column

In [18]:
df['airpol_substring'] = df['airpol'].str[:3]
print("\nDataFrame after extracting first 3 characters of 'airpol' column:")
print(df.head())


DataFrame after extracting first 3 characters of 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015         2014         2013         2012  \
0        3.952        3.901        3.824        4.182        4.260   
1  2018821.000  1986991.000  1942285.000  2116310.000  2149511.000   
2     2018.821     1986.991     1942.285     2116.310     2149.511   
3        0.862        0.898        0.732        0.706        0.762   
4   440390.000   457340.000   371967.000   357358.000   384481.000   

          2011         2010         2009         2008  2018_normali

### 18. Convert a Column to Datetime Format

In [19]:
df['date'] = pd.to_datetime('2020-01-01')
print("\nDataFrame after adding 'date' column:")
print(df.head())


DataFrame after adding 'date' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...         2013         2012         2011  \
0        3.952        3.901  ...        4.182        4.260        3.965   
1  2018821.000  1986991.000  ...  2116310.000  2149511.000  1996301.000   
2     2018.821     1986.991  ...     2116.310     2149.511     1996.301   
3        0.862        0.898  ...        0.706        0.762        0.857   
4   440390.000   457340.000  ...   357358.000   384481.000   431637.000   

          2010         2009         2008  2018_normalized  2018_s

### 19. Extract Year, Month, and Day from a Datetime Column

In [20]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
print("\nDataFrame after extracting year, month, and day from 'date' column:")
print(df.head())


DataFrame after extracting year, month, and day from 'date' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...         2010         2009         2008  \
0        3.952        3.901  ...        3.955        4.025        4.082   
1  2018821.000  1986991.000  ...  1992765.000  2023324.000  2046045.000   
2     2018.821     1986.991  ...     1992.765     2023.324     2046.045   
3        0.862        0.898  ...        0.853        0.774        0.991   
4   440390.000   457340.000  ...   429703.000   389072.000   496690.000   

   2018_normalized  2018_standardiz

### 20. Filter Rows Based on a Date Range

In [21]:
df_date_filtered = df[(df['date'] >= '2010-01-01') & (df['date'] <= '2018-12-31')]
print("\nDataFrame after filtering rows based on date range 2010-2018:")
print(df_date_filtered.head())


DataFrame after filtering rows based on date range 2010-2018:
Empty DataFrame
Columns: [airpol, cpa08, induse, origin, unit, geo\time, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2018_normalized, 2018_standardized, airpol_substring, date, year, month, day]
Index: []

[0 rows x 24 columns]


### 21. Convert a Categorical Column to Numerical Using One-Hot Encoding

In [22]:
df_one_hot_encoded = pd.get_dummies(df, columns=['airpol'])
print("\nDataFrame after one-hot encoding 'airpol' column:")
print(df_one_hot_encoded.head())


DataFrame after one-hot encoding 'airpol' column:
     cpa08 induse origin    unit geo\time     2018         2017         2016  \
0  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034        3.952   
1  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000  2018821.000   
2  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165     2018.821   
3  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930        0.862   
4  CPA_A01     P3    ROW       T     EU28   955988   476142.000   440390.000   

          2015         2014  ...  airpol_nh3_so2e  airpol_nmvoc  airpol_nox  \
0        3.901        3.824  ...            False         False       False   
1  1986991.000  1942285.000  ...            False         False       False   
2     1986.991     1942.285  ...            False         False       False   
3        0.898        0.732  ...            False         False       False   
4   457340.000   371967.000  ...            False         False       Fal

### 22. Convert a Categorical Column to Numerical Using Label Encoding

In [23]:
df['airpol_encoded'] = df['airpol'].astype('category').cat.codes
print("\nDataFrame after label encoding 'airpol' column:")
print(df.head())


DataFrame after label encoding 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...         2009         2008  2018_normalized  \
0        3.952        3.901  ...        4.025        4.082         0.002689   
1  2018821.000  1986991.000  ...  2023324.000  2046045.000         0.003056   
2     2018.821     1986.991  ...     2023.324     2046.045         0.002690   
3        0.862        0.898  ...        0.774        0.991         0.002689   
4   440390.000   457340.000  ...   389072.000   496690.000         0.002776   

   2018_standardized  airpol_su

### 23. Group Values in a Categorical Column and Create a New Column with Grouped Categories

In [24]:
def group_airpol(x):
    if x in ['co2', 'ch4']:
        return 'group1'
    else:
        return 'group2'

df['airpol_grouped'] = df['airpol'].apply(group_airpol)
print("\nDataFrame after grouping 'airpol' column:")
print(df.head())


DataFrame after grouping 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...         2008  2018_normalized  \
0        3.952        3.901  ...        4.082         0.002689   
1  2018821.000  1986991.000  ...  2046045.000         0.003056   
2     2018.821     1986.991  ...     2046.045         0.002690   
3        0.862        0.898  ...        0.991         0.002689   
4   440390.000   457340.000  ...   496690.000         0.002776   

   2018_standardized  airpol_substring       date  year  month  day  \
0          -0.033285               acg 2020-

### 24. Merge Two DataFrames Based on a Common Column

In [25]:
df2 = pd.DataFrame({
    'airpol': ['co2', 'ch4', 'n2o', 'co', 'nh3'],
    'description': ['Carbon Dioxide', 'Methane', 'Nitrous Oxide', 'Carbon Monoxide', 'Ammonia']
})

df_merged = pd.merge(df, df2, on='airpol')
print("\nMerged DataFrame based on 'airpol' column:")
print(df_merged.head())


Merged DataFrame based on 'airpol' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    ch4  CPA_A01     P3    DOM  KG_HAB     EU28        8        5.053   
1    ch4  CPA_A01     P3    DOM       T     EU28  5113510  2586624.000   
2    ch4  CPA_A01     P3    DOM   THS_T     EU28     5112     2586.624   
3    ch4  CPA_A01     P3    ROW  KG_HAB     EU28        2        1.200   
4    ch4  CPA_A01     P3    ROW       T     EU28  1238778   614328.000   

          2016         2015  ...  2018_normalized  2018_standardized  \
0        4.948        4.888  ...         0.002689          -0.033285   
1  2527350.000  2489581.000  ...         0.003151           0.010388   
2     2527.350     2489.581  ...         0.002690          -0.033242   
3        1.110        1.173  ...         0.002689          -0.033285   
4   566745.000   597375.000  ...         0.002801          -0.022705   

   airpol_substring       date  year  month  day  airpol_encoded  \
0         

### 25. Concatenate Two DataFrames Vertically

In [26]:
df_concat_vert = pd.concat([df, df], axis=0)
print("\nDataFrame after vertical concatenation:")
print(df_concat_vert.head())


DataFrame after vertical concatenation:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...         2008  2018_normalized  \
0        3.952        3.901  ...        4.082         0.002689   
1  2018821.000  1986991.000  ...  2046045.000         0.003056   
2     2018.821     1986.991  ...     2046.045         0.002690   
3        0.862        0.898  ...        0.991         0.002689   
4   440390.000   457340.000  ...   496690.000         0.002776   

   2018_standardized  airpol_substring       date  year  month  day  \
0          -0.033285               acg 2020-01

### 26. Concatenate Two DataFrames Horizontally

In [27]:
df_concat_horiz = pd.concat([df, df2], axis=1)
print("\nDataFrame after horizontal concatenation:")
print(df_concat_horiz.head())


DataFrame after horizontal concatenation:
   airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0     acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1     acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2     acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3     acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4     acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...  2018_standardized  airpol_substring  \
0        3.952        3.901  ...          -0.033285               acg   
1  2018821.000  1986991.000  ...           0.001351               acg   
2     2018.821     1986.991  ...          -0.033251               acg   
3        0.862        0.898  ...          -0.033285               acg   
4   440390.000   457340.000  ...          -0.025120               acg   

        date  year  month  day  airpol_encoded  airpol_grouped  air

### 27. Create a New Column Based on Existing Columns

In [28]:
df['total_emissions'] = df[['2018', '2017', '2016', '2015', '2014']].sum(axis=1)
print("\nDataFrame after creating 'total_emissions' column:")
print(df.head())


DataFrame after creating 'total_emissions' column:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...  2018_normalized  2018_standardized  \
0        3.952        3.901  ...         0.002689          -0.033285   
1  2018821.000  1986991.000  ...         0.003056           0.001351   
2     2018.821     1986.991  ...         0.002690          -0.033251   
3        0.862        0.898  ...         0.002689          -0.033285   
4   440390.000   457340.000  ...         0.002776          -0.025120   

   airpol_substring       date  year  month  day  airpol_encoded  \
0 

### 28. Discretize a Continuous Column into Bins

In [29]:
df['emission_bins'] = pd.cut(df['2018'], bins=3, labels=['Low', 'Medium', 'High'])
print("\nDataFrame after discretizing '2018' column into bins:")
print(df.head())


DataFrame after discretizing '2018' column into bins:
  airpol    cpa08 induse origin    unit geo\time     2018         2017  \
0    acg  CPA_A01     P3    DOM  KG_HAB     EU28        6        4.034   
1    acg  CPA_A01     P3    DOM       T     EU28  4055418  2065165.000   
2    acg  CPA_A01     P3    DOM   THS_T     EU28     4054     2065.165   
3    acg  CPA_A01     P3    ROW  KG_HAB     EU28        0        0.930   
4    acg  CPA_A01     P3    ROW       T     EU28   955988   476142.000   

          2016         2015  ...  2018_standardized  airpol_substring  \
0        3.952        3.901  ...          -0.033285               acg   
1  2018821.000  1986991.000  ...           0.001351               acg   
2     2018.821     1986.991  ...          -0.033251               acg   
3        0.862        0.898  ...          -0.033285               acg   
4   440390.000   457340.000  ...          -0.025120               acg   

        date  year  month  day  airpol_encoded  airpol_groupe

### 29. Create Polynomial Features from Existing Numerical Columns

In [30]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
poly_features = poly.fit_transform(df[['2018', '2017']])
df_poly = pd.DataFrame(poly_features, columns=['1', '2018', '2017', '2018^2', '2018*2017', '2017^2'])
print("\nDataFrame with polynomial features:")
print(df_poly.head())


DataFrame with polynomial features:
     1       2018         2017        2018^2     2018*2017        2017^2
0  1.0        6.0        4.034  3.600000e+01  2.420400e+01  1.627316e+01
1  1.0  4055418.0  2065165.000  1.644642e+13  8.375107e+12  4.264906e+12
2  1.0     4054.0     2065.165  1.643492e+07  8.372179e+06  4.264906e+06
3  1.0        0.0        0.930  0.000000e+00  0.000000e+00  8.649000e-01
4  1.0   955988.0   476142.000  9.139131e+11  4.551860e+11  2.267112e+11


# Dataset 2 --> final_book_dataset_kaggle2.csv

### Import Pandas and Load the Dataset

In [31]:
df = pd.read_csv('final_book_dataset_kaggle2.csv')
print("Initial DataFrame:")
print(df.head())
print(df.dtypes)

Initial DataFrame:
                                               title  \
0  Data Analysis Using R (Low Priced Edition): A ...   
1  Head First Data Analysis: A learner's guide to...   
2  Guerrilla Data Analysis Using Microsoft Excel:...   
3  Python for Data Analysis: Data Wrangling with ...   
4  Excel Data Analysis For Dummies (For Dummies (...   

                               author  price price (including used books)  \
0                 [ Dr Dhaval Maheta]   6.75                         6.75   
1                                 NaN  33.72               21.49 - 33.72    
2  [ Oz du Soleil,  and , Bill Jelen]  32.07                        32.07   
3                 [ William McKinney]  53.99                        53.99   
4                   [ Paul McFedries]  24.49                        24.49   

  pages  avg_reviews n_reviews star5 star4 star3 star2 star1  \
0   500          4.4        23   55%   39%    6%   NaN   NaN   
1   484          4.3       124   61%   20%    9%    4

### 1. Identify Missing Values

In [32]:
missing_values = df.isnull().sum()
print("\nMissing values in each column:")
print(missing_values)


Missing values in each column:
title                             0
author                          173
price                           108
price (including used books)    108
pages                            85
avg_reviews                     128
n_reviews                       128
star5                           128
star4                           195
star3                           276
star2                           379
star1                           502
dimensions                      186
weight                          179
language                         71
publisher                       116
ISBN_13                         165
link                              0
complete_link                     0
dtype: int64


### 2. Drop Rows with Any Missing Values

In [33]:
df_dropped_rows = df.dropna()
print("\nDataFrame after dropping rows with missing values:")
print(df_dropped_rows)


DataFrame after dropping rows with missing values:
                                                 title  \
3    Python for Data Analysis: Data Wrangling with ...   
4    Excel Data Analysis For Dummies (For Dummies (...   
6    SQL for Data Analysis: Advanced Techniques for...   
15   SQL QuickStart Guide: The Simplified Beginner'...   
18   Python para Principiantes: 2 Libros en 1: Prog...   
..                                                 ...   
804  Essential Calculus Skills Practice Workbook wi...   
807  Machine Learning with R: Expert techniques for...   
821  Machine Learning with PyTorch and Scikit-Learn...   
825   Deep Learning: Engage the World Change the World   
826  Machine Learning in Finance: From Theory to Pr...   

                                              author  price  \
3                                [ William McKinney]  53.99   
4                                  [ Paul McFedries]  24.49   
6                                  [ Cathy Tanimura]  40.49   

### 3. Drop Rows with Any Missing Values

In [34]:
df_dropped_columns = df.dropna(axis=1)
print("\nDataFrame after dropping columns with missing values:")
print(df_dropped_columns)


DataFrame after dropping columns with missing values:
                                                 title  \
0    Data Analysis Using R (Low Priced Edition): A ...   
1    Head First Data Analysis: A learner's guide to...   
2    Guerrilla Data Analysis Using Microsoft Excel:...   
3    Python for Data Analysis: Data Wrangling with ...   
4    Excel Data Analysis For Dummies (For Dummies (...   
..                                                 ...   
825   Deep Learning: Engage the World Change the World   
826  Machine Learning in Finance: From Theory to Pr...   
827  Practical Deep Learning at Scale with MLflow: ...   
828  Clinical Biostatistics and Epidemiology Made R...   
829  AI and Machine Learning for Coders: A Programm...   

                                                  link  \
0    /Data-Analysis-Using-Low-Priced/dp/1685549594/...   
1    /Head-First-Data-Analysis-statistics/dp/059615...   
2    /Guerrilla-Analysis-Using-Microsoft-Excel/dp/1...   
3    /Python-Dat

### 4. Drop Columns with Any Missing Values

In [35]:
df_filled_value = df.fillna(0)
print("\nDataFrame after filling missing values with 0:")
print(df_filled_value)


DataFrame after filling missing values with 0:
                                                 title  \
0    Data Analysis Using R (Low Priced Edition): A ...   
1    Head First Data Analysis: A learner's guide to...   
2    Guerrilla Data Analysis Using Microsoft Excel:...   
3    Python for Data Analysis: Data Wrangling with ...   
4    Excel Data Analysis For Dummies (For Dummies (...   
..                                                 ...   
825   Deep Learning: Engage the World Change the World   
826  Machine Learning in Finance: From Theory to Pr...   
827  Practical Deep Learning at Scale with MLflow: ...   
828  Clinical Biostatistics and Epidemiology Made R...   
829  AI and Machine Learning for Coders: A Programm...   

                                         author  price  \
0                           [ Dr Dhaval Maheta]   6.75   
1                                             0  33.72   
2            [ Oz du Soleil,  and , Bill Jelen]  32.07   
3                      

### 5. Fill Missing Values with a Specific Value

In [36]:
# Forward fill
df_ffill = df.fillna(method='ffill')
print("\nDataFrame after forward fill:")
print(df_ffill)

# Backward fill
df_bfill = df.fillna(method='bfill')
print("\nDataFrame after backward fill:")
print(df_bfill)


DataFrame after forward fill:
                                                 title  \
0    Data Analysis Using R (Low Priced Edition): A ...   
1    Head First Data Analysis: A learner's guide to...   
2    Guerrilla Data Analysis Using Microsoft Excel:...   
3    Python for Data Analysis: Data Wrangling with ...   
4    Excel Data Analysis For Dummies (For Dummies (...   
..                                                 ...   
825   Deep Learning: Engage the World Change the World   
826  Machine Learning in Finance: From Theory to Pr...   
827  Practical Deep Learning at Scale with MLflow: ...   
828  Clinical Biostatistics and Epidemiology Made R...   
829  AI and Machine Learning for Coders: A Programm...   

                                         author  price  \
0                           [ Dr Dhaval Maheta]   6.75   
1                           [ Dr Dhaval Maheta]  33.72   
2            [ Oz du Soleil,  and , Bill Jelen]  32.07   
3                           [ William Mc

  df_ffill = df.fillna(method='ffill')
  df_bfill = df.fillna(method='bfill')


### 6. Fill Missing Values Using Forward Fill and Backward Fill Methods

In [37]:
df_interpolated = df.interpolate()
print("\nDataFrame after interpolation:")
print(df_interpolated)


DataFrame after interpolation:
                                                 title  \
0    Data Analysis Using R (Low Priced Edition): A ...   
1    Head First Data Analysis: A learner's guide to...   
2    Guerrilla Data Analysis Using Microsoft Excel:...   
3    Python for Data Analysis: Data Wrangling with ...   
4    Excel Data Analysis For Dummies (For Dummies (...   
..                                                 ...   
825   Deep Learning: Engage the World Change the World   
826  Machine Learning in Finance: From Theory to Pr...   
827  Practical Deep Learning at Scale with MLflow: ...   
828  Clinical Biostatistics and Epidemiology Made R...   
829  AI and Machine Learning for Coders: A Programm...   

                                         author  price  \
0                           [ Dr Dhaval Maheta]   6.75   
1                                           NaN  33.72   
2            [ Oz du Soleil,  and , Bill Jelen]  32.07   
3                           [ William M

  df_interpolated = df.interpolate()


### 7. Interpolate Missing Values

In [38]:
import numpy as np 
df['price'].fillna(0, inplace=True)  # Replace NaN with 0 (adjust as needed)
df['price'] = df['price'].astype(int)
print("\nDataFrame after converting 'price' to integer:")
print(df.dtypes)


DataFrame after converting 'price' to integer:
title                            object
author                           object
price                             int32
price (including used books)     object
pages                            object
avg_reviews                     float64
n_reviews                        object
star5                            object
star4                            object
star3                            object
star2                            object
star1                            object
dimensions                       object
weight                           object
language                         object
publisher                        object
ISBN_13                          object
link                             object
complete_link                    object
dtype: object


### 8. Convert a Column to a Different Data Type

In [39]:
df['price'] = df['price'].apply(lambda x: x * 1.1)
print("\nDataFrame after applying a function to 'price':")
print(df.head())


DataFrame after applying a function to 'price':
                                               title  \
0  Data Analysis Using R (Low Priced Edition): A ...   
1  Head First Data Analysis: A learner's guide to...   
2  Guerrilla Data Analysis Using Microsoft Excel:...   
3  Python for Data Analysis: Data Wrangling with ...   
4  Excel Data Analysis For Dummies (For Dummies (...   

                               author  price price (including used books)  \
0                 [ Dr Dhaval Maheta]    6.6                         6.75   
1                                 NaN   36.3               21.49 - 33.72    
2  [ Oz du Soleil,  and , Bill Jelen]   35.2                        32.07   
3                 [ William McKinney]   58.3                        53.99   
4                   [ Paul McFedries]   26.4                        24.49   

  pages  avg_reviews n_reviews star5 star4 star3 star2 star1  \
0   500          4.4        23   55%   39%    6%   NaN   NaN   
1   484          4.3   

### 9. Apply a Function to Transform the Values of a Column

In [40]:
df['price_normalized'] = (df['price'] - df['price'].min()) / (df['price'].max() - df['price'].min())
print("\nDataFrame after Min-Max scaling of 'price':")
print(df[['price', 'price_normalized']].head())


DataFrame after Min-Max scaling of 'price':
   price  price_normalized
0    6.6          0.004552
1   36.3          0.025038
2   35.2          0.024279
3   58.3          0.040212
4   26.4          0.018209


### 10. Normalize a Column Using Min-Max Scaling

In [41]:
df['price_normalized'] = (df['price'] - df['price'].min()) / (df['price'].max() - df['price'].min())
print("\nDataFrame after Min-Max scaling of 'price':")
print(df[['price', 'price_normalized']].head())


DataFrame after Min-Max scaling of 'price':
   price  price_normalized
0    6.6          0.004552
1   36.3          0.025038
2   35.2          0.024279
3   58.3          0.040212
4   26.4          0.018209


### 11. Standardize a Column (Z-Score Normalization)

In [42]:
df['price_standardized'] = (df['price'] - df['price'].mean()) / df['price'].std()
print("\nDataFrame after z-score normalization of 'price':")
print(df[['price', 'price_standardized']].head())


DataFrame after z-score normalization of 'price':
   price  price_standardized
0    6.6           -0.554591
1   36.3           -0.112237
2   35.2           -0.128620
3   58.3            0.215433
4   26.4           -0.259688


### 12. Identify Duplicate Rows in the DataFrame

In [43]:
duplicate_rows = df.duplicated()
print("\nDuplicate rows in the DataFrame:")
print(duplicate_rows.sum())


Duplicate rows in the DataFrame:
0


### 13. Drop Duplicate Rows

In [44]:
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame after dropping duplicate rows:")
print(df_no_duplicates)


DataFrame after dropping duplicate rows:
                                                 title  \
0    Data Analysis Using R (Low Priced Edition): A ...   
1    Head First Data Analysis: A learner's guide to...   
2    Guerrilla Data Analysis Using Microsoft Excel:...   
3    Python for Data Analysis: Data Wrangling with ...   
4    Excel Data Analysis For Dummies (For Dummies (...   
..                                                 ...   
825   Deep Learning: Engage the World Change the World   
826  Machine Learning in Finance: From Theory to Pr...   
827  Practical Deep Learning at Scale with MLflow: ...   
828  Clinical Biostatistics and Epidemiology Made R...   
829  AI and Machine Learning for Coders: A Programm...   

                                         author  price  \
0                           [ Dr Dhaval Maheta]    6.6   
1                                           NaN   36.3   
2            [ Oz du Soleil,  and , Bill Jelen]   35.2   
3                           [

### 14. Drop Duplicate Rows Based on Specific Columns

In [45]:
df_no_duplicates_specified = df.drop_duplicates(subset=['title', 'author'])
print("\nDataFrame after dropping duplicate rows based on 'title' and 'author':")
print(df_no_duplicates_specified)


DataFrame after dropping duplicate rows based on 'title' and 'author':
                                                 title  \
0    Data Analysis Using R (Low Priced Edition): A ...   
1    Head First Data Analysis: A learner's guide to...   
2    Guerrilla Data Analysis Using Microsoft Excel:...   
3    Python for Data Analysis: Data Wrangling with ...   
4    Excel Data Analysis For Dummies (For Dummies (...   
..                                                 ...   
825   Deep Learning: Engage the World Change the World   
826  Machine Learning in Finance: From Theory to Pr...   
827  Practical Deep Learning at Scale with MLflow: ...   
828  Clinical Biostatistics and Epidemiology Made R...   
829  AI and Machine Learning for Coders: A Programm...   

                                         author  price  \
0                           [ Dr Dhaval Maheta]    6.6   
1                                           NaN   36.3   
2            [ Oz du Soleil,  and , Bill Jelen]   35.2   

### 15. Convert All String Values in a Column to Lowercase

In [46]:
df['author'] = df['author'].str.lower()
print("\nDataFrame after converting 'author' to lowercase:")
print(df[['author']].head())


DataFrame after converting 'author' to lowercase:
                               author
0                 [ dr dhaval maheta]
1                                 NaN
2  [ oz du soleil,  and , bill jelen]
3                 [ william mckinney]
4                   [ paul mcfedries]


### 16. Remove Leading and Trailing Spaces from String Values in a Column

In [47]:
df['title'] = df['title'].str.strip()
print("\nDataFrame after removing leading and trailing spaces from 'title':")
print(df[['title']].head())


DataFrame after removing leading and trailing spaces from 'title':
                                               title
0  Data Analysis Using R (Low Priced Edition): A ...
1  Head First Data Analysis: A learner's guide to...
2  Guerrilla Data Analysis Using Microsoft Excel:...
3  Python for Data Analysis: Data Wrangling with ...
4  Excel Data Analysis For Dummies (For Dummies (...


### 17. Replace a Specific Substring in a Column with Another Substring

In [48]:
df['title'] = df['title'].str.replace('book', 'novel')
print("\nDataFrame after replacing 'book' with 'novel' in 'title':")
print(df[['title']].head())


DataFrame after replacing 'book' with 'novel' in 'title':
                                               title
0  Data Analysis Using R (Low Priced Edition): A ...
1  Head First Data Analysis: A learner's guide to...
2  Guerrilla Data Analysis Using Microsoft Excel:...
3  Python for Data Analysis: Data Wrangling with ...
4  Excel Data Analysis For Dummies (For Dummies (...


### 18. Extract a Substring from Each Value in a Column

In [49]:
df['title_short'] = df['title'].str[:10]
print("\nDataFrame after extracting first 10 characters of 'title':")
print(df[['title', 'title_short']].head())


DataFrame after extracting first 10 characters of 'title':
                                               title title_short
0  Data Analysis Using R (Low Priced Edition): A ...  Data Analy
1  Head First Data Analysis: A learner's guide to...  Head First
2  Guerrilla Data Analysis Using Microsoft Excel:...  Guerrilla 
3  Python for Data Analysis: Data Wrangling with ...  Python for
4  Excel Data Analysis For Dummies (For Dummies (...  Excel Data


### 19. Convert a Categorical Column to Numerical Using One-Hot Encoding

In [50]:
df_one_hot_encoded = pd.get_dummies(df, columns=['language'])
print("\nDataFrame after one-hot encoding 'language':")
print(df_one_hot_encoded.head())


DataFrame after one-hot encoding 'language':
                                               title  \
0  Data Analysis Using R (Low Priced Edition): A ...   
1  Head First Data Analysis: A learner's guide to...   
2  Guerrilla Data Analysis Using Microsoft Excel:...   
3  Python for Data Analysis: Data Wrangling with ...   
4  Excel Data Analysis For Dummies (For Dummies (...   

                               author  price price (including used books)  \
0                 [ dr dhaval maheta]    6.6                         6.75   
1                                 NaN   36.3               21.49 - 33.72    
2  [ oz du soleil,  and , bill jelen]   35.2                        32.07   
3                 [ william mckinney]   58.3                        53.99   
4                   [ paul mcfedries]   26.4                        24.49   

  pages  avg_reviews n_reviews star5 star4 star3  ... language_English  \
0   500          4.4        23   55%   39%    6%  ...             True   
1   48

### 20. Convert a Categorical Column to Numerical Using Label Encoding

In [51]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['publisher_encoded'] = label_encoder.fit_transform(df['publisher'])
print("\nDataFrame after label encoding 'publisher':")
print(df[['publisher', 'publisher_encoded']].head())


DataFrame after label encoding 'publisher':
                                           publisher  publisher_encoded
0     Notion Press Media Pvt Ltd (November 22, 2021)                344
1      O'Reilly Media; 1st edition (August 18, 2009)                350
2  Holy Macro! Books; Third edition (August 1, 2022)                160
3    O'Reilly Media; 2nd edition (November 14, 2017)                408
4        For Dummies; 5th edition (February 3, 2022)                150


### 21. Group Values in a Categorical Column and Create a New Column with Grouped Categories

In [52]:
df.dropna(subset=['publisher'], inplace=True)  # Drops rows with NaN in 'publisher'
publisher_counts = df['publisher'].value_counts()
df['publisher_grouped'] = df['publisher'].apply(lambda x: 'Other' if publisher_counts[x] < 5 else x)
print(df[['publisher', 'publisher_grouped']].head())

                                           publisher publisher_grouped
0     Notion Press Media Pvt Ltd (November 22, 2021)             Other
1      O'Reilly Media; 1st edition (August 18, 2009)             Other
2  Holy Macro! Books; Third edition (August 1, 2022)             Other
3    O'Reilly Media; 2nd edition (November 14, 2017)             Other
4        For Dummies; 5th edition (February 3, 2022)             Other


### 22. Merge Two DataFrames Based on a Common Column

In [53]:
additional_data = {
    'ISBN_13': ['978-3-16-148410-0', '978-1-86197-876-9'],
    'Genre': ['Fiction', 'Non-Fiction']
}
df_additional = pd.DataFrame(additional_data)

df_merged = pd.merge(df, df_additional, on='ISBN_13', how='left')
print("\nDataFrame after merging with additional data:")
print(df_merged.head())


DataFrame after merging with additional data:
                                               title  \
0  Data Analysis Using R (Low Priced Edition): A ...   
1  Head First Data Analysis: A learner's guide to...   
2  Guerrilla Data Analysis Using Microsoft Excel:...   
3  Python for Data Analysis: Data Wrangling with ...   
4  Excel Data Analysis For Dummies (For Dummies (...   

                               author  price price (including used books)  \
0                 [ dr dhaval maheta]    6.6                         6.75   
1                                 NaN   36.3               21.49 - 33.72    
2  [ oz du soleil,  and , bill jelen]   35.2                        32.07   
3                 [ william mckinney]   58.3                        53.99   
4                   [ paul mcfedries]   26.4                        24.49   

  pages  avg_reviews n_reviews star5 star4 star3  ...  \
0   500          4.4        23   55%   39%    6%  ...   
1   484          4.3       124   61%   

### 23. Concatenate Two DataFrames Vertically

In [54]:
additional_rows = pd.DataFrame({
    'title': ['New Book 1', 'New Book 2'],
    'author': ['Author A', 'Author B'],
    'price': [20.0, 30.0],
    'price (including used books)': [15.0, 25.0],
    'pages': [300, 400],
    'avg_reviews': [4.5, 4.0],
    'n_reviews': [100, 150],
    'star5': [50, 60],
    'star4': [30, 50],
    'star3': [10, 20],
    'star2': [5, 10],
    'star1': [5, 10],
    'dimensions': ['8 x 5 x 1', '9 x 6 x 1.5'],
    'weight': [1.0, 1.5],
    'language': ['English', 'English'],
    'publisher': ['Publisher A', 'Publisher B'],
    'ISBN_13': ['123-4-567-89012-3', '123-4-567-89012-4'],
    'link': ['link1', 'link2'],
    'complete_link': ['complete_link1', 'complete_link2']
})

df_concatenated_vertical = pd.concat([df, additional_rows], axis=0, ignore_index=True)
print("\nDataFrame after vertical concatenation:")
print(df_concatenated_vertical.tail())


DataFrame after vertical concatenation:
                                                 title  \
711   Deep Learning: Engage the World Change the World   
712  Machine Learning in Finance: From Theory to Pr...   
713  Practical Deep Learning at Scale with MLflow: ...   
714                                         New Book 1   
715                                         New Book 2   

                                         author  price  \
711     [ michael fullan, joanne quinn, et al.]    5.5   
712  [ matthew f. dixon, igor halperin, et al.]   60.5   
713       [ yong liu,  and , dr. matei zaharia]   48.4   
714                                    Author A   20.0   
715                                    Author B   30.0   

    price (including used books) pages  avg_reviews n_reviews star5 star4  \
711                8.55 - 35.33    208          4.5        74   72%   14%   
712               52.41 - 55.18    573          4.5        93   78%    7%   
713                        44.

### 24. Concatenate Two DataFrames Horizontally

In [55]:
additional_columns = pd.DataFrame({
    'new_col1': ['value1', 'value2', 'value3', 'value4', 'value5'],
    'new_col2': [1, 2, 3, 4, 5]
})

df_concatenated_horizontal = pd.concat([df, additional_columns], axis=1)
print("\nDataFrame after horizontal concatenation:")
print(df_concatenated_horizontal.head())


DataFrame after horizontal concatenation:
                                               title  \
0  Data Analysis Using R (Low Priced Edition): A ...   
1  Head First Data Analysis: A learner's guide to...   
2  Guerrilla Data Analysis Using Microsoft Excel:...   
3  Python for Data Analysis: Data Wrangling with ...   
4  Excel Data Analysis For Dummies (For Dummies (...   

                               author  price price (including used books)  \
0                 [ dr dhaval maheta]    6.6                         6.75   
1                                 NaN   36.3               21.49 - 33.72    
2  [ oz du soleil,  and , bill jelen]   35.2                        32.07   
3                 [ william mckinney]   58.3                        53.99   
4                   [ paul mcfedries]   26.4                        24.49   

  pages  avg_reviews n_reviews star5 star4 star3  ...         ISBN_13  \
0   500          4.4        23   55%   39%    6%  ...  978-1685549596   
1   484    

### 25. Create a New Column Based on Existing Columns

In [56]:
df = df[pd.to_numeric(df['pages'], errors='coerce').notnull()]  # Drops rows with non-numeric pages
df['price_per_page'] = df['price'] / df['pages'].astype(float)
print("\nDataFrame after creating 'price_per_page' column:")
print(df[['price', 'pages', 'price_per_page']].head())


DataFrame after creating 'price_per_page' column:
   price pages  price_per_page
0    6.6   500        0.013200
1   36.3   484        0.075000
2   35.2   274        0.128467
3   58.3   547        0.106581
4   26.4   368        0.071739


### 26. Discretize a Continuous Column into Bins

In [57]:
df['price_bins'] = pd.cut(df['price'], bins=[0, 20, 40, 60, 80, 100], labels=['0-20', '20-40', '40-60', '60-80', '80-100'])
print("\nDataFrame after discretizing 'price' column into bins:")
print(df[['price', 'price_bins']].head())


DataFrame after discretizing 'price' column into bins:
   price price_bins
0    6.6       0-20
1   36.3      20-40
2   35.2      20-40
3   58.3      40-60
4   26.4      20-40


### 27. Create Polynomial Features from Existing Numerical Columns

In [58]:
df['price_squared'] = df['price'] ** 2
df['avg_reviews_squared'] = df['avg_reviews'] ** 2
print("\nDataFrame after creating polynomial features:")
print(df[['price', 'price_squared', 'avg_reviews', 'avg_reviews_squared']].head())


DataFrame after creating polynomial features:
   price  price_squared  avg_reviews  avg_reviews_squared
0    6.6          43.56          4.4                19.36
1   36.3        1317.69          4.3                18.49
2   35.2        1239.04          4.7                22.09
3   58.3        3398.89          4.6                21.16
4   26.4         696.96          3.9                15.21
