# Data Transformation and Mapping 
- apply: allows us to apply a function to each element, row, or column ij a DataFrame or Series.
- map: is used to map values in a Series according to a dictionary or another series.
- replace: allows for replacing specific values in the DataFrame with new value.
- astype: used to convert datatype.
- pipe: it allows chaining and using complex functions that operate on the entire DataFrame. This can be used for complex transformation operation.

apply

In [1]:
# convert marks to scale of 1
import pandas as pd
students = pd.read_csv('data/student.csv')
students['mark_percent'] = students['mark'].apply(lambda m:m /100)
students.head()

Unnamed: 0,id,name,class,mark,gender,mark_percent
0,1,John Deo,Four,75,female,0.75
1,2,Max Ruin,Three,85,male,0.85
2,3,Arnold,Three,55,male,0.55
3,4,Krish Star,Four,60,female,0.6
4,5,John Mike,Four,60,female,0.6


map

In [2]:
# Replace
students['grade'] = students['class'].map({'One':1, 'Two':2, 'Three':3, 'Four':4, 'Five': 5, 'Six':6, 'Seven':7, 'Eight':8, 'Nine':9, 'Ten':10})

students['gender_code'] = students['gender'].map({'male':1, 'female':2})
students.head()

Unnamed: 0,id,name,class,mark,gender,mark_percent,grade,gender_code
0,1,John Deo,Four,75,female,0.75,4.0,2
1,2,Max Ruin,Three,85,male,0.85,3.0,1
2,3,Arnold,Three,55,male,0.55,3.0,1
3,4,Krish Star,Four,60,female,0.6,4.0,2
4,5,John Mike,Four,60,female,0.6,4.0,2


replace

In [4]:
# Replace specific values in the 'class' column
students['class'] = students['class'].replace({'Four': '4th', 'Ten': '10th', 'Eight': '8th'})
# replace name 'John Deo' with 'JaiRam' (replace specific value)
students['name'] = students['name'].replace({'John Deo':'JaiRam'})
students.head()

Unnamed: 0,id,name,class,mark,gender,mark_percent,grade,gender_code
0,1,JaiRam,4th,75,female,0.75,4.0,2
1,2,Max Ruin,Three,85,male,0.85,3.0,1
2,3,Arnold,Three,55,male,0.55,3.0,1
3,4,Krish Star,4th,60,female,0.6,4.0,2
4,5,John Mike,4th,60,female,0.6,4.0,2


astype

In [9]:
# Convert 'mark' column to float
students['mark'] = students['mark'].astype(float)
students.head()

Unnamed: 0,id,name,class,mark,gender,mark_percent,grade,gender_code
0,1,JaiRam,4th,75.0,female,0.75,4.0,2
1,2,Max Ruin,Three,85.0,male,0.85,3.0,1
2,3,Arnold,Three,55.0,male,0.55,3.0,1
3,4,Krish Star,4th,60.0,female,0.6,4.0,2
4,5,John Mike,4th,60.0,female,0.6,4.0,2


pipe

In [14]:
# create function that triples marks and pipes it to the pandas DataFrame

def triple_marks(df):
    df['mark'] = df['mark'] * 3
    return df

def check_result(df):
    df['result'] = df['mark'].apply(lambda m :'Pass' if m>40 else 'Fail')
    return df

students = students.pipe(check_result)
students.head()

Unnamed: 0,id,name,class,mark,gender,mark_percent,grade,gender_code,result
0,1,JaiRam,4th,675.0,female,0.75,4.0,2,Pass
1,2,Max Ruin,Three,765.0,male,0.85,3.0,1,Pass
2,3,Arnold,Three,495.0,male,0.55,3.0,1,Pass
3,4,Krish Star,4th,540.0,female,0.6,4.0,2,Pass
4,5,John Mike,4th,540.0,female,0.6,4.0,2,Pass


# Data Cleaning and Manipulation
- Handle Missing Data
- Handle Duplicates
- One-Hot-Encoding
- Normalization

# Handle Missing Data
- Removes Rows containing missing values ``(df.dropna())``
    - ``dropna(axis=0)`` removes rows containing at least one missing value
    - ``(dropna(axis=1))`` removes columns containing at least one missing value
- Replace Missing Values ``(df.fillna())``
    - Replace with specific value: ``(df.fillna(value))`` fill the missing values with the previous row value of column value (if axis=1)
    - Backward Fill: ``(df.fillna(value,method='ffill'))`` replaces the missing values with the next value
    - Fill individual column with specified value: ``(df.fillna({'height':150,'weight:60}``) replaces the missing values of column height with 150 and column weight with 60
    - Fill with average, Interpolation etc.: Fill the missing value using average, or using interpolation

In [None]:
from IPython.display import display
# axis, 0=row, 1=column

students = pd.read_csv('data/student.csv')
display(students.head())

#students.dropna(axis=0,how='any,inplace=True) # (axis=0 default) drop rows withNaN

students = students.fillna(1) # replace missing values with 1
display(students.head())

students.fillna(method='ffill') #fill with previous row value
students.fillna(method='bfill') #fill with next row value
students['mark'].fillna(50) # only fill empty value of Name column
students.fillna({'mark':50,'gender':'Male'}) # replace marks with 50 if its empty and gender with male
df_interpolated = students.interpolate()
students.fillna(df_interpolated)
students['mark'].fillna(students['mark'].mean())


Unnamed: 0,id,name,class,mark,gender
0,1,John Deo,Four,75,female
1,2,Max Ruin,Three,85,male
2,3,Arnold,Three,55,male
3,4,Krish Star,Four,60,female
4,5,John Mike,Four,60,female


Unnamed: 0,id,name,class,mark,gender
0,1,John Deo,Four,75,female
1,2,Max Ruin,Three,85,male
2,3,Arnold,Three,55,male
3,4,Krish Star,Four,60,female
4,5,John Mike,Four,60,female


  students.fillna(method='ffill') #fill with previous row value
  students.fillna(method='bfill') #fill with next row value
  df_interpolated = students.interpolate()


0     75
1     85
2     55
3     60
4     60
5     55
6     78
7     85
8     78
9     55
10    89
11    94
12    88
13    88
14    88
15    88
16    54
17    75
18    18
19    65
20    69
21    55
22    79
23    78
24    88
25    79
26    81
27    86
28    55
29    79
30    88
31    90
32    96
33    69
34    88
Name: mark, dtype: int64

# Handle Duplicate Data
- Check duplicates
- Drop duplicates

In [20]:
duplicates = students.duplicated() # check duplicate
students.drop_duplicates() #drop duplicate
duplicates

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
dtype: bool

# One-Hot-Encoding
- One Hot Encoding is a method for converting categorical variables into a binary format. It creates new binary columns (0s and 1s) for each category in the original variable. Each category in the original column is represented as a seperate column, where a value of 1 indicates the presence that category, and 0 indicates its absence. For example we replace the country name with numerical value eg. America=1, Nepal= 2, India=3

In [23]:
students = pd.read_csv('data/student.csv')
df_pandas_encoded = pd.get_dummies(students, columns=['class'])
print(f"One-Hot Encoded Data using Pandas:\n")
display(df_pandas_encoded)

One-Hot Encoded Data using Pandas:



Unnamed: 0,id,name,mark,gender,class_Eight,class_Fifth,class_Five,class_Four,class_Nine,class_Seven,class_Six,class_Three
0,1,John Deo,75,female,False,False,False,True,False,False,False,False
1,2,Max Ruin,85,male,False,False,False,False,False,False,False,True
2,3,Arnold,55,male,False,False,False,False,False,False,False,True
3,4,Krish Star,60,female,False,False,False,True,False,False,False,False
4,5,John Mike,60,female,False,False,False,True,False,False,False,False
5,6,Alex John,55,male,False,False,False,True,False,False,False,False
6,7,My John Rob,78,male,False,True,False,False,False,False,False,False
7,8,Asruid,85,male,False,False,True,False,False,False,False,False
8,9,Tes Qry,78,male,False,False,False,False,False,False,True,False
9,10,Big John,55,female,False,False,False,True,False,False,False,False


# Data Normalization
- Data normalization involves adjusting measurement values of different scales to a common scale. Normalization is only applicable to numerical columns.