In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Example cleaned data
data = {
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Other'],
    'State': ['Lagos', 'Lagos', 'Ogun', 'Lagos', 'Unknown']
}
df = pd.DataFrame(data)

print("Original:")
print(df)


# Label Encoding (turns text to numbers)
le =LabelEncoder()
df["Gender_encoded"] = le.fit_transform(df["Gender"])
print("\nLabel Encoded:")
print(df)

# One-Hot Encoding (creates separate columns)
df_encoded = pd.get_dummies(df, columns=["State"])
print("\nOne-Hot Encoded")
print(df_encoded)


Original:
   Gender    State
0    Male    Lagos
1  Female    Lagos
2  Female     Ogun
3    Male    Lagos
4   Other  Unknown

Label Encoded:
   Gender    State  Gender_encoded
0    Male    Lagos               1
1  Female    Lagos               0
2  Female     Ogun               0
3    Male    Lagos               1
4   Other  Unknown               2

One-Hot Encoded
   Gender  Gender_encoded  State_Lagos  State_Ogun  State_Unknown
0    Male               1         True       False          False
1  Female               0         True       False          False
2  Female               0        False        True          False
3    Male               1         True       False          False
4   Other               2        False       False           True


**Mini Data Cleaning & Encoding Challenge**

In [None]:
import pandas as pd

# Create the DataFrame
data = {
    'Name': ['Aisha', 'Tunde', 'Amaka', 'Musa', 'Grace', 'Tunde'],
    'Age': [27, None, 33, 40, 27, None],
    'Gender': ['Female', 'M', 'Female', 'Male', 'Female', 'Male'],
    'State': ['LAG', 'Lagos', 'OGUN', 'lagos', None, 'Lagos'],
    'Satisfaction': [5, 4, None, 3, 5, 4],
    'Comment': ['Good', 'Nice', 'Satisfactory', 'Fair', 'Excellent', 'Nice']
}

df = pd.DataFrame(data)

# Inspect your data
print(df.head())

# Inspect the data
df.info()

# Print the dataset
df.head()

# Clean the dataset
df["Age"].fillna(df['Age'].median(), inplace=True)
df["Satisfaction"].fillna(df['Satisfaction'].median(), inplace=True)

df.head()

# Standardizing the text/ filling missing state
df["State"].replace({"LAG":"Lagos", "lagos":"Lagos", "OGUN":"Ogun", "None":"Unknown"}, inplace=True)
df["Gender"].replace({"M":"Male"}, inplace=True)

# remove duplicate rows
df.drop_duplicates()

# Encode the GEnder and State using one-hot encoding 
df_encoded = pd.get_dummies(df,columns=["State"])
df_encoded2 = pd.get_dummies(df,columns=["Gender"])
print(df_encoded)

#

    Name   Age  Gender  State  Satisfaction       Comment
0  Aisha  27.0  Female    LAG           5.0          Good
1  Tunde   NaN       M  Lagos           4.0          Nice
2  Amaka  33.0  Female   OGUN           NaN  Satisfactory
3   Musa  40.0    Male  lagos           3.0          Fair
4  Grace  27.0  Female   None           5.0     Excellent
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          6 non-null      object 
 1   Age           4 non-null      float64
 2   Gender        6 non-null      object 
 3   State         5 non-null      object 
 4   Satisfaction  5 non-null      float64
 5   Comment       6 non-null      object 
dtypes: float64(2), object(4)
memory usage: 420.0+ bytes
    Name   Age  Gender  Satisfaction       Comment  State_Lagos  State_Ogun
0  Aisha  27.0  Female           5.0          Good         True       False

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Satisfaction"].fillna(df['Satisfaction'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w