In [3]:


# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.
# Label Encoding vs One-Hot Encoding on Titanic Dataset
# This script demonstrates the difference between Label Encoding and One-Hot Encoding
# using the 'Sex' feature from the Titanic dataset

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Pull the Titanic passenger table from seaborn's bundled example datasets
titanic = sns.load_dataset('titanic')

# Preview the top of the table
print("First 5 rows of the Titanic dataset:", titanic.head(), sep="\n")

# Which categories does the 'sex' column take, and how often does each occur?
print("\nUnique values in 'sex' column:", titanic['sex'].unique(), sep="\n")
print("\nValue counts for 'sex':", titanic['sex'].value_counts(), sep="\n")

# Encoded columns are added to this working copy; `titanic` itself stays untouched
df = titanic.copy()

# LABEL ENCODING
print("\n--- LABEL ENCODING ---")

# Learn the categories of 'sex', then replace each value with its integer code
label_encoder = LabelEncoder()
label_encoder.fit(df['sex'])
df['sex_label_encoded'] = label_encoder.transform(df['sex'])

# Preview the original column next to its integer codes
print("\nFirst 5 rows with label encoding:")
print(df.loc[:, ['sex', 'sex_label_encoded']].head())

# classes_[k] is the category that received code k
print("\nLabel Encoding mapping:")
for code in range(len(label_encoder.classes_)):
    print(f"{label_encoder.classes_[code]} -> {code}")

# ONE-HOT ENCODING
print("\n--- ONE-HOT ENCODING ---")

# Method 1: pandas get_dummies -> one indicator column per category, named 'sex_<category>'
df_dummies = pd.get_dummies(df['sex'], prefix='sex')

# Attach the indicator columns next to the existing ones
df = pd.concat((df, df_dummies), axis='columns')

# Preview the original column together with its indicator columns
print("\nFirst 5 rows with one-hot encoding (using pandas get_dummies):")
print(df.loc[:, ['sex', 'sex_female', 'sex_male']].head())

# Method 2: scikit-learn OneHotEncoder, configured to return a dense array
print("\nOne-Hot Encoding using scikit-learn:")
onehot_encoder = OneHotEncoder(sparse_output=False)

# The encoder works on 2-D input, so turn the column into an (n_rows, 1) array
sex_reshaped = df['sex'].to_numpy().reshape(-1, 1)

# Learn the categories, then encode the same data
sex_onehot = onehot_encoder.fit(sex_reshaped).transform(sex_reshaped)

# Wrap the encoded array in a frame that shares df's index and uses 'sex_<category>' names
sex_onehot_df = pd.DataFrame(
    data=sex_onehot,
    index=df.index,
    columns=["sex_" + str(name) for name in onehot_encoder.categories_[0]],
)

# Preview the original column next to the scikit-learn encoding
print("\nFirst 5 rows with one-hot encoding (using scikit-learn):")
print(
    pd.concat(
        [df['sex'].reset_index(drop=True), sex_onehot_df.reset_index(drop=True)],
        axis=1,
    ).head()
)

# VISUALIZE THE DIFFERENCES
# Fix the bar order explicitly instead of relabelling the ticks afterwards.
# seaborn orders a string column by first appearance, and 'male' appears first
# in this dataset, so overwriting the tick labels with ['female', 'male'] would
# attach the wrong label to each bar of the label-encoding plot.
sex_order = ['female', 'male']

fig, (ax_label, ax_onehot) = plt.subplots(1, 2, figsize=(14, 6))

# Plot 1: Label Encoding -- bars are coloured by the integer code of each category
sns.countplot(x='sex', hue='sex_label_encoded', data=df, order=sex_order, ax=ax_label)
ax_label.set_title('Label Encoding of Sex Feature')
ax_label.set_xlabel('Original Sex Category')
ax_label.set_ylabel('Count')

# Plot 2: One-Hot Encoding -- bars are coloured by the indicator column that is set
# Reshape to long form: one row per (passenger, indicator column) pair
temp_df = df.melt(id_vars=['sex'], value_vars=['sex_female', 'sex_male'],
                  var_name='one_hot_category', value_name='is_category')
temp_df = temp_df[temp_df['is_category'] == 1]  # Only keep rows where the category is present
sns.countplot(x='sex', hue='one_hot_category', data=temp_df, order=sex_order, ax=ax_onehot)
ax_onehot.set_title('One-Hot Encoding of Sex Feature')
ax_onehot.set_xlabel('Original Sex Category')
ax_onehot.set_ylabel('Count')

# Write the figure to disk, then release it (it is not displayed inline)
fig.tight_layout()
fig.savefig('encoding_comparison.png')
plt.close(fig)

# COMPARISON SUMMARY
# Each print call emits one group of bullet points, one string per output line
print(
    "\n--- COMPARISON SUMMARY ---",
    "Label Encoding:",
    "- Transforms categorical values into numerical values",
    "- For 'sex' column: female -> 0, male -> 1",
    "- Maintains a single column",
    "- Introduces ordinal relationship (which may not be appropriate for nominal data)",
    "- Memory efficient",
    sep="\n",
)

print(
    "\nOne-Hot Encoding:",
    "- Creates a new binary column for each category",
    "- For 'sex' column: creates 'sex_female' and 'sex_male' columns",
    "- Expands to multiple columns (one per category)",
    "- Avoids ordinal relationship, better for nominal data",
    "- Less memory efficient, but more appropriate for machine learning algorithms",
    sep="\n",
)

print(
    "\nWhen to use each:",
    "- Label Encoding: Good for ordinal data (e.g., 'low', 'medium', 'high')",
    "- One-Hot Encoding: Better for nominal data with no inherent order (e.g., 'sex', 'country')",
    "- For binary features like 'sex', both work similarly from a mathematical perspective",
    "  but one-hot encoding is generally preferred for consistency and clarity",
    sep="\n",
)

# First ten passengers: the raw value next to every encoding, under readable headers
comparison_df = (
    df[['sex', 'sex_label_encoded', 'sex_female', 'sex_male']]
    .head(10)
    .rename(columns={
        'sex': 'Original',
        'sex_label_encoded': 'Label Encoded',
        'sex_female': 'One-Hot (Female)',
        'sex_male': 'One-Hot (Male)',
    })
)

print("\nSide-by-side comparison of encodings:")
print(comparison_df)



# --- Question 6: Combining Feature Scaling Techniques ---
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
print("\n--- Question 6: Combining Feature Scaling Techniques ---")
# Two small numeric features whose value ranges differ by an order of magnitude
data_scaling = pd.DataFrame(dict(Feature_A=[10, 20, 30, 40, 50],
                                 Feature_B=[100, 50, 200, 150, 250]))
print("Original Data:\n", data_scaling)

# Min-Max Scaling: fit the scaler on the frame, then rescale that same frame
scaler_minmax = MinMaxScaler().fit(data_scaling)
scaled_minmax = scaler_minmax.transform(data_scaling)
scaled_minmax_df = pd.DataFrame(data=scaled_minmax, columns=data_scaling.columns)
print("\nMin-Max Scaled Data:\n", scaled_minmax_df)
print("Explanation: Min-Max Scaling scales data to a fixed range (typically 0 to 1). It's useful when the range of the data is important or for algorithms sensitive to feature magnitude.")

# Standardization (Z-score Scaling): same fit-then-transform flow with StandardScaler
scaler_standard = StandardScaler().fit(data_scaling)
scaled_standard = scaler_standard.transform(data_scaling)
scaled_standard_df = pd.DataFrame(data=scaled_standard, columns=data_scaling.columns)
print("\nStandardized Data:\n", scaled_standard_df)
print("Explanation: Standardization scales data to have a mean of 0 and a standard deviation of 1. It's less affected by outliers and useful for algorithms that assume data is normally distributed.")

# --- Question 7: Handling Multiple Categorical Features ---
print("\n--- Question 7: Handling Multiple Categorical Features ---")
# Read the Titanic CSV from the working directory (adjust the path as needed).
# If the file is missing, report it and continue with a three-row stand-in frame.
try:
    titanic_data = pd.read_csv('titanic.csv')
except FileNotFoundError:
    print("Error: titanic.csv not found. Please provide the correct path.")
    titanic_data = pd.DataFrame({'Sex': ['male', 'female', 'male'], 'Embarked': ['S', 'C', 'S']})  # Sample data

if not titanic_data.empty:
    categorical_features = ['Sex', 'Embarked']
    # drop='first' leaves out one indicator column per feature to avoid multicollinearity
    encoder_onehot = OneHotEncoder(drop='first', sparse_output=False)
    encoder_onehot.fit(titanic_data[categorical_features])
    encoded_features = encoder_onehot.transform(titanic_data[categorical_features])
    # Name the output columns after the feature/category pairs they encode
    encoded_df = pd.DataFrame(
        data=encoded_features,
        columns=encoder_onehot.get_feature_names_out(categorical_features),
    )
    print("Original Categorical Features:\n", titanic_data[categorical_features].head())
    print("\nOne-Hot Encoded Features:\n", encoded_df.head())


print("\n--- Question 8: Ordinal Encoding for Ranked Categories ---")
if not titanic_data.empty and 'Pclass' in titanic_data.columns:
    # Explicit rank order, lowest to highest: '3rd' -> 0, '2nd' -> 1, '1st' -> 2
    ordinal_encoder = OrdinalEncoder(categories=[['3rd', '2nd', '1st']])
    # The category list above is made of strings. If Pclass is stored as the
    # integers 1/2/3 (NOTE(review): presumably the case for the usual Titanic
    # CSV -- confirm against the file), fitting on the raw column fails because
    # no value matches a category, and scikit-learn rejects an unsorted numeric
    # category list such as [3, 2, 1]. So translate integer codes to the string
    # labels first; values that are already labels pass through unchanged.
    pclass_to_label = {1: '1st', 2: '2nd', 3: '3rd'}
    pclass_labels = titanic_data['Pclass'].map(lambda value: pclass_to_label.get(value, value))
    # fit_transform returns an (n_rows, 1) array; flatten it to fill a single column
    titanic_data['Pclass_Encoded'] = ordinal_encoder.fit_transform(pclass_labels.to_frame()).ravel()
    print("Original Pclass:\n", titanic_data['Pclass'].head())
    print("\nOrdinal Encoded Pclass:\n", titanic_data['Pclass_Encoded'].head())
else:
    print("Pclass column not found in the Titanic dataset (or dataset is empty).")

# --- Question 9: Impact of Scaling on Different Algorithms ---
print("\n--- Question 9: Impact of Scaling on Different Algorithms ---")
# Synthetic two-feature classification problem, split 70/30 into train and test
from sklearn.datasets import make_classification
X, y = make_classification(
    n_samples=100,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Decision tree trained on the raw features
dt_unscaled = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt_unscaled = dt_unscaled.predict(X_test)
accuracy_dt_unscaled = accuracy_score(y_true=y_test, y_pred=y_pred_dt_unscaled)
print(f"\nDecision Tree Accuracy (Unscaled): {accuracy_dt_unscaled:.4f}")

# The scaler is fitted on the training split only and then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Decision tree trained on the standardized features
dt_scaled = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_dt_scaled = dt_scaled.predict(X_test_scaled)
accuracy_dt_scaled = accuracy_score(y_true=y_test, y_pred=y_pred_dt_scaled)
print(f"Decision Tree Accuracy (Standard Scaled): {accuracy_dt_scaled:.4f}")
print("Explanation: Decision Trees make splits based on feature values, so the scale of the features generally doesn't significantly impact their performance.")

# SVM trained on the raw features
svm_unscaled = SVC(random_state=42).fit(X_train, y_train)
y_pred_svm_unscaled = svm_unscaled.predict(X_test)
accuracy_svm_unscaled = accuracy_score(y_true=y_test, y_pred=y_pred_svm_unscaled)
print(f"\nSVM Accuracy (Unscaled): {accuracy_svm_unscaled:.4f}")

# SVM trained on the standardized features
svm_scaled = SVC(random_state=42).fit(X_train_scaled, y_train)
y_pred_svm_scaled = svm_scaled.predict(X_test_scaled)
accuracy_svm_scaled = accuracy_score(y_true=y_test, y_pred=y_pred_svm_scaled)
print(f"SVM Accuracy (Standard Scaled): {accuracy_svm_scaled:.4f}")
print("Explanation: SVMs rely on distances between data points. Features with larger values can disproportionately influence the result. Scaling helps to ensure all features contribute more equally.")


print("\n--- Question 10: Custom Transformations for Categorical Features ---")
# Toy categorical column with many distinct labels
high_cardinality_data = pd.Series([
    'A', 'B', 'C', 'A', 'A', 'D', 'E', 'A', 'B', 'F', 'G', 'A', 'B', 'H',
    'I', 'A', 'B', 'J', 'K', 'A', 'L', 'M', 'A', 'N', 'O', 'A', 'P', 'Q',
    'A', 'R', 'S', 'A', 'T', 'U', 'A', 'V', 'W', 'A', 'X', 'Y', 'A', 'Z',
])
# Labels occurring fewer than `rare_threshold` times count as rare
rare_threshold = 5
value_counts = high_cardinality_data.value_counts()
rare_categories = value_counts.index[(value_counts < rare_threshold).to_numpy()]

def custom_encoding(series, rare_threshold=5, rare_name='Other'):
    """Collapse infrequent labels of a categorical series into one placeholder.

    Args:
        series: pandas Series of category labels.
        rare_threshold: labels occurring fewer than this many times are
            treated as rare.
        rare_name: replacement label written in place of every rare label.

    Returns:
        A new Series in which rare labels are replaced by ``rare_name`` and
        all other values are left as they are.
    """
    frequencies = series.value_counts()
    infrequent = frequencies.index[(frequencies < rare_threshold).to_numpy()]

    def _collapse(label):
        # Only labels found among the infrequent ones are replaced
        if label in infrequent:
            return rare_name
        return label

    return series.apply(_collapse)

# Replace every rare label in the toy column with the default 'Other' placeholder
encoded_high_cardinality = custom_encoding(
    high_cardinality_data,
    rare_threshold=rare_threshold,
)
# Show the data before and after grouping, plus which labels were considered rare
print(
    "Original High Cardinality Data (first 20):\n",
    high_cardinality_data.head(20),
)
print(f"\nCategories with count less than {rare_threshold}: {list(rare_categories)}")
print(
    "\nCustom Encoded High Cardinality Data (first 20):\n",
    encoded_high_cardinality.head(20),
)
print("\nValue Counts after Custom Encoding:\n", encoded_high_cardinality.value_counts())
print("Explanation: This custom transformation groups infrequent categories into a single 'Other' category. This can help reduce the dimensionality introduced by one-hot encoding for high cardinality features and potentially improve model performance by focusing on more frequent patterns.")




     

First 5 rows of the Titanic dataset:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Unique values in 'sex' column:
['male' 'female']

Value counts for 'sex':
sex
male      577
female    314
Name: count, dtype: int64

--- LABEL ENCODING ---

First 5 rows with label 