In [1]:
# Q1. What is the difference between Ordinal Encoding and Label Encoding? Provide an example of when you
# might choose one over the other.

In [2]:
# Label Encoding assigns unique integers to each category without assuming any order. It's used when there is no natural
# order among the categories.

# Ordinal Encoding assigns integers based on the natural order of categories. It's used when the categories have a meaningful 
# sequence (e.g., education lev

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
colors = ['red', 'blue', 'green']
label_encoder = LabelEncoder()
encoded_colors = label_encoder.fit_transform(colors)

In [5]:
encoded_colors

array([2, 0, 1])

In [6]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

In [7]:
education = ['high school', 'bachelor', 'master', 'PhD']
education_levels = pd.DataFrame({'education': education})
ordinal_encoder = OrdinalEncoder(categories=[['high school', 'bachelor', 'master', 'PhD']])
encoded_education = ordinal_encoder.fit_transform(education_levels)

In [8]:
encoded_education

array([[0.],
       [1.],
       [2.],
       [3.]])

In [9]:
# When to Use:
# Label Encoding: Use for unordered categories (e.g., colors).
# Ordinal Encoding: Use for ordered categories (e.g., education level).

In [10]:
# Q2. Explain how Target Guided Ordinal Encoding works and provide an example of when you might use it in
# a machine learning project.

In [11]:
data = {'city': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
        'house_price': [300, 200, 250, 320, 210, 240, 310, 220, 260]}
df = pd.DataFrame(data)

In [16]:
city_means = df.groupby('city')['house_price'].mean()

In [17]:
city_ordinal = city_means.rank().astype(int).to_dict()

In [18]:
df['city_encoded'] = df['city'].map(city_ordinal)

In [19]:
df

Unnamed: 0,city,house_price,city_encoded
0,A,300,3
1,B,200,1
2,C,250,2
3,A,320,3
4,B,210,1
5,C,240,2
6,A,310,3
7,B,220,1
8,C,260,2


In [20]:
# Target Guided Ordinal Encoding assigns ranks to categories based on their average value of the target variable.

# Use this method when the categorical feature's relationship with the target is important for your model.

In [26]:
# Q3. Define covariance and explain why it is important in statistical analysis. How is covariance calculated?

In [27]:
# Covariance:
# Covariance measures the degree to which two variables change together. If the variables tend to increase or decrease 
# simultaneously, the covariance is positive; if one increases while the other decreases, the covariance is negative. A
# value close to zero indicates no linear relationship.

In [28]:
# Importance:
    
# Covariance helps in understanding the relationship between two variables.
# It's used in statistics and data analysis to identify how changes in one variable might predict changes in another.
# Covariance is the basis for other statistical measures, such as correlation and Principal Component Analysis (PCA).

In [29]:
# Calculation:
# Covariance between two variables X and Y is calculated as:
#      cov(X,Y)=n−11∑i=1n(Xi−Xˉ)(Yi−Yˉ)
#     n is the number of data point
#     Xi adn Yi are individual data points
#     X bar and y bar the means X and Y respectively
    

In [30]:
# Q4. For a dataset with the following categorical variables: Color (red, green, blue), Size (small, medium,
# large), and Material (wood, metal, plastic), perform label encoding using Python's scikit-learn library.
# Show your code and explain the output.

In [31]:
from sklearn.preprocessing import LabelEncoder

In [32]:
data = {
    'Color': ['red', 'green', 'blue', 'green', 'red', 'blue'],
    'Size': ['small', 'medium', 'large', 'small', 'large', 'medium'],
    'Material': ['wood', 'metal', 'plastic', 'wood', 'metal', 'plastic']
}

In [33]:
df = pd.DataFrame(data)

In [34]:
label_encoder = LabelEncoder()

In [35]:
df['Color_encoded'] = label_encoder.fit_transform(df['Color'])
df['Size_encoded'] = label_encoder.fit_transform(df['Size'])
df['Material_encoded'] = label_encoder.fit_transform(df['Material'])

In [36]:
df

Unnamed: 0,Color,Size,Material,Color_encoded,Size_encoded,Material_encoded
0,red,small,wood,2,2,2
1,green,medium,metal,1,1,0
2,blue,large,plastic,0,0,1
3,green,small,wood,1,2,2
4,red,large,metal,2,0,0
5,blue,medium,plastic,0,1,1


In [37]:
# Label Encoding transforms categorical data into numerical labels, which can be used as input for machine learning models.
# However, be cautious when using Label Encoding with categorical variables that do not have an ordinal relationship, as this
# may introduce unintended order to the model.

In [44]:
# Q5. Calculate the covariance matrix for the following variables in a dataset: Age, Income, and Education
# level. Interpret the results.

In [38]:
import numpy as np

In [39]:
data = {
    'Age': [25, 32, 47, 51, 62],
    'Income': [50000, 60000, 80000, 85000, 90000],
    'Education_Level': [12, 14, 16, 16, 18]
}

In [40]:
df = pd.DataFrame(data)

In [41]:
cov_matrix = df.cov()

In [42]:
cov_matrix

Unnamed: 0,Age,Income,Education_Level
Age,221.3,251000.0,33.4
Income,251000.0,295000000.0,38000.0
Education_Level,33.4,38000.0,5.2


In [45]:
# The covariance matrix helps to understand the relationships between different variables in your dataset. Positive values
# indicate that the variables tend to increase together, while negative values indicate that one variable tends to decrease
# when the other increases. The magnitude of the covariance gives an idea of the strength of the relationship.

In [46]:
# Q6. You are working on a machine learning project with a dataset containing several categorical
# variables, including "Gender" (Male/Female), "Education Level" (High School/Bachelor's/Master's/PhD),
# and "Employment Status" (Unemployed/Part-Time/Full-Time). Which encoding method would you use for
# each variable, and why?

In [47]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

In [48]:
data = {
    'Gender': ['Male', 'Female', 'Female', 'Male'],
    'Education_Level': ['High School', 'Bachelor\'s', 'Master\'s', 'PhD'],
    'Employment_Status': ['Unemployed', 'Part-Time', 'Full-Time', 'Unemployed']
}

In [49]:
df = pd.DataFrame(data)

In [50]:
label_encoder = LabelEncoder()
df['Gender_encoded'] = label_encoder.fit_transform(df['Gender'])

In [52]:
education_order = ['High School', 'Bachelor\'s', 'Master\'s', 'PhD']
ordinal_encoder = OrdinalEncoder(categories=[education_order])
df['Education_Level_encoded'] = ordinal_encoder.fit_transform(df[['Education_Level']])

In [54]:
onehot_encoder = OneHotEncoder(sparse=False)
employment_encoded = onehot_encoder.fit_transform(df[['Employment_Status']])
employment_encoded_df = pd.DataFrame(employment_encoded, columns=onehot_encoder.get_feature_names_out(['Employment_Status']))
df = pd.concat([df, employment_encoded_df], axis=1)




In [55]:
df

Unnamed: 0,Gender,Education_Level,Employment_Status,Gender_encoded,Education_Level_encoded,Employment_Status_Full-Time,Employment_Status_Part-Time,Employment_Status_Unemployed,Employment_Status_Full-Time.1,Employment_Status_Part-Time.1,Employment_Status_Unemployed.1
0,Male,High School,Unemployed,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,Female,Bachelor's,Part-Time,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,Female,Master's,Full-Time,0,2.0,1.0,0.0,0.0,1.0,0.0,0.0
3,Male,PhD,Unemployed,1,3.0,0.0,0.0,1.0,0.0,0.0,1.0


In [56]:
# Gender (Male/Female): Use Label Encoding or One-Hot Encoding because it's a binary categorical variable. Label Encoding
# is simpler, but One-Hot Encoding avoids implying any order.

# Education Level (High School/Bachelor's/Master's/PhD): Use Ordinal Encoding because the categories have a natural order 
# (e.g., High School < Bachelor's < Master's < PhD).

# Employment Status (Unemployed/Part-Time/Full-Time): Use One-Hot Encoding because there is no inherent order among the 
# categories, and it allows the model to treat them as distinct and separate.

In [57]:
# Q7. You are analyzing a dataset with two continuous variables, "Temperature" and "Humidity", and two
# categorical variables, "Weather Condition" (Sunny/Cloudy/Rainy) and "Wind Direction" (North/South/
# East/West). Calculate the covariance between each pair of variables and interpret the results.

In [58]:
from sklearn.preprocessing import OneHotEncoder

In [59]:
data = {
    'Temperature': [30, 22, 25, 28, 32],
    'Humidity': [80, 70, 75, 85, 90],
    'Weather_Condition': ['Sunny', 'Cloudy', 'Rainy', 'Sunny', 'Cloudy'],
    'Wind_Direction': ['North', 'South', 'East', 'West', 'North']
}

In [62]:
df = pd.DataFrame(data)

In [63]:
encoder = OneHotEncoder(sparse=False)
encoded_weather = encoder.fit_transform(df[['Weather_Condition']])
encoded_wind = encoder.fit_transform(df[['Wind_Direction']])



In [65]:
encoded_df = pd.concat([df[['Temperature', 'Humidity']], 
pd.DataFrame(encoded_weather, columns=encoder.get_feature_names_out(['Weather_Condition'])),
pd.DataFrame(encoded_wind, columns=encoder.get_feature_names_out(['Wind_Direction']))], axis=1)

ValueError: input_features is not equal to feature_names_in_