In [None]:
# Q1. What is the difference between Ordinal Encoding and Label Encoding? Provide an example of when you
# might choose one over the other.
Ans.
Ordinal Encoding:
1. Ordinal encoding is used for categorical variables that have a natural order or ranking among their categories.
2. It assigns integer values to categories based on their order or rank, where the numerical values have meaning in
terms of the relative ordering of the categories.
3. The distances between the integer values may not be meaningful or consistent.

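Label Encoding:
1. Label encoding is used for nominal categorical variables, where there is no inherent order or ranking among the
categories (in scikit-learn, LabelEncoder is intended primarily for encoding the target variable).
2. It assigns a unique integer value to each category without considering any order; scikit-learn assigns the integers
according to the sorted (alphabetical) order of the category names.
3. The distances between the integer values do not have any meaningful interpretation.

Example of choosing one over the other: use ordinal encoding for a ranked feature such as education level
(High School < Bachelor's < Master's < PhD), and label encoding for an unordered feature such as colour
(Red, Yellow, Orange), where the integers are only identifiers.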

In [13]:
# Example of Ordinal Encoding
import pandas as pd
# Build a one-column frame of education levels (each level appears twice).
education_levels = [
    "High School", "Bachelor's Degrees", "Master's Degrees", "PHD",
    "Bachelor's Degrees", "Master's Degrees", "PHD", "High School",
]
df = pd.DataFrame({'Education': education_levels})
df.head()

Unnamed: 0,Education
0,High School
1,Bachelor's Degrees
2,Master's Degrees
3,PHD
4,Bachelor's Degrees


In [14]:
from sklearn.preprocessing import OrdinalEncoder

In [19]:
# Category order from lowest to highest; the encoder assigns codes by position in this list.
education_order = ["High School", "Bachelor's Degrees", "Master's Degrees", "PHD"]
encode = OrdinalEncoder(categories=[education_order])

In [24]:
# Fit the ordinal encoder on the Education column (selected with double brackets so a
# 2-D frame is passed) and store the resulting codes in a new column.
df['Education_Rank'] = encode.fit_transform(df[['Education']])

In [26]:
df

Unnamed: 0,Education,Education_Rank
0,High School,0.0
1,Bachelor's Degrees,1.0
2,Master's Degrees,2.0
3,PHD,3.0
4,Bachelor's Degrees,1.0
5,Master's Degrees,2.0
6,PHD,3.0
7,High School,0.0


In [38]:
# Example of Label
import pandas as pd

# Small frame with an unordered (nominal) colour column for the label-encoding demo.
colors = ['Red', 'Yellow', 'Red', 'Orange', 'Yellow']
df = pd.DataFrame({'Color': colors})
df

Unnamed: 0,Color
0,Red
1,Yellow
2,Red
3,Orange
4,Yellow


In [39]:
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [40]:
# Encoder used below to turn the Color values into integer labels.
encode_label = LabelEncoder()

In [41]:
# LabelEncoder expects a 1-D array-like, so pass the Series (single brackets) rather than a
# one-column DataFrame; the 2-D form makes scikit-learn emit a DataConversionWarning.
df['Color_Label'] = encode_label.fit_transform(df['Color'])

In [42]:
df

Unnamed: 0,Color,Color_Label
0,Red,1
1,Yellow,2
2,Red,1
3,Orange,0
4,Yellow,2


In [None]:
# Q2. Explain how Target Guided Ordinal Encoding works and provide an example of when you might use it in
# a machine learning project.
Ans.
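Target Guided Ordinal Encoding (TGOE) is a technique for encoding categorical variables for machine learning models. It is
particularly useful when a categorical feature has no natural order of its own (or has many categories) but is clearly
related to the target variable.
It works by computing the target's average value for each category, ranking the categories by that average, and assigning
each category an integer according to its rank; a simpler variant, used in the example below, maps each category directly
to its mean target value. For example, a product's material (wood, plastic, metal) can be encoded by its average price
when the project is to predict price.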

In [59]:
import pandas as pd

# Toy data: a categorical product material and the numeric target (price).
product_names = ['wood', 'metal', 'plastic', 'plastic', 'wood']
prices = [200, 400, 350, 270, 300]
df = pd.DataFrame({'Product_Name': product_names, 'Price': prices})
df.head()

Unnamed: 0,Product_Name,Price
0,wood,200
1,metal,400
2,plastic,350
3,plastic,270
4,wood,300


In [60]:
# Mean price per product name, converted to a plain {name: mean} dictionary.
price_by_product = df.groupby('Product_Name')['Price']
mean_price = price_by_product.mean().to_dict()
mean_price

{'metal': 400.0, 'plastic': 310.0, 'wood': 250.0}

In [61]:
# Replace each product name by the mean price computed for it (lookup in mean_price).
df['Product_Encoded'] = df['Product_Name'].map(mean_price)

In [62]:
df

Unnamed: 0,Product_Name,Price,Product_Encoded
0,wood,200,250.0
1,metal,400,400.0
2,plastic,350,310.0
3,plastic,270,310.0
4,wood,300,250.0


In [None]:
# Q3. Define covariance and explain why it is important in statistical analysis. How is covariance calculated?
Ans.
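Covariance tells us how much two variables change together. If the covariance is positive, then the variables tend to
move in the same direction. If the covariance is negative, then the variables tend to move in opposite directions.
Covariance is important in statistical analysis because it can be used to identify relationships between variables.
The sample covariance of X and Y is calculated as
$\mathrm{cov}(X, Y) = \frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})$,
where $\bar{x}$ and $\bar{y}$ are the sample means and $n$ is the number of observations.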

In [70]:
import numpy as np
import seaborn as sns
# Load seaborn's "tips" example dataset and preview the first rows.
df = sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [73]:
# Covariance is only defined for numeric columns. The tips frame also holds categorical
# columns (sex, smoker, day, time), and recent pandas versions no longer drop them silently
# in cov(), so select the numeric columns explicitly before computing the matrix.
df.select_dtypes(include='number').cov()

Unnamed: 0,total_bill,tip,size
total_bill,79.252939,8.323502,5.065983
tip,8.323502,1.914455,0.643906
size,5.065983,0.643906,0.904591


In [19]:
# Q4. For a dataset with the following categorical variables: Color (red, green, blue), Size (small, medium,
# large), and Material (wood, metal, plastic), perform label encoding using Python's scikit-learn library.
# Show your code and explain the output.
import pandas as pd

# Three categorical columns to be label-encoded: Color, Size and Material.
categorical_data = {
    'Color': ['red', 'green', 'blue', 'red', 'blue'],
    'Size': ['small', 'medium', 'large', 'medium', 'small'],
    'Material': ['wood', 'metal', 'plastic', 'wood', 'metal'],
}
df = pd.DataFrame(categorical_data)

In [20]:
df.head()

Unnamed: 0,Color,Size,Material
0,red,small,wood
1,green,medium,metal
2,blue,large,plastic
3,red,medium,wood
4,blue,small,metal


In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
# Single encoder instance; it is refit for each column in the next cell.
encode_lab = LabelEncoder()

In [23]:
# Encode each categorical column in turn with the shared encoder. The encoder is refit on
# every column, so afterwards it only remembers the classes of the last one (Material).
for column in ('Color', 'Size', 'Material'):
    df[column + '_encoded'] = encode_lab.fit_transform(df[column])

In [24]:
df

Unnamed: 0,Color,Size,Material,Color_encoded,Size_encoded,Material_encoded
0,red,small,wood,2,2,2
1,green,medium,metal,1,1,0
2,blue,large,plastic,0,0,1
3,red,medium,wood,2,1,2
4,blue,small,metal,0,2,0


In [35]:
# Q5. Calculate the covariance matrix for the following variables in a dataset: Age, Income, and Education
# level. Interpret the results.

import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Numeric sample data for the covariance matrix: age, income and education level.
ages = [20, 27, 38, 34, 25]
incomes = [10000, 20000, 50000, 40000, 80000]
education_levels = [4, 5, 8, 10, 12]
df = pd.DataFrame({'Age': ages,
                   'Income': incomes,
                   'Education_level': education_levels})

In [36]:
df

Unnamed: 0,Age,Income,Education_level
0,20,10000,4
1,27,20000,5
2,38,50000,8
3,34,40000,10
4,25,80000,12


In [37]:
# Sample covariance matrix of Age, Income and Education_level (diagonal = variances).
# Interpretation of the output below: every off-diagonal entry is positive (Age-Income 60000,
# Age-Education 8.95, Income-Education 85000), so in this sample each pair of variables tends
# to increase together. The magnitudes depend on the units of the variables (Income is in the
# tens of thousands), so they cannot be compared across pairs to judge strength.
df.cov()

Unnamed: 0,Age,Income,Education_level
Age,51.7,60000.0,8.95
Income,60000.0,750000000.0,85000.0
Education_level,8.95,85000.0,11.2


In [1]:
# Q6. You are working on a machine learning project with a dataset containing several categorical
# variables, including "Gender" (Male/Female), "Education Level" (High School/Bachelor's/Master's/PhD),
# and "Employment Status" (Unemployed/Part-Time/Full-Time). Which encoding method would you use for
# each variable, and why?

import pandas as pd
# Sample records with one binary, one ordered and one multi-class categorical column.
genders = ['Male', 'Female', 'Male', 'Female', 'Female', 'Male']
education = ['High School', 'Bachelors', 'Masters', 'PhD', 'Masters', 'Bachelors']
employment = ['Unemployed', 'Part-Time', 'Full-Time', 'Full-Time', 'Part-Time', 'Unemployed']
df = pd.DataFrame({'Gender': genders,
                   'Education_level': education,
                   'Employment_Status': employment})

In [2]:
df

Unnamed: 0,Gender,Education_level,Employment_Status
0,Male,High School,Unemployed
1,Female,Bachelors,Part-Time
2,Male,Masters,Full-Time
3,Female,PhD,Full-Time
4,Female,Masters,Part-Time
5,Male,Bachelors,Unemployed


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
encoded_label = LabelEncoder()
encoded_ordinal = OrdinalEncoder(categories=[['High School','Bachelors','Masters','PHD']])
encoded_one = OneHotEncoder()

In [9]:
df['Encoded_Gender'] = encoded_label.fit_transform(df['Gender'])
df['Encoded_Eduaction_level'] = encoded_label.fit_transform(df['Education_level'])
df_encoded = encoded_one.fit_transform(df[['Employment_Status']])

In [18]:
binary_encoded = pd.DataFrame(df_encoded.toarray(),columns=encoded_one.get_feature_names_out())

In [21]:
pd.concat([df,encoded_int],axis=1)

Unnamed: 0,Gender,Education_level,Employment_Status,Encoded_Gender,Encoded_Eduaction_level,Employment_Status_Full-Time,Employment_Status_Part-Time,Employment_Status_Unemployed
0,Male,High School,Unemployed,1,1,0,0,1
1,Female,Bachelors,Part-Time,0,0,0,1,0
2,Male,Masters,Full-Time,1,2,1,0,0
3,Female,PhD,Full-Time,0,3,1,0,0
4,Female,Masters,Part-Time,0,2,0,1,0
5,Male,Bachelors,Unemployed,1,0,0,0,1


In [29]:
# Q7. You are analyzing a dataset with two continuous variables, "Temperature" and "Humidity", and two
# categorical variables, "Weather Condition" (Sunny/Cloudy/Rainy) and "Wind Direction" (North/South/
# East/West). Calculate the covariance between each pair of variables and interpret the results.

import pandas as pd

# Weather sample: two continuous columns and two categorical columns.
weather_data = {
    'Temperature': [20, 30, 35, 40, 50, 45],
    'Humidity': [11.5, 18.2, 27.3, 30.7, 66.4, 40.9],
    'Weather_Condition': ['Cloudy', 'Sunny', 'Rainy', 'Sunny', 'Cloudy', 'Sunny'],
    'Wind_Direction': ['North', 'South', 'East', 'West', 'North', 'South'],
}
df = pd.DataFrame(weather_data)


In [30]:
df

Unnamed: 0,Temperature,Humidity,Weather_Condition,Wind_Direction
0,20,11.5,Cloudy,North
1,30,18.2,Sunny,South
2,35,27.3,Rainy,East
3,40,30.7,Sunny,West
4,50,66.4,Cloudy,North
5,45,40.9,Sunny,South


In [32]:
from sklearn.preprocessing import OneHotEncoder

In [33]:
# One-hot encoder for the two categorical weather columns.
encoded =  OneHotEncoder()

In [35]:
# Fit on both categorical columns at once; the result is converted with .toarray() in the next cell.
encoded_sparse = encoded.fit_transform(df[['Weather_Condition','Wind_Direction']])

In [37]:
# Turn the encoder output into a dense, labelled frame of indicator columns.
indicator_values = encoded_sparse.toarray()
indicator_names = encoded.get_feature_names_out()
encoded_binary = pd.DataFrame(indicator_values, columns=indicator_names)

In [38]:
# Remove the original categorical columns now that they are one-hot encoded.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError for already-dropped columns.
df = df.drop(columns=['Weather_Condition','Wind_Direction'], errors='ignore')

In [41]:
# Place the numeric columns and the indicator columns side by side (column-wise concat).
combine_df = pd.concat([df,encoded_binary],axis=1)

In [42]:
combine_df 

Unnamed: 0,Temperature,Humidity,Weather_Condition_Cloudy,Weather_Condition_Rainy,Weather_Condition_Sunny,Wind_Direction_East,Wind_Direction_North,Wind_Direction_South,Wind_Direction_West
0,20,11.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,30,18.2,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,35,27.3,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,40,30.7,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,50,66.4,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,45,40.9,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [43]:
# Covariance between every pair of columns (continuous variables and one-hot indicators).
# Interpretation of the output below:
# - Temperature and Humidity have a positive covariance (194.0): they rise together in this sample.
# - For an indicator column the sign shows whether the continuous variable is above (+) or below (-)
#   its overall mean when that category is present, e.g. Humidity vs Weather_Condition_Cloudy is
#   +2.58 and Temperature vs Weather_Condition_Sunny is +1.0.
# - Weather_Condition_Cloudy/Wind_Direction_North and Weather_Condition_Rainy/Wind_Direction_East
#   have identical rows because those categories occur in exactly the same records.
# - Magnitudes depend on each variable's scale, and with only six rows they are indicative at best.
combine_df.cov()

Unnamed: 0,Temperature,Humidity,Weather_Condition_Cloudy,Weather_Condition_Rainy,Weather_Condition_Sunny,Wind_Direction_East,Wind_Direction_North,Wind_Direction_South,Wind_Direction_West
Temperature,116.666667,194.0,-0.666667,-0.333333,1.0,-0.333333,-0.666667,0.333333,0.666667
Humidity,194.0,379.108,2.58,-1.04,-1.54,-1.04,2.58,-1.18,-0.36
Weather_Condition_Cloudy,-0.666667,2.58,0.266667,-0.066667,-0.2,-0.066667,0.266667,-0.133333,-0.066667
Weather_Condition_Rainy,-0.333333,-1.04,-0.066667,0.166667,-0.1,0.166667,-0.066667,-0.066667,-0.033333
Weather_Condition_Sunny,1.0,-1.54,-0.2,-0.1,0.3,-0.1,-0.2,0.2,0.1
Wind_Direction_East,-0.333333,-1.04,-0.066667,0.166667,-0.1,0.166667,-0.066667,-0.066667,-0.033333
Wind_Direction_North,-0.666667,2.58,0.266667,-0.066667,-0.2,-0.066667,0.266667,-0.133333,-0.066667
Wind_Direction_South,0.333333,-1.18,-0.133333,-0.066667,0.2,-0.066667,-0.133333,0.266667,-0.066667
Wind_Direction_West,0.666667,-0.36,-0.066667,-0.033333,0.1,-0.033333,-0.066667,-0.066667,0.166667
