In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load avocado_cleaned.csv (path is relative to the notebook's working directory).
# No parse_dates argument is passed, so the Date column is kept as strings.
df = pd.read_csv("avocado_cleaned.csv")
# Preview the first five rows to confirm the file loaded as expected.
df.head()


Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,2015-12-27,1.33,11.070344,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015.0,Albany
1,2015-12-20,1.35,10.912867,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015.0,Albany
2,2015-12-13,0.93,11.680313,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015.0,Albany
3,2015-12-06,1.08,11.277116,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015.0,Albany
4,2015-11-29,1.28,10.840377,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015.0,Albany


In [6]:
# Inspect column dtypes before encoding: per the output below, Date, type and
# region are object columns and every other column (including year) is float64.
df.dtypes


Date             object
AveragePrice    float64
Total Volume    float64
4046            float64
4225            float64
4770            float64
Total Bags      float64
Small Bags      float64
Large Bags      float64
XLarge Bags     float64
type             object
year            float64
region           object
dtype: object

In [7]:
# Label encoding: turn the `type` strings into integer codes in a new
# `type_label` column. LabelEncoder numbers its classes in sorted order, so
# "conventional" becomes 0 (see the preview below).
# The fitted encoder stays available as `le`, so codes can be mapped back to
# the original strings via le.classes_ / le.inverse_transform.
le = LabelEncoder()
df["type_label"] = le.fit_transform(df["type"])

# Show the original and encoded columns side by side as a sanity check.
df[["type", "type_label"]].head()


Unnamed: 0,type,type_label
0,conventional,0
1,conventional,0
2,conventional,0
3,conventional,0
4,conventional,0


In [10]:
# One-hot encoding of `type`.
# drop_first=True drops the first category's indicator, leaving the single
# `type_organic` column. dtype=int pins the indicator to 0/1 integers:
# pandas >= 2.0 otherwise emits booleans, which would be written to CSV as
# True/False instead of the 0/1 shown in this notebook's recorded output.
# NOTE(review): df_onehot is a snapshot of df as it exists when this cell runs.
# Columns that later cells add to df (region_ordinal, region_frequency,
# region_target) are NOT part of it after a fresh Restart & Run All; the
# recorded output that lists region_ordinal comes from out-of-order execution.
df_onehot = pd.get_dummies(df, columns=["type"], drop_first=True, dtype=int)
df_onehot.head()


Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year,region,type_label,region_ordinal,type_organic
0,2015-12-27,1.33,11.070344,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,2015.0,Albany,0,0,0
1,2015-12-20,1.35,10.912867,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,2015.0,Albany,0,0,0
2,2015-12-13,0.93,11.680313,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,2015.0,Albany,0,0,0
3,2015-12-06,1.08,11.277116,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,2015.0,Albany,0,0,0
4,2015-11-29,1.28,10.840377,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,2015.0,Albany,0,0,0


In [11]:
# Ordinal encoding
# Every distinct region receives an integer code in order of first appearance
# in df; the codes are identifiers only and do not rank the regions.
region_categories = df["region"].unique()
region_map = dict(zip(region_categories, range(len(region_categories))))

df["region_ordinal"] = df["region"].map(region_map)

# Preview the region names next to their assigned codes.
df.loc[:, ["region", "region_ordinal"]].head()


Unnamed: 0,region,region_ordinal
0,Albany,0
1,Albany,0
2,Albany,0
3,Albany,0
4,Albany,0


In [12]:
# Frequency encoding
# value_counts() gives the number of rows per region; mapping that Series back
# onto the column replaces each region name with its row count
# (e.g. Albany -> 338 in the preview below).
freq_map = df["region"].value_counts()
df["region_frequency"] = df["region"].map(freq_map)

# Preview the region names next to their row counts.
df[["region", "region_frequency"]].head()


Unnamed: 0,region,region_frequency
0,Albany,338
1,Albany,338
2,Albany,338
3,Albany,338
4,Albany,338


In [13]:
# Target encoding
# Replace each region with the mean AveragePrice of that region's rows.
# NOTE(review): the means are computed over every row in df, so each row's own
# AveragePrice contributes to its encoded value. If AveragePrice is the
# prediction target and this data is later split into train/test, that leaks
# target information -- confirm, and if so fit the means on the training split only.
target_mean = df.groupby("region")["AveragePrice"].mean()
df["region_target"] = df["region"].map(target_mean)

# Preview the region names next to their mean price.
df[["region", "region_target"]].head()


Unnamed: 0,region,region_target
0,Albany,1.561036
1,Albany,1.561036
2,Albany,1.561036
3,Albany,1.561036
4,Albany,1.561036


In [14]:
# Persist both encoded frames as CSV, omitting the index column.
# The dict keeps the original write order: full encoded frame first, then the
# one-hot variant.
outputs = {
    "avocado_encoded.csv": df,
    "avocado_onehot.csv": df_onehot,
}
for csv_path, frame in outputs.items():
    frame.to_csv(csv_path, index=False)

print("Encoded files saved")


Encoded files saved
