In [1]:
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)  # Adjust the width as needed
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler , StandardScaler

In [2]:
# Load the raw dataset; the relative path is resolved against the kernel's working directory.
# NOTE(review): the preview further down shows 'Unnamed: 0.1' / 'Unnamed: 0' columns, which look
# like row indices written out by earlier to_csv calls - confirm whether they should be dropped
# before modelling.
data = pd.read_csv('dataset.csv')

In [3]:
# Rename the 'Preference' column to 'Output' (presumably the prediction target - confirm);
# every other column keeps its name.
data = data.rename(columns={'Preference': 'Output'})

## Removing Duplicates

In [4]:
# Boolean Series: True for every row that exactly repeats an earlier row
# (the first occurrence of each row stays False).
duplicates = data.duplicated()
# Print the number of duplicate rows
print(f"Number of duplicate rows: {duplicates.sum()}")

Number of duplicate rows: 0


There are no duplicate rows, so nothing is removed here. If duplicates appear in a future version of the dataset, they can be dropped with the commented-out code below.

In [5]:
# Intentionally not executed: the check above reported 0 duplicate rows.
# Uncomment the next line if duplicates ever appear.
# data = data.drop_duplicates()

In [6]:
# Preview the first five rows of the raw data, before any scaling or encoding.
data.head(5)

Unnamed: 0.1,Unnamed: 0,Age,Gender,Income,Education_Level,Travel_Frequency,Preferred_Activities,Vacation_Budget,Location,Proximity_to_Mountains,Proximity_to_Beaches,Favorite_Season,Pets,Environmental_Concerns,Output
0,45292,23,female,41104,bachelor,6,hiking,677,suburban,190,223,summer,0,0,1
1,5209,47,male,25444,master,0,hiking,3977,suburban,237,293,spring,0,0,1
2,1011,43,non-binary,54369,master,6,swimming,1318,rural,94,71,fall,0,0,0
3,8657,44,female,55839,bachelor,7,sunbathing,964,rural,97,281,spring,0,1,0
4,2824,19,male,118735,high school,2,swimming,1990,urban,250,261,fall,0,1,0


In [7]:
def scaling(data):
    """Return a rescaled copy of ``data``; the frame passed in is not modified.

    ``Age``, ``Income`` and ``Vacation_Budget`` are min-max scaled
    (``MinMaxScaler`` with its default 0-1 range).
    ``Proximity_to_Mountains``, ``Proximity_to_Beaches`` and
    ``Travel_Frequency`` are standardised with ``StandardScaler``
    (zero mean, unit variance). All other columns pass through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the six numeric columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``data``.

    Raises
    ------
    KeyError
        If any of the six columns is missing from ``data``.
    """
    columns_to_normalize_using_minmax = ['Age', 'Income', 'Vacation_Budget']
    columns_to_normalize_using_z_score = ['Proximity_to_Mountains', 'Proximity_to_Beaches', 'Travel_Frequency']

    # Work on a copy so the caller's frame is never mutated as a side effect.
    scaled = data.copy()

    # NOTE(review): both scalers are fitted on every row passed in and then
    # discarded. If the data is split into train/test sets later, fit on the
    # training rows only and keep the fitted scalers to transform the rest.
    scaled[columns_to_normalize_using_minmax] = MinMaxScaler().fit_transform(
        data[columns_to_normalize_using_minmax]
    )
    scaled[columns_to_normalize_using_z_score] = StandardScaler().fit_transform(
        data[columns_to_normalize_using_z_score]
    )
    return scaled

data = scaling(data)

In [8]:
# Inspect the first rows after scaling: the six numeric feature columns should now hold floats.
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Income,Education_Level,Travel_Frequency,Preferred_Activities,Vacation_Budget,Location,Proximity_to_Mountains,Proximity_to_Beaches,Favorite_Season,Pets,Environmental_Concerns,Output
0,45292,0.098039,female,0.211043,bachelor,0.527842,hiking,0.039342,suburban,0.460128,0.841332,summer,0,0,1
1,5209,0.568627,male,0.054433,master,-1.552459,hiking,0.772838,suburban,1.002048,1.649704,spring,0,0,1
2,1011,0.490196,non-binary,0.343701,master,0.527842,swimming,0.181818,rural,-0.646771,-0.913988,fall,0,0,0
3,8657,0.509804,female,0.358402,bachelor,0.874559,sunbathing,0.103134,rural,-0.612181,1.511126,spring,0,1,0
4,2824,0.019608,male,0.987399,high school,-0.859026,swimming,0.331185,urban,1.151941,1.280163,fall,0,1,0


# Encoding

###  Label Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
def le(data):
    """Return a copy of ``data`` with three categorical columns integer-coded.

    ``Gender``, ``Preferred_Activities`` and ``Location`` are each replaced by
    integer codes ``0 .. n_classes - 1``. ``LabelEncoder`` assigns the codes in
    the sorted order of the column's distinct values (for example
    ``female -> 0``, ``male -> 1``, ``non-binary -> 2``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the three columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If any of the three columns is missing from ``data``.
    """
    categorical_columns = ['Gender', 'Preferred_Activities', 'Location']

    # NOTE(review): these columns look nominal. Integer codes give them an
    # order that linear or distance-based models will treat as meaningful;
    # one-hot encoding may be more appropriate - confirm against the model used.
    encoded = data.copy()
    for column in categorical_columns:
        # A fresh encoder per column: reusing one instance would leave only the
        # last column's mapping in ``classes_``. The local is also no longer
        # called ``le``, which used to shadow this function's own name.
        encoded[column] = LabelEncoder().fit_transform(data[column])
    return encoded
data = le(data)
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Income,Education_Level,Travel_Frequency,Preferred_Activities,Vacation_Budget,Location,Proximity_to_Mountains,Proximity_to_Beaches,Favorite_Season,Pets,Environmental_Concerns,Output
0,45292,0.098039,0,0.211043,bachelor,0.527842,0,0.039342,1,0.460128,0.841332,summer,0,0,1
1,5209,0.568627,1,0.054433,master,-1.552459,0,0.772838,1,1.002048,1.649704,spring,0,0,1
2,1011,0.490196,2,0.343701,master,0.527842,3,0.181818,0,-0.646771,-0.913988,fall,0,0,0
3,8657,0.509804,0,0.358402,bachelor,0.874559,2,0.103134,0,-0.612181,1.511126,spring,0,1,0
4,2824,0.019608,1,0.987399,high school,-0.859026,3,0.331185,2,1.151941,1.280163,fall,0,1,0


In [10]:
def oe(data):
    """Return a copy of ``data`` with ``Education_Level`` ordinally encoded.

    The levels are mapped in ascending order of attainment:
    ``high school -> 0.0``, ``bachelor -> 1.0``, ``master -> 2.0``,
    ``doctorate -> 3.0`` (``OrdinalEncoder`` emits floats by default).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains an ``Education_Level`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If ``Education_Level`` is missing from ``data``.
    ValueError
        If the column holds a value outside the four levels listed above
        (the encoder's default ``handle_unknown='error'``).
    """
    category_order = [
        ['high school', 'bachelor', 'master', 'doctorate'],
    ]
    encoder = OrdinalEncoder(categories=category_order)

    # Work on a copy so the caller's frame is never mutated as a side effect.
    encoded = data.copy()
    # fit_transform returns an (n_rows, 1) array; ravel() flattens it so the
    # single-column assignment is explicit rather than relying on pandas
    # accepting a 2-D value.
    encoded["Education_Level"] = encoder.fit_transform(data[["Education_Level"]]).ravel()
    return encoded
data = oe(data)

In [11]:
# Inspect the first rows after ordinal-encoding Education_Level.
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Income,Education_Level,Travel_Frequency,Preferred_Activities,Vacation_Budget,Location,Proximity_to_Mountains,Proximity_to_Beaches,Favorite_Season,Pets,Environmental_Concerns,Output
0,45292,0.098039,0,0.211043,1.0,0.527842,0,0.039342,1,0.460128,0.841332,summer,0,0,1
1,5209,0.568627,1,0.054433,2.0,-1.552459,0,0.772838,1,1.002048,1.649704,spring,0,0,1
2,1011,0.490196,2,0.343701,2.0,0.527842,3,0.181818,0,-0.646771,-0.913988,fall,0,0,0
3,8657,0.509804,0,0.358402,1.0,0.874559,2,0.103134,0,-0.612181,1.511126,spring,0,1,0
4,2824,0.019608,1,0.987399,0.0,-0.859026,3,0.331185,2,1.151941,1.280163,fall,0,1,0


In [12]:
def ohe(data):
    """Return a copy of ``data`` with ``Favorite_Season`` one-hot encoded.

    ``Favorite_Season`` is removed and one float (0.0 / 1.0) indicator column
    per distinct season is appended on the right. Each new column is named
    after the bare category value (e.g. ``fall``, ``spring``, ``summer``,
    ``winter``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``Favorite_Season`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``data``; the frame passed in is
        left unmodified.

    Raises
    ------
    KeyError
        If ``Favorite_Season`` is missing, e.g. when the function is applied a
        second time to its own output.
    """
    # sparse_output=False returns a dense ndarray instead of a SciPy sparse matrix.
    encoder = OneHotEncoder(sparse_output=False)

    encoded_data = encoder.fit_transform(data[["Favorite_Season"]])

    # Reuse the input's index: pd.concat(axis=1) aligns on index labels, so a
    # default RangeIndex here would misalign rows (and introduce NaNs) whenever
    # ``data`` has had rows filtered or dropped.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.categories_[0],
        index=data.index,
    )

    # drop() without inplace returns a new frame, so the caller's frame keeps
    # its Favorite_Season column.
    return pd.concat([data.drop(columns=["Favorite_Season"]), encoded_df], axis=1)
data = ohe(data)
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Income,Education_Level,Travel_Frequency,Preferred_Activities,Vacation_Budget,Location,Proximity_to_Mountains,Proximity_to_Beaches,Pets,Environmental_Concerns,Output,fall,spring,summer,winter
0,45292,0.098039,0,0.211043,1.0,0.527842,0,0.039342,1,0.460128,0.841332,0,0,1,0.0,0.0,1.0,0.0
1,5209,0.568627,1,0.054433,2.0,-1.552459,0,0.772838,1,1.002048,1.649704,0,0,1,0.0,1.0,0.0,0.0
2,1011,0.490196,2,0.343701,2.0,0.527842,3,0.181818,0,-0.646771,-0.913988,0,0,0,1.0,0.0,0.0,0.0
3,8657,0.509804,0,0.358402,1.0,0.874559,2,0.103134,0,-0.612181,1.511126,0,1,0,0.0,1.0,0.0,0.0
4,2824,0.019608,1,0.987399,0.0,-0.859026,3,0.331185,2,1.151941,1.280163,0,1,0,1.0,0.0,0.0,0.0
