In [76]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/new-york-city-airbnb-open-data/AB_NYC_2019.csv
/kaggle/input/new-york-city-airbnb-open-data/New_York_City_.png


### Load the dataset

In [84]:
df = pd.read_csv("/kaggle/input/new-york-city-airbnb-open-data/AB_NYC_2019.csv")

df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [86]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.dropna()

In [87]:
from sklearn.model_selection import train_test_split

X = df.drop(['price'], axis = 1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

### One Hot Encoding
### One Categorical Variable

In [88]:
#create a categorical based on a threshold we set
#we are using a lambda function for this. Lambdas allow us to write a function without defining it
#we are also using a ternary operator. This allows us to write a conditional statement in 1 line of code 
df['expensive'] = df['price'].apply(lambda x: 'expensive' if x > 175 else 'non-expensive')

In [89]:
df.groupby(['expensive']).mean()['price']

TypeError: agg function failed [how->mean,dtype->object]

In [30]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,expensive
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,non-expensive
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,expensive
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,non-expensive
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,non-expensive
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,expensive


In [33]:
dummies = pd.get_dummies(df["expensive"])
hot_encoded_single_df = pd.concat([df.drop(['expensive'], axis=1), dummies], axis=1)
dummies.head()

Unnamed: 0,expensive,non-expensive
0,False,True
1,True,False
3,False,True
4,False,True
5,True,False


In [34]:
X = hot_encoded_single_df.drop(['price'], axis = 1)
y = hot_encoded_single_df['price']

X_train_dummy, X_test_dummy, y_train_dummy, y_test_dummy = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [35]:
X_train_dummy.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,expensive,non-expensive
32728,25785298,Spacious room in Williamsburg (10min to Manhat...,889683,Rachel,Brooklyn,Williamsburg,40.71264,-73.94889,Private room,1,71,2019-06-21,5.37,2,51,False,True
26365,21005699,Vics cafe,45044964,Victor,Queens,Queens Village,40.72517,-73.76105,Shared room,1,11,2019-06-03,0.51,2,125,False,True
22249,17940198,"Cozy, queen sized bedroom in Bed-Stuy",24383863,Sophia,Brooklyn,Bedford-Stuyvesant,40.69471,-73.94406,Private room,2,24,2019-06-25,0.98,1,1,False,True
6170,4513084,Beautiful Sunny Private Penthouse Suite,23401472,Heather,Manhattan,Harlem,40.82624,-73.94527,Entire home/apt,30,1,2016-07-30,0.03,2,244,False,True
27267,21540496,Stylish 1 BD - 10 min to Manhattan & Central Park,156587568,Anna,Queens,Astoria,40.76129,-73.92009,Entire home/apt,1,56,2019-06-14,2.73,1,126,False,True


In [43]:
mult_hot_enc = pd.get_dummies(df['neighbourhood'])
hot_encoded_df = pd.concat([df.drop(['neighbourhood'], axis=1), mult_hot_enc], axis=1)
mult_cols = mult_hot_enc.columns
print(mult_cols)
hot_encoded_df.head()

Index(['Allerton', 'Arden Heights', 'Arrochar', 'Arverne', 'Astoria',
       'Bath Beach', 'Battery Park City', 'Bay Ridge', 'Bay Terrace',
       'Bay Terrace, Staten Island',
       ...
       'Westchester Square', 'Westerleigh', 'Whitestone', 'Williamsbridge',
       'Williamsburg', 'Willowbrook', 'Windsor Terrace', 'Woodhaven',
       'Woodlawn', 'Woodside'],
      dtype='object', length=218)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,...,Westchester Square,Westerleigh,Whitestone,Williamsbridge,Williamsburg,Willowbrook,Windsor Terrace,Woodhaven,Woodlawn,Woodside
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,40.64749,-73.97237,Private room,149,1,...,False,False,False,False,False,False,False,False,False,False
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,40.75362,-73.98377,Entire home/apt,225,1,...,False,False,False,False,False,False,False,False,False,False
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,...,False,False,False,False,False,False,False,False,False,False
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,...,False,False,False,False,False,False,False,False,False,False
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,40.74767,-73.975,Entire home/apt,200,3,...,False,False,False,False,False,False,False,False,False,False


In [38]:
X = hot_encoded_df.drop(['price'], axis=1)
y = hot_encoded_df['price']
X_train_me, X_test_me, y_train_me, y_test_me = train_test_split(X,y, test_size=0.33, random_state=42)

### Ordinal Encoding

There are a number of solutions to solving the cardinality problem. The first one is ordinal encoding. Ordinal encoding is a method of replacing categories with numbers. However, these numbers have an inherent ordering to them. Think, high school -> college -> grad_school as categorical variables with na inherent ordering. Luckily, sklearn has already written an implementation of ordinal encoding:

In [39]:
df['room_type'].unique()

array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

In [41]:
df_ord = df.copy()
ord_encoding = {
    'Entire home/apt':0,
    'Private room': 1,
    'Shared room': 2,
}

df_ord['room_type_ord_enc'] = df_ord['room_type'].map(ord_encoding)
df_ord.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,expensive,room_type_ord_enc
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,non-expensive,1
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,expensive,0
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,non-expensive,0
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,non-expensive,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,expensive,0


### Frequency Encoding
Unlike multiple one-hot encoding, there are other waus to reduce the cardinality of the dataset. One of such techniques which are implemented are:
- Frequency Encoding,
- Target Encoding
- Probability ration encoding
- Weight of Evidence Encoder
- Binning

In [45]:
frequency_encoder = X_train.groupby(['neighbourhood']).size()
frequency_encoder

neighbourhood
Allerton            25
Arden Heights        4
Arrochar            14
Arverne             43
Astoria            480
                  ... 
Willowbrook          1
Windsor Terrace     79
Woodhaven           44
Woodlawn             8
Woodside           119
Length: 215, dtype: int64

In [46]:
X_test['neighbourhood'] = X_test['neighbourhood'].map(frequency_encoder)
X_test['neighbourhood']

5576      580.0
7729      284.0
2020     2143.0
4195     2143.0
9758     2143.0
          ...  
43519     669.0
10297     669.0
36169     550.0
32098     995.0
37697    1489.0
Name: neighbourhood, Length: 12811, dtype: float64

The above logic can also be implmented as 

In [50]:
class FrequencyEncoder:
    def fit(self, train_df, column):
        self.train_df = train_df
        self.column = column
        self.frequency_encodings=None
        
    def _compute_frequency(self):
        self.frequency_encodings = self.train_df.groupby([self.column]).size()
    
    def transform(self, test_df, column, fillna=True):
        self._compute_frequency()
        col_name = column + "_freq"
        test_df[col_name] = test_df[column].map(self.frequency_encodings)
        if fillna is True:
            test_df[col_name] = test_df[col_name].fillna(0)
        return test_df
    
fe = FrequencyEncoder()
fe.fit(X_train, column="neighbourhood")
X_train_freq_enc = fe.transform(X_train, column="neighbourhood")
X_test_freq_enc = fe.transform(X_test, column="neighbourhood")
X_train.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_freq
32728,25785298,Spacious room in Williamsburg (10min to Manhat...,889683,Rachel,Brooklyn,Williamsburg,40.71264,-73.94889,Private room,1,71,2019-06-21,5.37,2,51,2127
26365,21005699,Vics cafe,45044964,Victor,Queens,Queens Village,40.72517,-73.76105,Shared room,1,11,2019-06-03,0.51,2,125,27
22249,17940198,"Cozy, queen sized bedroom in Bed-Stuy",24383863,Sophia,Brooklyn,Bedford-Stuyvesant,40.69471,-73.94406,Private room,2,24,2019-06-25,0.98,1,1,2143
6170,4513084,Beautiful Sunny Private Penthouse Suite,23401472,Heather,Manhattan,Harlem,40.82624,-73.94527,Entire home/apt,30,1,2016-07-30,0.03,2,244,1489
27267,21540496,Stylish 1 BD - 10 min to Manhattan & Central Park,156587568,Anna,Queens,Astoria,40.76129,-73.92009,Entire home/apt,1,56,2019-06-14,2.73,1,126,480


### Target Encoding - Replacing the categories with mean instead of frequency
Another method of encoding is called Target Encoding. Earlier, we learned about frequency encoding, where we encode the number of instances a category occurs as its value. Rather than encode the number of instances a category occurs, we can encode the mean of our target variable, like this:

In [67]:
# Identify non-numeric values in 'price' column
non_numeric_prices = df[~df['price'].apply(lambda x: str(x).replace('.', '').isdigit())]

# If non-numeric prices exist, remove those rows
if not non_numeric_prices.empty:
    print("Non-numeric prices found. Removing rows with non-numeric prices.")
    df = df.drop(non_numeric_prices.index)

# Convert 'price' column to float
df['price'] = df['price'].astype(float)

# Compute mean price by neighborhood
mean_price_by_neighborhood = df.groupby(['neighbourhood_group']).mean()['price']

TypeError: agg function failed [how->mean,dtype->object]

In [52]:
df_te = df.copy()
class Target_Encoding:
    def fit(self, train_df, target_col, categ_col):
        self.train_df = train_df
        self.target_col = target_col
        self.categ_col = categ_col
        self.target_encodings=None
    
    def fit(self):
    
        

TypeError: agg function failed [how->mean,dtype->object]