In [107]:
import pandas as pd
import numpy as np

#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# Custom Transformers
from sklearn.base import BaseEstimator, TransformerMixin

In [109]:
# Load the raw listings export; the file is comma-separated, which is read_csv's default delimiter.
df = pd.read_csv("Data/listings.csv")

In [110]:
#Custom transformers (for template purposes of how pipeline works)

#Drop columns
class ColumnDropperTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns : label or list of labels
        Column name(s) handed to ``DataFrame.drop`` when ``transform`` is called.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so calls can be chained.

        ``y`` is accepted (and ignored) because ``Pipeline.fit`` passes it
        positionally to the final step, and ``fit_transform`` forwards it
        whenever a target is supplied. Without it those calls raise TypeError.
        """
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the configured columns.

        Raises
        ------
        KeyError
            Propagated from ``DataFrame.drop`` if a configured column is absent.
        """
        return X.drop(columns=self.columns)
    
#Filter
class DropNas(BaseEstimator, TransformerMixin):
    """Stateless row filter: drops rows with a missing value in any required column.

    The required columns are ``bathrooms``, ``bedrooms``, ``beds`` and ``price``.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so calls can be chained.

        ``y`` is accepted (and ignored) so the transformer can be fitted
        through ``Pipeline.fit`` or alongside a target without a TypeError.
        """
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the rows that have a NaN in a required column.

        The original index labels of the surviving rows are kept.
        """
        # One call with a list is equivalent to the chained per-column drops
        # (how="any" is the default) and does not depend on pandas accepting
        # a bare string for `subset`.
        required = ["bathrooms", "bedrooms", "beds", "price"]
        return X.dropna(subset=required)

In [111]:
# Columns removed before modelling (one per line, original order preserved).
drop_columns = [
    # host details
    "host_id",
    "host_url",
    "host_name",
    "host_location",
    "host_about",
    "host_thumbnail_url",
    "host_picture_url",
    "host_neighbourhood",
    "host_since",
    "host_listings_count",
    "host_total_listings_count",
    # identifiers and links
    "id",
    "scrape_id",
    "listing_url",
    "picture_url",
    # min/max night variants
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    # scrape / review dates, free text and misc
    "last_scraped",
    "source",
    "first_review",
    "last_review",
    "license",
    "neighbourhood",
    "neighborhood_overview",
    "neighbourhood_group_cleansed",
    "name",
    "description",
    "calendar_updated",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    "calendar_last_scraped",
    # location, availability and stay limits
    "longitude",
    "latitude",
    "availability_365",
    "minimum_nights",
    "maximum_nights",
    "availability_30",
    "availability_60",
    "availability_90",
    # review counts and scores
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "number_of_reviews",
    "review_scores_value",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_checkin",
    "review_scores_cleanliness",
    "review_scores_communication",
    "review_scores_location",
    "reviews_per_month",
    # booking flags and remaining host attributes
    "has_availability",
    "instant_bookable",
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "host_response_time",
    "property_type",
    "host_verifications",
]

# Apply the dropper on its own (outside any pipeline) to inspect what is left.
drop_transformer = ColumnDropperTransformer(columns=drop_columns)
df_transformed = drop_transformer.fit_transform(df)

# Missing-value count per surviving column, followed by the surviving column names.
print(df_transformed.isna().sum())
df_transformed.columns

In [129]:
# Option 1: Pipeline with step names we choose ourselves
my_pipeline_named = Pipeline(
    steps=[
        ("dropnas", DropNas()),
        ("dropcolumns", ColumnDropperTransformer(columns=drop_columns)),
    ]
)

# Option 2: make_pipeline, which generates the step names for us
my_pipeline_unamed = make_pipeline(
    DropNas(),
    ColumnDropperTransformer(columns=drop_columns),
)

In [135]:
# Run the named pipeline on the raw frame: rows with missing values are filtered
# first ("dropnas"), then the unused columns are removed ("dropcolumns").
df_transformed = my_pipeline_named.fit_transform(df)
# Show which columns survive the pipeline.
df_transformed.columns

Index(['neighbourhood_cleansed', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price'],
      dtype='object')

In [None]:
# TODO: Implement the idea of the code below, where we separate categorical and numerical data and apply different transformers to each (!)
# using ColumnTransformer

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# NOTE(review): these column names look like they belong to a housing reference
# example rather than the listings data loaded above; replace them with the
# listings columns before running this cell for real.
num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms",
               "total_bedrooms", "population", "households", "median_income"]
cat_attribs = ["ocean_proximity"]

# `num_pipeline` was referenced below without being defined anywhere visible.
# Assumed numeric branch: median imputation followed by standardisation.
# TODO confirm this is the intended numeric preprocessing.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler())

# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

# Route each column group through its own pipeline and concatenate the results.
preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])