- KNN
- Scikit-learn pipelines

In [1]:
import pandas as pd 
import numpy as np

In [3]:
# Read only the first 4000 rows of AB_NYC_2019.csv (nrows caps the read).
df = pd.read_csv('AB_NYC_2019.csv', nrows=4000)

In [4]:
from sklearn.neighbors import KNeighborsRegressor

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold-out split with the default test fraction (3000 of the 4000 rows end up
# in the training part). Fixing random_state makes the shuffle, and therefore
# every metric computed on df_val, reproducible across kernel restarts.
df_train, df_val = train_test_split(df, random_state=1)

In [7]:
# Number of rows that ended up in the training split.
len(df_train)

3000

In [10]:
# Training features are the two coordinate columns (missing values -> 0);
# the target is the listing price.
X_train = (
    df_train
    .loc[:, ['latitude', 'longitude']]
    .fillna(0)
    .values
)
y_train = df_train['price'].values

In [11]:
# Validation features/target, prepared exactly like the training ones.
X_val = (
    df_val
    .loc[:, ['latitude', 'longitude']]
    .fillna(0)
    .values
)
y_val = df_val['price'].values

In [12]:
# KNN regressor with 5 neighbours (the printed repr below shows no non-default
# arguments, i.e. 5 is also the library default).
knn = KNeighborsRegressor(n_neighbors=5)

In [13]:
# Fit on the training coordinates and prices.
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [15]:
# Predicted prices for the validation rows.
y_pred = knn.predict(X_val)

In [None]:
# Sweep the number of neighbours and record the validation RMSE for each value,
# so the settings can actually be compared.
rmse_by_k = {}

for i in range(1, 50):
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_val)
    # Root mean squared error between true and predicted validation prices.
    rmse_by_k[i] = np.sqrt(np.mean((y_val - y_pred) ** 2))

# Show the k with the lowest validation RMSE together with that RMSE.
best_k = min(rmse_by_k, key=rmse_by_k.get)
best_k, rmse_by_k[best_k]

## Pipelines

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

In [61]:
# Reload the same file with the same 4000-row cap, rebinding `df` for the
# pipelines section.
df = pd.read_csv('AB_NYC_2019.csv', nrows=4000)

In [62]:
# Missing review counts become 0. Note: 'number_of_reviews' is also listed in
# `numerical`, which a later cell fills with 0 as a whole.
df['number_of_reviews'] = df['number_of_reviews'].fillna(0)

In [63]:
# Listing names: only replace missing values with an empty string.
df['name'] = df['name'].fillna('')

# Categorical columns: lowercase, spaces -> underscores, missing -> 'NA'.
for column in ['neighbourhood_group', 'neighbourhood', 'room_type']:
    df[column] = df[column].str.lower().str.replace(' ', '_').fillna('NA')

In [64]:
# Names of the numeric columns that are passed through to the model unchanged.
numerical = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

In [74]:
# Replace missing values with 0 in every column listed in `numerical`.
df[numerical] = df[numerical].fillna(0)

In [75]:
# Each entry is (name, transformer, column selection):
#   - numeric columns are forwarded untouched,
#   - the three categorical columns are one-hot encoded,
#   - the listing name becomes word counts (words in at least 100 listings).
transformations = [
    ('numerical', 'passthrough', numerical),
    (
        'categories',
        OneHotEncoder(dtype='int32'),
        ['neighbourhood_group', 'neighbourhood', 'room_type'],
    ),
    ('name', CountVectorizer(min_df=100, dtype='int32'), 'name'),
]

# Columns not mentioned above are dropped from the output.
tranformer = ColumnTransformer(transformers=transformations, remainder='drop')


In [77]:
# Fit the column transformer on the whole frame.
tranformer.fit(df)

ColumnTransformer(transformers=[('numerical', 'passthrough',
                                 ['latitude', 'longitude', 'minimum_nights',
                                  'number_of_reviews', 'reviews_per_month',
                                  'calculated_host_listings_count',
                                  'availability_365']),
                                ('categories', OneHotEncoder(dtype='int32'),
                                 ['neighbourhood_group', 'neighbourhood',
                                  'room_type']),
                                ('name',
                                 CountVectorizer(dtype='int32', min_df=100),
                                 'name')])

In [79]:
# Feature matrix produced by the fitted column transformer.
X = tranformer.transform(df)

In [80]:
# Regression target: the listing price as a NumPy array.
y = df['price'].values

In [81]:
from sklearn.linear_model import LinearRegression

In [82]:
# fit() returns the estimator itself, so construction and fitting can be chained;
# the bare name on the last line keeps the estimator repr as the cell output.
lr = LinearRegression().fit(X, y)
lr

LinearRegression()

In [83]:
from sklearn.pipeline import Pipeline

In [89]:
# Same feature set as before, but the categorical columns are encoded with
# CountVectorizer so that rare values can be dropped through min_df.
#
# token_pattern='.+' treats the whole (already normalised) value as one token.
# The previous '.*' also matched the empty string at the end of each value,
# which added a constant '' feature to each of the three vectorizers.
transformations = [
    ('numerical', 'passthrough', numerical),
    ('neighbourhood', CountVectorizer(token_pattern='.+', min_df=100, dtype='int32'), 'neighbourhood'),
    ('ng', CountVectorizer(token_pattern='.+', min_df=50, dtype='int32'), 'neighbourhood_group'),
    ('room_type', CountVectorizer(token_pattern='.+', min_df=100, dtype='int32'), 'room_type'),
    ('name', CountVectorizer(min_df=100, dtype='int32'), 'name')
]

# Columns not mentioned above are dropped from the output.
tranformer = ColumnTransformer(transformations, remainder='drop')

In [90]:
# Feature extraction followed by a linear model, bundled into one estimator.
pipeline = Pipeline(
    steps=[
        ('transormer', tranformer),
        ('lr', LinearRegression()),
    ]
)

In [106]:
# Fit transformer and model in one call; the trailing ';' hides the repr.
pipeline.fit(df, df['price'].values);

In [92]:
# Predictions for the same frame the pipeline was fitted on.
pipeline.predict(df)

array([ 80.28976523, 340.61684387,  45.10203035, ..., 162.8269308 ,
        82.08441501, 111.36359434])

In [94]:
from sklearn.base import TransformerMixin

In [95]:
from sklearn.base import BaseEstimator


class ConcatenatingTranformer(TransformerMixin, BaseEstimator):
    """Collapse several string columns into one ``col=value`` string per row.

    For every row the output is ``"c1=v1 c2=v2 ..."`` (columns in the order
    they appear in ``X``), which a text vectorizer can then tokenize on
    whitespace.

    Inheriting ``BaseEstimator`` provides ``get_params``/``set_params``, which
    scikit-learn needs to clone a pipeline containing this step (for example
    during cross-validation or grid search).
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a Series with one concatenated ``col=value`` string per row.

        ``X`` must expose ``.columns`` and support ``X[c]`` lookups, and is
        expected to hold string values. It needs at least one column:
        with none, ``res`` stays a plain ``str`` and ``.str`` raises
        ``AttributeError``.
        """
        columns = list(X.columns)

        res = ''

        for c in columns:
            # str + Series broadcasts element-wise, so from the first column
            # on `res` is a Series aligned with X.
            res = res + ' ' + c + '=' + X[c]

        # Drop the leading space introduced by the first iteration.
        return res.str.strip()

In [96]:
# Stand-alone instance; not referenced again in the visible cells.
ct = ConcatenatingTranformer()

In [98]:
# Categorical sub-pipeline: join the columns into "col=value" tokens, then
# count the tokens that occur in at least 100 rows.
p2 = Pipeline([
    ('concatenate', ConcatenatingTranformer()),
    # Raw string: '\S' in a normal literal is an invalid escape sequence.
    ('vectorize', CountVectorizer(token_pattern=r'\S+', min_df=100))
])

In [100]:
# Fit and apply the categorical sub-pipeline on the three categorical columns.
p2.fit_transform(df[['neighbourhood_group', 'neighbourhood', 'room_type']])

<4000x17 sparse matrix of type '<class 'numpy.int64'>'
	with 10165 stored elements in Compressed Sparse Row format>

In [None]:
# Hand-written version of the "col=value" concatenation, for comparison.
(
    ' neighbourhood_group=' + df['neighbourhood_group']
    + ' neighbourhood=' + df['neighbourhood']
    + ' room_type=' + df['room_type']
)

In [102]:
# Final feature set: numeric passthrough, one bag of "col=value" tokens for the
# three categorical columns, and word counts for the listing name.
transformations = [
    ('numerical', 'passthrough', numerical),
    ('categories', Pipeline([
        ('concatenate', ConcatenatingTranformer()),
        # Raw string: '\S' in a normal literal is an invalid escape sequence.
        ('vectorize', CountVectorizer(token_pattern=r'\S+', min_df=100))
    ]), ['neighbourhood', 'neighbourhood_group', 'room_type']),
    ('name', CountVectorizer(min_df=100, dtype='int32'), 'name')
]

# Columns not mentioned above are dropped from the output.
tranformer = ColumnTransformer(transformations, remainder='drop')

# Feature extraction followed by a linear model, bundled into one estimator.
pipeline = Pipeline([
    ('transormer', tranformer),
    ('lr', LinearRegression())
])

In [103]:
# fit() returns the pipeline itself, so fitting and predicting on the same
# frame can be chained into a single expression.
pipeline.fit(df, df['price'].values).predict(df)

array([ 80.28977269, 340.61684897,  45.10203641, ..., 162.82693488,
        82.08441669, 111.36359873])

In [104]:
import pickle

In [105]:
# Serialize the pipeline object to pipeline.bin.
# NOTE(review): pickle stores custom classes by reference, so loading this file
# presumably requires ConcatenatingTranformer to be importable under the same
# module path in the loading process — confirm before deploying.
with open('pipeline.bin', 'wb') as f_out:
    pickle.dump(pipeline, f_out)