Read in the necessary libraries

In [536]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns

In [1]:
# The following function is used from the course materials and reduces the size of the dataframe for calculating an optimal r_sqared value:

def find_optimal_lm_mod(X, y, cutoffs, test_size = .30, plot=True):
    '''
    INPUT
    X - pandas dataframe, X matrix
    y - pandas dataframe, response variable
    cutoffs - list of ints, cutoff for number of non-zero values in dummy categorical vars
    test_size - float between 0 and 1, default 0.3, determines the proportion of data as test data
    random_state - int, default 42, controls random state for train_test_split
    plot - boolean, default 0.3, True to plot result

    OUTPUT
    r2_scores_test - list of floats of r2 scores on the test data
    r2_scores_train - list of floats of r2 scores on the train data
    lm_model - model object from sklearn
    X_train, X_test, y_train, y_test - output from sklearn train test split used for optimal model
    '''
    r2_scores_test, r2_scores_train, num_feats, results = [], [], [], dict()
    for cutoff in cutoffs:

        #reduce X matrix
        reduce_X = X.iloc[:, np.where((X.sum() > cutoff) == True)[0]]
        num_feats.append(reduce_X.shape[1])

        #split the data into train and test
        X_train, X_test, y_train, y_test = train_test_split(reduce_X, y, test_size = test_size)

        #fit the model and obtain pred response
        lm_model = LinearRegression(normalize=True)
        lm_model.fit(X_train, y_train)
        y_test_preds = lm_model.predict(X_test)
        y_train_preds = lm_model.predict(X_train)

        #append the r2 value from the test set
        r2_scores_test.append(r2_score(y_test, y_test_preds))
        r2_scores_train.append(r2_score(y_train, y_train_preds))
        results[str(cutoff)] = r2_score(y_test, y_test_preds)

    if plot:
        plt.plot(num_feats, r2_scores_test, label="Test", alpha=.5)
        plt.plot(num_feats, r2_scores_train, label="Train", alpha=.5)
        plt.xlabel('Number of Features')
        plt.ylabel('Rsquared')
        plt.title('Rsquared by Number of Features')
        plt.legend(loc=1)
        plt.show()

    best_cutoff = max(results, key=results.get)

    #reduce X matrix
    reduce_X = X.iloc[:, np.where((X.sum() > int(best_cutoff)) == True)[0]]
    num_feats.append(reduce_X.shape[1])

    #split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(reduce_X, y, test_size = test_size)

    #fit the model
    lm_model = LinearRegression(normalize=True)
    lm_model.fit(X_train, y_train)

    return r2_scores_test, r2_scores_train, lm_model, X_train, X_test, y_train, y_test

Read in the data to be analyzed

In [538]:
df_detailed_listings = pd.read_csv('./resource/detailed_listings.csv')

In [539]:
# df_listing_cleaned = df_detailed_listings.drop(columns=['listing_url', 'scrape_id', 'last_scraped', 'source', 'picture_url', 'host_url', \
#     'host_thumbnail_url', 'host_picture_url', 'latitude', 'longitude', 'calendar_updated', 'calendar_last_scraped', 'license'], 
#     axis=1).copy()
# df_listing_cleaned = df_listing_cleaned.dropna(axis=1, how='all')
# df_listing_cleaned['price'] = df_listing_cleaned.price.str[1:].str.replace(',','').str.split('.').str[0].astype(int)

Here we try to train a model to predict the price of a listing. Therefor we need to prepare the data further:

1) Drop all features which doesn't contain useful data for our model, such as URLs, dates and coordinates.
2) Drop all entries with missing values for the respondent, in this case the price.
3) If there are missing values for numerical features we fill them with the mean.
4) For the categorical values we need to implement dummy variables

In [540]:
# Drop all features which doesn't contain useful data for our model, such as URLs, dates and personal information.
df = df_detailed_listings.drop(columns=['id', 'host_id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'picture_url', 'host_url', \
    'host_thumbnail_url', 'host_picture_url', 'latitude', 'name', 'host_about', 'description', 'neighborhood_overview', 'host_name', 'longitude', 'calendar_updated', 'calendar_last_scraped', 'license'], 
    axis=1).copy()

In [541]:
# modify the price column and change it into a usable integer datatype:
df['price'] = df.price.str[1:].str.replace(',','').str.split('.').str[0].astype(int)

# drop all entries with missing values for the respondent, in this case the price.
df = df.dropna(subset=['price'], axis=0, how='all').copy()

In [542]:
df.columns.to_series().groupby(df.dtypes).groups

{int64: ['accommodates', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'], float64: ['host_listings_count', 'host_total_listings_count', 'bathrooms', 'bedrooms', 'beds', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month'], object: ['host_since', 'host_location', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_neighbourhoo

In [543]:
print(f'Numerical features in the dataset: \n{list(df.select_dtypes(["int","float"]).columns)}')
print(f'Categorical features in the dataset: \n{list(df.select_dtypes("object").columns)}')

Numerical features in the dataset: 
['host_listings_count', 'host_total_listings_count', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'reviews_per_month']
Categorical features in the dataset: 
['host_since', 'host_location', 'host_response_time', 'host_response_rate', 'host_acceptance_

Now we need to fill in the missing data. First for the numerical features:

In [544]:
df.columns[df.isna().any()]

Index(['host_since', 'host_location', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_has_profile_pic',
       'host_identity_verified', 'neighbourhood', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'reviews_per_month'],
      dtype='object')

In [545]:
# To fill in all missing values regarding the numeric features we loop through them 
# and use the fillna method to fill them with the mean of the column
num_vars = df.select_dtypes(include=['float', 'int']).columns
for col in num_vars:
    df[col].fillna((df[col].mean()), inplace=True)

After filling all missing values for the numeric columns we take a look on the remaining missing values:

In [546]:
df.columns[df.isna().any()]

Index(['host_since', 'host_location', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_neighbourhood', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood', 'bathrooms', 'bathrooms_text', 'first_review',
       'last_review'],
      dtype='object')

In [547]:
df_NaN = df.loc[:, df.columns[df.isna().any()]].copy() # type: ignore
df_NaN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15717 entries, 0 to 15716
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   host_since              15697 non-null  object 
 1   host_location           13605 non-null  object 
 2   host_response_time      7981 non-null   object 
 3   host_response_rate      7981 non-null   object 
 4   host_acceptance_rate    8838 non-null   object 
 5   host_is_superhost       15713 non-null  object 
 6   host_neighbourhood      9303 non-null   object 
 7   host_has_profile_pic    15697 non-null  object 
 8   host_identity_verified  15697 non-null  object 
 9   neighbourhood           8188 non-null   object 
 10  bathrooms               0 non-null      float64
 11  bathrooms_text          15703 non-null  object 
 12  first_review            12755 non-null  object 
 13  last_review             12755 non-null  object 
dtypes: float64(1), object(13)
memory usage

Somehow the 'bathrooms' column contains only missing values...

In [548]:
df_NaN.bathrooms.unique()

array([nan])

so we drop it.

In [549]:
df = df.drop(columns=['bathrooms']).copy()

In [550]:
df_NaN = df.loc[:, df.columns[df.isna().any()]].copy() # type: ignore
df_NaN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15717 entries, 0 to 15716
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   host_since              15697 non-null  object
 1   host_location           13605 non-null  object
 2   host_response_time      7981 non-null   object
 3   host_response_rate      7981 non-null   object
 4   host_acceptance_rate    8838 non-null   object
 5   host_is_superhost       15713 non-null  object
 6   host_neighbourhood      9303 non-null   object
 7   host_has_profile_pic    15697 non-null  object
 8   host_identity_verified  15697 non-null  object
 9   neighbourhood           8188 non-null   object
 10  bathrooms_text          15703 non-null  object
 11  first_review            12755 non-null  object
 12  last_review             12755 non-null  object
dtypes: object(13)
memory usage: 1.6+ MB


In [551]:
df_NaN.head()

Unnamed: 0,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_has_profile_pic,host_identity_verified,neighbourhood,bathrooms_text,first_review,last_review
0,2008-10-19,"Coledale, Australia",within a few hours,100%,40%,f,Prenzlauer Berg,t,t,"Berlin, Germany",1 bath,2009-06-20,2021-01-01
1,2009-08-25,"Berlin, Germany",within a day,75%,0%,f,Prenzlauer Berg,t,t,"Berlin, Germany",2.5 baths,2015-08-09,2020-01-04
2,2009-11-14,"Berlin, Germany",within a few hours,100%,87%,t,Prenzlauer Berg,t,t,"Berlin, Germany",2 baths,2010-11-30,2022-10-23
3,2009-11-18,"Berlin, Germany",within a day,90%,9%,t,Prenzlauer Berg,t,t,,1 bath,2010-06-29,2021-06-21
4,2010-11-08,"Berlin, Germany",within a day,100%,90%,t,Prenzlauer Berg,t,t,,1.5 baths,2011-09-06,2022-10-13


After investigating the other columns containing missing values, we further drop the columns containing dates, because they're not that important for our model in regard of predicting the price of an apartment. Also the hosts location doesn't seem that relevant either.

In [552]:
df = df.drop(columns=['host_since','host_location','first_review','last_review','host_neighbourhood']).copy()

In [553]:
df_NaN = df.loc[:, df.columns[df.isna().any()]].copy() # type: ignore
df_NaN.head()

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood,bathrooms_text
0,within a few hours,100%,40%,f,t,t,"Berlin, Germany",1 bath
1,within a day,75%,0%,f,t,t,"Berlin, Germany",2.5 baths
2,within a few hours,100%,87%,t,t,t,"Berlin, Germany",2 baths
3,within a day,90%,9%,t,t,t,,1 bath
4,within a day,100%,90%,t,t,t,,1.5 baths


Since the 'neighbourhood' column contains pretty messy data (shown in the next cell) and we already have a cleansed version of ot, we can drop it as well.

In [554]:
print(df_NaN.neighbourhood.unique())
df = df.drop(columns=['neighbourhood']).copy()

['Berlin, Germany' nan 'Friedrichshain, Berlin, Germany'
 'Berlin-Wedding, Berlin, Germany' 'Berlin Neukölln , Berlin, Germany'
 'Berlin-Mitte, Berlin, Germany' 'Berlín, Berlin, Germany'
 'Берлин, Berlin, Germany' 'Berlin, Mitte, Berlin, Germany'
 'Berlin - Mitte, Germany' 'Berlin - Schöneberg, Berlin, Germany'
 'Berlin- Charlottenburg, Berlin, Germany'
 'Berlin, schmargendorf, Germany' 'Mitte/Tiergarten, Berlin, Germany'
 'berlin, Berlin, Germany' 'Berlin-Kreuzberg, Berlin, Germany'
 'Berlin Friedrichshain, Berlin, Germany' 'Berlin, Zehlendorf, Germany'
 'Weissenhoher Strasse 14, Berlin, Germany' 'Berlin, DE, Germany'
 'Berlin, neukoelln, Germany' 'Berlin, Be, Germany'
 'Berlin, Berlin, DE, Berlin, Germany' 'Hoppegarten, Brandenburg, Germany'
 'Germany' 'Berlin , Germany' 'Berlin, SN, Germany'
 'Weissensee, Berlin, Germany' 'Potsdam, Brandenburg, Germany'
 'Berlin-Bohnsdorf, Germany' 'Berlin, Berlín, Germany' '柏林, Germany'
 'Berlin Prenzlauer Berg , Berlin, Germany' 'Mitte, Berlin, Ge

In [555]:
df_NaN = df.loc[:, df.columns[df.isna().any()]].copy() # type: ignore
df_NaN.head()

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,bathrooms_text
0,within a few hours,100%,40%,f,t,t,1 bath
1,within a day,75%,0%,f,t,t,2.5 baths
2,within a few hours,100%,87%,t,t,t,2 baths
3,within a day,90%,9%,t,t,t,1 bath
4,within a day,100%,90%,t,t,t,1.5 baths


The rest of the columns containing missing data seem to be interesting and should be included into the model, if they can. Lets see how many values ara actually missing:

In [556]:
df_NaN.apply(lambda col: f"{col.name}: {round(col.isnull().sum() / len(col) * 100, 2)}% missing values")

host_response_time           host_response_time: 49.22% missing values
host_response_rate           host_response_rate: 49.22% missing values
host_acceptance_rate       host_acceptance_rate: 43.77% missing values
host_is_superhost              host_is_superhost: 0.03% missing values
host_has_profile_pic        host_has_profile_pic: 0.13% missing values
host_identity_verified    host_identity_verified: 0.13% missing values
bathrooms_text                    bathrooms_text: 0.09% missing values
dtype: object

Since there are more than 40% missing values in the three columns 'host_response_time', 'host_response_rate' and 'host_acceptance_rate' we drop these as well, since there is just to much data missing.

In [557]:
df = df.drop(columns=['host_response_time','host_response_rate','host_acceptance_rate']).copy()

In [558]:
df_NaN = df.loc[:, df.columns[df.isna().any()]].copy() # type: ignore
df_NaN.head()

Unnamed: 0,host_is_superhost,host_has_profile_pic,host_identity_verified,bathrooms_text
0,f,t,t,1 bath
1,f,t,t,2.5 baths
2,t,t,t,2 baths
3,t,t,t,1 bath
4,t,t,t,1.5 baths


For the rest we jus drop the rows containing the missing data, since the amount is neglectable. 

In [559]:
num_rows_before_drop = df.shape[0]
df = df.dropna(subset=['host_is_superhost','host_has_profile_pic','host_identity_verified','bathrooms_text'], axis=0).copy()
num_rows_after_drop = df.shape[0]

print(f'Number of rows droped: {num_rows_before_drop - num_rows_after_drop}')

Number of rows droped: 34


In [560]:
df.select_dtypes(include=['object']).head()

Unnamed: 0,host_is_superhost,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,property_type,room_type,bathrooms_text,amenities,has_availability,instant_bookable
0,f,"['email', 'phone']",t,t,Prenzlauer Berg Südwest,Pankow,Entire rental unit,Entire home/apt,1 bath,"[""Smoke alarm"", ""Kitchen"", ""Private hot tub"", ...",t,f
1,f,"['email', 'phone']",t,t,Prenzlauer Berg Südwest,Pankow,Entire rental unit,Entire home/apt,2.5 baths,"[""Smoke alarm"", ""Bed linens"", ""Oven"", ""Kitchen...",t,f
2,t,"['email', 'phone']",t,t,Prenzlauer Berg Nordwest,Pankow,Entire rental unit,Entire home/apt,2 baths,"[""Smoke alarm"", ""Bed linens"", ""Oven"", ""Kitchen...",t,f
3,t,"['email', 'phone', 'work_email']",t,t,Prenzlauer Berg Nordwest,Pankow,Entire rental unit,Entire home/apt,1 bath,"[""Smoke alarm"", ""Dishes and silverware"", ""Host...",t,f
4,t,"['email', 'phone']",t,t,Prenzlauer Berg Süd,Pankow,Entire rental unit,Entire home/apt,1.5 baths,"[""Bed linens"", ""TV with standard cable"", ""Dish...",t,f


The columns 'host_verifications' and 'amenities' are actual strings of lists, so we need to convert them to actual lists containing separate strings to be able to get dummy variables for them.

In [561]:
df['host_verifications'] = df['host_verifications'].apply(eval).copy()
df['amenities'] = df['amenities'].apply(eval).copy()

After continuing with this dataframe, substituting the other categorical columns also with dummy variables, the model has shown to be significantly overfitted with a r_squared value of -2.484865441020741e+27. To combat overfitting we reduce the number of features by selecting those seeming relevant and discarding the rest.

In [562]:
df = df.drop(columns=['amenities']).copy()

After that we can get the dummy variables.

In [563]:
dummy_cols_hv = pd.get_dummies(df['host_verifications'].apply(pd.Series).stack()).sum(level=0)
# dummy_cols_am = pd.get_dummies(df['amenities'].apply(pd.Series).stack()).sum(level=0)

  dummy_cols_hv = pd.get_dummies(df['host_verifications'].apply(pd.Series).stack()).sum(level=0)
  dummy_cols_hv = pd.get_dummies(df['host_verifications'].apply(pd.Series).stack()).sum(level=0)


And append them to the dataframe.

In [564]:
df = pd.concat([df, dummy_cols_hv], axis=1)

In [565]:
# df = pd.concat([df, dummy_cols_am], axis=1)

After that we can drop the original columns containing the lists of strings.

In [566]:
df = df.drop(columns=['host_verifications']).copy()

Checking for missing values again:

In [567]:
df.columns[df.isna().any()].tolist()

['email', 'phone', 'photographer', 'work_email']

After appending the dummy variables for the both columns containing lists there are several missing values back in our dataframe. Since the information shouldn't be to influential (since it's mostly different kinds of soap) we just give them the value 0 and treat them as they weren't there.

In [568]:
df = df.fillna(0).copy()

In [569]:
df.isna().any().any()

False

Now we have to get the dummy variable of all other categorical columns.

In [570]:
df.select_dtypes(include=['object']).nunique()

host_is_superhost                 2
host_has_profile_pic              2
host_identity_verified            2
neighbourhood_cleansed          138
neighbourhood_group_cleansed     12
property_type                    66
room_type                         4
bathrooms_text                   27
has_availability                  2
instant_bookable                  2
dtype: int64

In [571]:
catvars = df.select_dtypes(include=['object']).columns.to_list()

In [572]:
df = pd.get_dummies(df, columns=catvars, drop_first=True)

After prepping the data we split the dataset into the X Matrix and the respondent y, and further into sub-datasets used for training and testing the model.

In [573]:
print(df.price.isnull().sum())
y = df.price

0


In [574]:
X = df.drop(columns=['price']).copy()

Number of categorical features:

In [575]:
print(f"Number of categorical features: {len(X.select_dtypes(include='object').columns.to_list())}")
X.shape

Number of categorical features: 0


(15683, 283)

Substitute all categorical features with dummy vaiables:

In [576]:
cat_vars = X.select_dtypes(include=['object']).copy().columns
for var in  cat_vars:
    # for each cat add dummy var, drop original column
    X = pd.concat([X.drop(var, axis=1), pd.get_dummies(X[var], prefix=var, prefix_sep='_', drop_first=True)], axis=1)

Get an overview over the new dimensions of the dataframes:

In [577]:
print(f"Number of categorical features: {len(X.select_dtypes(include='object').columns.to_list())}")
print(f"Dimensions of the dataframe: \nFeatures: {X.shape[1]} - Rows: {X.shape[0]}")

Number of categorical features: 0
Dimensions of the dataframe: 
Features: 283 - Rows: 15683


In [578]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30)

In [579]:
print(X_test.shape, X_train.shape, y_train.shape, y_test.shape, X.shape, y.shape)

(4705, 283) (10978, 283) (10978,) (4705,) (15683, 283) (15683,)


In [580]:
lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




LinearRegression(normalize=True)

In [581]:
#Predict using your model
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)

#Score using your model
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

print(test_score, train_score)

-2.9612077311497045e+24 0.023684199646298087


In [582]:
#cutoffs here pertains to the number of missing values allowed in the used columns.
#Therefore, lower values for the cutoff provides more predictors in the model.
cutoffs = [5000, 3500, 2500, 1000, 100, 50, 30, 20, 10, 5]

r2_scores_test, r2_scores_train, lm_model, X_train, X_test, y_train, y_test = find_optimal_lm_mod(X, y, cutoffs, plot=False)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

In [583]:
print(X_test.shape, X_train.shape, y_train.shape, y_test.shape, X.shape, y.shape)
print(X.isna().any().any(), X_train.isna().any().any(), X_test.isna().any().any())
print(y.isna().any().any(), y_train.isna().any().any(), y_test.isna().any().any())

(4705, 42) (10978, 42) (10978,) (4705,) (15683, 283) (15683,)
False False False
False False False
