In [3121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import random

np.random.seed(38)
random.seed(38)

In [3122]:
df = pd.read_csv('train.csv')

In [3123]:
df['review_scores_cleanliness'].isnull().sum()

7679

In [3124]:
df['review_scores_cleanliness'].unique()

array([10.,  9., nan,  8.,  5.,  6.,  7.,  2.,  4.,  3.])

In [3125]:
df = df.drop(columns=['name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview',\
                      'notes', 'transit', 'access', 'interaction', 'house_rules', 'host_id', 'host_name',\
                      'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate',\
                      'host_acceptance_rate', 'host_is_superhost', 'host_neighbourhood', 'host_listings_count',\
                      'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_cleansed',\
                      'city', 'state', 'zipcode', 'market', 'country_code', 'country', 'amenities', 'first_review',\
                      'last_review', 'is_business_travel_ready', 'require_guest_profile_picture',\
                      'require_guest_phone_verification', 'reviews_per_month',\
                      'square_feet', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',\
                      'review_scores_checkin', 'review_scores_communication', 'review_scores_location',\
                      'review_scores_value', 'id', 'instant_bookable', 'property_type', 'cancellation_policy',\
                      'maximum_nights'])

values = {'bathrooms': 1.141376, 'bedrooms': 1.176869, 'beds': 1.571313}
df = df.fillna(value=values)

df['extra_people'] = df['extra_people'].replace('[\$,]', '', regex=True).astype(float)

df['logreviews'] = np.log(1 + df['number_of_reviews'])


df.head()

Unnamed: 0,neighbourhood_group_cleansed,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,guests_included,extra_people,minimum_nights,number_of_reviews,calculated_host_listings_count,logreviews
0,Brooklyn,Entire home/apt,2,1.0,1.0,1.0,Real Bed,145.0,2,30.0,7,6,1,1.94591
1,Manhattan,Entire home/apt,8,1.0,3.0,5.0,Real Bed,175.0,3,19.0,3,137,3,4.927254
2,Manhattan,Entire home/apt,2,1.0,0.0,1.0,Real Bed,180.0,1,0.0,2,3,1,1.386294
3,Brooklyn,Private room,1,1.0,1.0,1.0,Real Bed,42.0,1,0.0,3,0,1,0.0
4,Brooklyn,Private room,2,1.0,1.0,1.0,Real Bed,80.0,1,14.0,1,144,1,4.976734


In [3126]:
df.describe()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,price,guests_included,extra_people,minimum_nights,number_of_reviews,calculated_host_listings_count,logreviews
count,33538.0,33538.0,33538.0,33538.0,33538.0,33538.0,33538.0,33538.0,33538.0,33538.0,33538.0
mean,2.856789,1.141376,1.176869,1.571313,145.177291,1.50319,14.2329,6.843074,21.635548,3.956288,1.966263
std,1.848491,0.424623,0.743999,1.053739,133.085709,1.114226,24.120633,19.781187,40.488277,12.687366,1.536282
min,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
25%,2.0,1.0,1.0,1.0,69.0,1.0,0.0,1.0,1.0,1.0,0.693147
50%,2.0,1.0,1.0,1.0,109.0,1.0,0.0,2.0,5.0,1.0,1.791759
75%,4.0,1.0,1.0,2.0,175.0,2.0,25.0,5.0,23.0,2.0,3.178054
max,16.0,16.5,10.0,18.0,1999.0,16.0,300.0,1250.0,557.0,132.0,6.324359


In [3127]:
df.isnull().sum()

neighbourhood_group_cleansed      0
room_type                         0
accommodates                      0
bathrooms                         0
bedrooms                          0
beds                              0
bed_type                          0
price                             0
guests_included                   0
extra_people                      0
minimum_nights                    0
number_of_reviews                 0
calculated_host_listings_count    0
logreviews                        0
dtype: int64

In [3128]:
df = df.copy()
print('amount of data original: ' + str(len(df)))
df = df[df['number_of_reviews'] > 11]
print('amount of data after removing "few reviews" properties: ' + str(len(df)))

amount of data original: 33538
amount of data after removing "few reviews" properties: 12058


In [3129]:
df = df.dropna()
df.isnull().sum()

neighbourhood_group_cleansed      0
room_type                         0
accommodates                      0
bathrooms                         0
bedrooms                          0
beds                              0
bed_type                          0
price                             0
guests_included                   0
extra_people                      0
minimum_nights                    0
number_of_reviews                 0
calculated_host_listings_count    0
logreviews                        0
dtype: int64

In [3130]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,price,guests_included,extra_people,minimum_nights,number_of_reviews,calculated_host_listings_count,...,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
1,8,1.0,3.0,5.0,175.0,3,19.0,3,137,3,...,0,0,1,0,0,0,0,0,0,1
4,2,1.0,1.0,1.0,80.0,1,14.0,1,144,1,...,0,0,0,1,0,0,0,0,0,1
6,4,1.0,1.0,1.0,177.0,2,25.0,3,12,1,...,0,0,1,0,0,0,0,0,0,1
9,4,1.0,1.0,2.0,106.0,2,25.0,1,30,1,...,0,0,1,0,0,0,0,0,0,1
17,4,2.0,2.0,2.0,285.0,2,20.0,3,118,3,...,0,0,1,0,0,0,0,0,0,1


In [3131]:
#df_ohe = df[['neighbourhood_group_cleansed', 'room_type', 'bed_type']].copy()
#df = df.drop(columns=['neighbourhood_group_cleansed', 'room_type', 'bed_type'])

X = df.copy().drop('price', axis = 1)
y = df['price'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [3132]:
#df[['accommodates', 'bathrooms', 'bedrooms', 'price', 'guests_included', 'extra_people', 'minimum_nights', 'number_of_reviews', 'logreviews']] = StandardScaler().fit_transform(df[['accommodates', 'bathrooms', 'bedrooms', 'price', 'guests_included', 'extra_people', 'minimum_nights', 'number_of_reviews', 'logreviews']])
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3133]:
#df = pd.concat([df, df_ohe], axis=1)
#df

In [3134]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,price,guests_included,extra_people,minimum_nights,number_of_reviews,calculated_host_listings_count,...,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
1,8,1.0,3.0,5.0,175.0,3,19.0,3,137,3,...,0,0,1,0,0,0,0,0,0,1
4,2,1.0,1.0,1.0,80.0,1,14.0,1,144,1,...,0,0,0,1,0,0,0,0,0,1
6,4,1.0,1.0,1.0,177.0,2,25.0,3,12,1,...,0,0,1,0,0,0,0,0,0,1
9,4,1.0,1.0,2.0,106.0,2,25.0,1,30,1,...,0,0,1,0,0,0,0,0,0,1
17,4,2.0,2.0,2.0,285.0,2,20.0,3,118,3,...,0,0,1,0,0,0,0,0,0,1


In [3135]:
baseline = y_train.median()
print('Median value baseline: ' + str(baseline))

Median value baseline: 109.0


In [3136]:
baseline_error = np.sqrt(mean_squared_error(y_pred=np.ones_like(y_test) * baseline, y_true=y_test))
print('And we will be right +- ' + str(baseline_error))

And we will be right +- 126.96068782136055


In [3137]:
lr = LinearRegression()

In [3138]:
test_df = pd.read_csv('test.csv')
test_df = test_df.drop(columns=['name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview',\
                      'notes', 'transit', 'access', 'interaction', 'house_rules', 'host_id', 'host_name',\
                      'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate',\
                      'host_acceptance_rate', 'host_is_superhost', 'host_neighbourhood', 'host_listings_count',\
                      'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_cleansed',\
                      'city', 'state', 'zipcode', 'market', 'country_code', 'country', 'amenities', 'first_review',\
                      'last_review', 'is_business_travel_ready', 'require_guest_profile_picture',\
                      'require_guest_phone_verification','reviews_per_month',\
                      'square_feet', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',\
                      'review_scores_checkin', 'review_scores_communication', 'review_scores_location',\
                      'review_scores_value', 'id', 'instant_bookable', 'property_type', 'cancellation_policy',\
                      'maximum_nights'])

values = {'bathrooms': 1.141376, 'bedrooms': 1.176869, 'beds': 1.571313}
test_df = test_df.fillna(value=values)

test_df['extra_people'] = test_df['extra_people'].replace('[\$,]', '', regex=True).astype(float)

test_df['logreviews'] = np.log(1 + test_df['number_of_reviews'])


test_df.head()

test_df = pd.get_dummies(test_df)
test_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,accommodates,bathrooms,bedrooms,beds,guests_included,extra_people,minimum_nights,number_of_reviews,calculated_host_listings_count,logreviews,...,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
0,4,2.0,2.0,2.0,1,0.0,2,0,1,0.0,...,0,0,1,0,0,0,0,0,0,1
1,2,1.0,2.0,1.0,1,0.0,2,22,1,3.135494,...,0,0,1,0,0,0,0,0,0,1
2,1,1.0,1.0,1.0,1,20.0,1,6,1,1.94591,...,0,0,0,1,0,0,0,0,0,1
3,2,1.0,1.0,1.0,1,0.0,1,7,1,2.079442,...,0,0,0,1,0,0,0,0,0,1
4,3,2.0,1.0,2.0,2,10.0,2,14,9,2.70805,...,0,0,0,1,0,0,0,0,0,1


In [3139]:
test_df = test_df.copy()
print('amount of data original: ' + str(len(test_df)))
test_df

amount of data original: 17337


Unnamed: 0,accommodates,bathrooms,bedrooms,beds,guests_included,extra_people,minimum_nights,number_of_reviews,calculated_host_listings_count,logreviews,...,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
0,4,2.0,2.0,2.0,1,0.0,2,0,1,0.000000,...,0,0,1,0,0,0,0,0,0,1
1,2,1.0,2.0,1.0,1,0.0,2,22,1,3.135494,...,0,0,1,0,0,0,0,0,0,1
2,1,1.0,1.0,1.0,1,20.0,1,6,1,1.945910,...,0,0,0,1,0,0,0,0,0,1
3,2,1.0,1.0,1.0,1,0.0,1,7,1,2.079442,...,0,0,0,1,0,0,0,0,0,1
4,3,2.0,1.0,2.0,2,10.0,2,14,9,2.708050,...,0,0,0,1,0,0,0,0,0,1
5,1,1.0,1.0,1.0,1,50.0,1,13,1,2.639057,...,0,0,1,0,0,0,0,0,0,1
6,2,1.0,0.0,1.0,1,0.0,24,1,1,0.693147,...,0,0,1,0,0,0,0,0,0,1
7,3,1.0,2.0,1.0,3,0.0,5,1,1,0.693147,...,0,0,1,0,0,0,0,0,0,1
8,6,2.0,3.0,4.0,6,15.0,4,17,1,2.890372,...,0,0,1,0,0,0,0,0,0,1
9,4,1.0,1.0,2.0,1,0.0,5,0,1,0.000000,...,0,0,1,0,0,0,0,0,0,1


In [3140]:
test_df.isnull().sum()

accommodates                                  0
bathrooms                                     0
bedrooms                                      0
beds                                          0
guests_included                               0
extra_people                                  0
minimum_nights                                0
number_of_reviews                             0
calculated_host_listings_count                0
logreviews                                    0
neighbourhood_group_cleansed_Bronx            0
neighbourhood_group_cleansed_Brooklyn         0
neighbourhood_group_cleansed_Manhattan        0
neighbourhood_group_cleansed_Queens           0
neighbourhood_group_cleansed_Staten Island    0
room_type_Entire home/apt                     0
room_type_Private room                        0
room_type_Shared room                         0
bed_type_Airbed                               0
bed_type_Couch                                0
bed_type_Futon                          

In [3141]:
X.columns

Index(['accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included',
       'extra_people', 'minimum_nights', 'number_of_reviews',
       'calculated_host_listings_count', 'logreviews',
       'neighbourhood_group_cleansed_Bronx',
       'neighbourhood_group_cleansed_Brooklyn',
       'neighbourhood_group_cleansed_Manhattan',
       'neighbourhood_group_cleansed_Queens',
       'neighbourhood_group_cleansed_Staten Island',
       'room_type_Entire home/apt', 'room_type_Private room',
       'room_type_Shared room', 'bed_type_Airbed', 'bed_type_Couch',
       'bed_type_Futon', 'bed_type_Pull-out Sofa', 'bed_type_Real Bed'],
      dtype='object')

In [3142]:
scaler = StandardScaler()
scaler.fit(test_df)
test_df_scaled = scaler.transform(test_df)

In [3143]:
lr = LinearRegression()
alphas = [1000, 100, 50, 20, 10, 1, 0.1, 0.01]
l1_ratios = [0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9]
ridge = RidgeCV(alphas=alphas)
lasso = LassoCV(alphas=alphas, max_iter=10000)
elastic = ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios)

In [3144]:
pred_arr = []
for model, name in zip([lr, ridge, lasso, elastic], ['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet']):
   
    """
    
    model.fit(X_train_scaled, y_train)
    
    y_pred_train = model.predict(X_train_scaled)
    mrse_train = np.sqrt(mean_squared_error(y_pred=y_pred_train, y_true=y_train))
    
    y_pred = model.predict(test_df_scaled)
    mrse_test = 0
    #mrse_test = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test))
    
    
    """
    
    model.fit(X_train_scaled, y_train)
    y_pred_train = model.predict(X_train_scaled)
    mrse_train = np.sqrt(mean_squared_error(y_pred=y_pred_train, y_true=y_train))
    y_pred = model.predict(X_test_scaled)
    mrse_test = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test))
    
  

    
    best_alpha = ''
    if name != 'LinearRegression':
        best_alpha = ' best alpha: ' + str(model.alpha_)
    best_l1 = ''
    if name == 'ElasticNet':
        best_l1 = ' best l1: '+ str(model.l1_ratio_)
    print(name + ' mrse_train: ' + str(mrse_train) + ', mrse_test: ' + str(mrse_test) + best_alpha + best_l1)

LinearRegression mrse_train: 80.53398432519057, mrse_test: 86.86852603204208
Ridge mrse_train: 80.42877302613276, mrse_test: 86.69530469633256 best alpha: 100.0
Lasso mrse_train: 80.53725261688314, mrse_test: 86.69516591391705 best alpha: 1.0
ElasticNet mrse_train: 80.4529946754577, mrse_test: 86.74343387280334 best alpha: 0.1 best l1: 0.7


In [3145]:
y_pred[0:10]

array([ 91.16513775,  21.2351014 ,  33.82685183,  62.01534642,
        38.68740315, 122.15958996,  77.69731699, 236.86268   ,
       134.7448332 ,  59.39136887])

In [3146]:
len(y_pred)

3015

In [3147]:
#X_test

In [3148]:
test_ids = []
test_df = pd.read_csv('test.csv')
for (idx, row) in test_df.iterrows():
    test_ids.append(row.id)
len(test_ids)

  interactivity=interactivity, compiler=compiler, result=result)


17337

In [3149]:
output_df = pd.DataFrame()
output_df['Id'] = test_ids
output_df['Predicted'] = y_pred
output_df.to_csv('final.csv', index = False)

ValueError: Length of values does not match length of index

In [None]:
output_df.head()

In [None]:
og_df = pd.read_csv('train.csv')


In [None]:
high_error = X_train[np.abs(diff) > 80]
print('size high error: ' + str(len(high_error)))
low_error = X_train[np.abs(diff) < 10]
print('size low error: ' + str(len(low_error)))

In [None]:
for c in high_error.columns:
    plt.figure(figsize=(7,3))
    plt.subplot(121)
    plt.hist(low_error[c], color='b')
    plt.title(c + ' low_error')
    plt.subplot(122)
    plt.hist(high_error[c], color='r')
    plt.title(c + ' high_error')
    plt.show()