Importing the required libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer
%matplotlib inline

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import r2_score


Loading the data

In [None]:
df=pd.read_csv('listings.csv')

In [None]:
df.head()

The preview above does not show all the column names, nor the data type each column contains.

Let's see the percentage of nulls in each column.

In [None]:
# Share of missing values per column, truncated to a whole percent.
total_rows = len(df)
for column in df.columns:
    null_pct = int((df[column].isnull().sum() / total_rows) * 100)
    print(f"{column:<32} %  Nulls: {str(null_pct): >10}")

Many columns hold data that provides no value, and nothing useful can be extracted from them through feature engineering.

Will do an initial drop and inspect the rest of the columns accordingly

In [None]:
# Columns removed in the initial pass. Each label is listed exactly once
# ('picture_url' must not be repeated).
initial_drop_cols = [
    'listing_url', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
    'scrape_id', 'last_scraped', 'experiences_offered', 'requires_license', 'license',
    'jurisdiction_names', 'state', 'city', 'market', 'smart_location', 'country_code',
    'country', 'longitude', 'latitude', 'square_feet', 'has_availability',
    'calendar_last_scraped', 'host_url', 'host_name', 'host_location', 'host_about',
    'host_thumbnail_url', 'host_picture_url', 'street', 'availability_30',
    'availability_60', 'availability_90', 'first_review', 'last_review',
    'minimum_nights', 'maximum_nights', 'extra_people',
]
# drop() raises KeyError if any label is missing, e.g. when this cell is run twice.
df = df.drop(columns=initial_drop_cols)

Let's look at the data types.

In [None]:
# One line per column: left-aligned name followed by its dtype.
for column, dtype in df.dtypes.items():
    print(f"{column:<32}  : {dtype}")

Many numerical columns are formatted as objects; they need to be converted to float for easier exploration.

In [None]:
def numeric_converter(df):
    """Convert text columns holding formatted numbers into floats.

    Every column of ``df`` is cleaned and parsed: leading/trailing ``%``,
    ``$``, ``,`` and space characters are stripped, remaining commas are
    removed, and the result is passed to ``pd.to_numeric`` with
    ``downcast='float'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should all be converted.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index. ``df`` itself is left
        untouched.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when a cleaned value cannot be parsed.
    """
    # Work on a copy: callers pass a column selection of a larger frame, and
    # writing into that selection triggers SettingWithCopyWarning.
    converted = df.copy()
    for column in converted.columns:
        values = converted[column]
        # Already-numeric columns have no `.str` accessor; skipping the text
        # clean-up for them keeps the function safe to run more than once.
        if not pd.api.types.is_numeric_dtype(values):
            values = values.str.strip('% $ ,').str.replace(',', '', regex=False)
        converted[column] = pd.to_numeric(values, downcast='float')
    return converted

# Text columns that actually hold percentages or currency amounts.
numeric_text_cols = ['host_response_rate', 'price', 'security_deposit', 'cleaning_fee']
df[numeric_text_cols] = numeric_converter(df[numeric_text_cols])



Instead of dropping the columns below outright, we will first check whether positive language correlates with the unit price.

The columns below carry no sentiment by definition, so instead we check whether a longer description of the unit goes with higher demand and therefore a higher price.

In [None]:
cols=['notes','transit']

In [None]:
# Replace each free-text column with the character count of its entries.
for text_col in cols:
    df[text_col] = df[text_col].str.len()

In [None]:
df_sent1=df[['id','name','summary','space','description']]

Filling the nulls with a neutral word because the sentiment analyzer does not work with nulls

In [None]:
df.head()

In [None]:
df_sent1=df_sent1.fillna('none')

Defining a function and using it to replace each of the columns defined above in (df_sent1) with their sentiment value

There seems to be no correlation between any of the above columns and price.

In [None]:
#plt.scatter(x=analyzed_df['description'],y=df['price']);
#analyzed_df['description'].corr(df['price'])

In [None]:
# Review-score columns that are added up into one total per listing.
score_cols = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
              'review_scores_checkin', 'review_scores_communication', 'review_scores_value']

# Missing values count as 0. fillna() returns a new frame, so df1 is
# independent of df and can be extended without chained-assignment warnings.
df1 = df[['id', 'number_of_reviews'] + score_cols].fillna(0)

# Sum all six scores in a single expression. A bare continuation line that
# starts with `+` is evaluated as a separate statement and discarded, which
# would leave the communication and value scores out of the total.
df1['review_scores_total'] = df1[score_cols].sum(axis=1)

plt.scatter(x=df1['review_scores_total'], y=df['price']);
df1['review_scores_total'].corr(df['price'])

Drop the columns as they provide no value

In [None]:
# Free-text and review-score columns examined above that are not kept as features.
text_and_review_cols = [
    'name', 'summary', 'space', 'description', 'notes', 'transit',
    'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_value',
    'review_scores_location', 'neighborhood_overview',
]
df = df.drop(columns=text_and_review_cols)

Create a function that splits columns with multiple strings into separate columns and another function to count them

In [None]:
def str_split(df,col):
    """Split a delimited string column into one column per item.

    Double quotes are removed from ``df[col]``, then spaces, square brackets
    and single quotes are stripped from both ends of each string, and the
    result is split on commas. Other wrapper characters (for example braces)
    are not stripped and stay attached to the first and last item.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to split. It is not modified.
    col : str
        Name of the string column.

    Returns
    -------
    pandas.DataFrame
        One column per comma-separated piece (integer column labels), padded
        with missing values for rows with fewer pieces.
    """
    # Build the cleaned Series separately instead of writing it back into
    # df[col]: callers pass slices of the main frame, and assigning into a
    # slice raises SettingWithCopyWarning and alters the caller's data.
    cleaned = df[col].str.replace('"', '', regex=False).str.strip(" []''")
    return cleaned.str.split(',', expand=True)

def row_count(df):
    """Add a ``count`` column with the number of non-null cells in each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to count over. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the extra ``count`` column.
    """
    # Vectorised equivalent of applying Series.count to every row.
    df['count'] = df.count(axis=1)
    return df

Use the above two functions on the Amenities column to check if there's a correlation between the number of amenities and price

In [None]:
# Replace the raw amenities string with the number of comma-separated items in it.
# .copy() makes df2 an independent frame, so assigning into it below does not
# write into a slice of df.
df2 = df[['amenities']].copy()
df2['amenities'] = row_count(str_split(df2, 'amenities'))['count']


df3 = pd.concat([df['id'], df2], axis=1)

df['amenities'] = df3['amenities']

In [None]:
for column in df.select_dtypes(include=['object']).columns:
    display(pd.crosstab(index=df[column],
                        columns='% observations', 
                        normalize='columns'))
    print("# of unique values {}".format(df[column].nunique()))

# show summary statistics
display(df.describe())

# build histograms for each numeric feature
%matplotlib inline
hist = df.hist(bins=30, sharey=False, figsize=(15, 10))

In [None]:
df.columns

In [None]:
# Independent copy of the column, so later string clean-up on it cannot write
# into a slice of df (which raises SettingWithCopyWarning).
host_ver_df = df[['host_verifications']].copy()

In [None]:
# Categorical features that are one-hot encoded further down.
categorical_cols = [
    'host_response_time', 'host_is_superhost', 'host_identity_verified',
    'neighbourhood_group_cleansed', 'zipcode', 'is_location_exact',
    'property_type', 'room_type', 'bed_type', 'instant_bookable',
    'cancellation_policy', 'require_guest_profile_picture',
    'require_guest_phone_verification',
]
df_dummies = df[categorical_cols]

In [None]:
# Remaining columns that are not used as features.
unused_cols = [
    'host_since', 'host_acceptance_rate', 'host_neighbourhood',
    'host_has_profile_pic', 'neighbourhood_cleansed', 'neighbourhood',
    'weekly_price', 'monthly_price', 'calendar_updated',
]
df = df.drop(columns=unused_cols)

In [None]:
# Keep only the int64/float64/float32/int32 columns as numeric features.
# NOTE(review): 'host_verifications' is only turned into a count in a later
# cell; if it is still text at this point it is excluded from df_numeric and
# never reaches the model. Confirm whether that is intended.
df_numeric=df.select_dtypes(include=['int64','float64','float32','int32'])

df_numeric

In [None]:
df.shape[1]-df_dummies.shape[1]-df_numeric.shape[1]

In [None]:
df_dummies

In [None]:
# Number of verification methods per host: split the list-like string into
# pieces, then count the non-null pieces in each row.
verification_parts = str_split(host_ver_df, 'host_verifications')
df['host_verifications'] = row_count(verification_parts)['count']

In [None]:
df.head()

In [None]:
# One line per column: left-aligned name followed by its dtype.
for column, dtype in df.dtypes.items():
    print(f"{column:<32}  : {dtype}")

In [None]:
# One-hot encode the categorical features; dummy_na=True also adds an
# indicator column for missing values.
# NOTE(review): get_dummies only encodes object/category columns by default —
# confirm 'zipcode' is not read as a numeric dtype, or it is passed through as-is.
df_dummies=pd.get_dummies(df_dummies,dummy_na=True)

In [None]:
# Pairwise correlations between all numeric columns.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df_numeric.corr(), annot=True, ax=ax)

In [None]:
# Numeric columns left out of the feature set.
excluded_numeric_cols = [
    'id', 'host_id', 'availability_365', 'host_response_rate',
    'host_listings_count', 'host_total_listings_count',
    'calculated_host_listings_count', 'reviews_per_month',
]
df_numeric = df_numeric.drop(columns=excluded_numeric_cols)

In [None]:
# 'beds' and 'bathrooms' are removed from the numeric features as well.
df_numeric = df_numeric.drop(columns=['beds', 'bathrooms'])

In [None]:
# Pairwise correlations between the numeric columns that remain.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df_numeric.corr(), annot=True, ax=ax)

In [None]:
# Separate the regression target from the numeric predictors.
y = df_numeric['price']
df_numeric = df_numeric.drop(columns='price')

In [None]:
def normalize(df):
    """Standardise every column of ``df`` in place and return it.

    Each column gets its own ``StandardScaler`` fitted on that column alone;
    the scaled values overwrite the original column.
    """
    for column in list(df.columns):
        scaled = StandardScaler().fit_transform(df[[column]])
        df[column] = scaled[:, 0]
    return df


# NOTE(review): the scalers are fitted on all rows, before the train/test
# split further down, so test rows contribute to the scaling statistics.
df_numeric = normalize(df_numeric)

In [None]:
df_model=pd.concat([df_numeric,df_dummies],axis=1)

In [None]:
# Any value still missing in the combined feature frame becomes 0.
df_model = df_model.fillna(0)

In [None]:
df_model

In [None]:
X=df_model

In [None]:
# 70% of the rows go to training; the remaining 30% is halved into a test set
# and a validation set (15% each). Both splits use random_state=42.
# NOTE(review): X_val / y_val are not used in the cells visible here.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [None]:
lr=LinearRegression()

In [None]:
lr.fit(X_train,y_train)

In [None]:
y_pred=lr.predict(X_test)

In [None]:
r2_score(y_test,y_pred)

In [None]:
# Fix the seed so the forest, and the R² score computed from it, is
# reproducible across runs (same value as the train/test split seed).
RF=RandomForestRegressor(random_state=42)

In [None]:
RF.fit(X_train,y_train)

In [None]:
y_pred_RF=RF.predict(X_test)

In [None]:
r2_score(y_test,y_pred_RF)

In [None]:
# Fitted coefficients of the linear model, one per feature column.
# The scikit-learn attribute is `coef_`; `lr.coeff._` raises AttributeError.
lr.coef_