In [62]:
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [108]:
class Model:
    """Linear-regression pipeline that is rebuilt from a CSV file on every call.

    No intermediate result is cached on the instance: each step calls the
    step before it, all the way back to reading the file, so the methods can
    be invoked in any order.

    Parameters
    ----------
    filename
        Path of the CSV file, passed to ``pd.read_csv``.
    target
        Name of the column to predict.
    random_value
        Forwarded as ``random_state`` to ``train_test_split``.
    split_percentage
        Forwarded as ``test_size`` to ``train_test_split``.
    drop_columns
        Column labels removed before features and target are separated.
    """

    def __init__(self, filename, target, random_value, split_percentage, drop_columns):
        self.filename = filename
        self.target = target
        self.random_value = random_value
        self.split_percentage = split_percentage
        self.drop_columns = drop_columns

    def read_to_dataframe(self):
        """Load the CSV at ``self.filename`` and return it as a DataFrame."""
        return pd.read_csv(self.filename)

    def select_feature_n_target(self):
        """Return ``(X, y)`` built from a fresh read of the CSV.

        ``self.drop_columns`` are removed first; ``y`` is the ``self.target``
        column and ``X`` is every remaining column except the target.
        """
        df = self.read_to_dataframe()
        df = df.drop(self.drop_columns, axis=1)
        y = df[self.target]
        X = df.drop([self.target], axis=1)
        return X, y

    def transformer(self):
        """Return ``(X, y)`` with every object-dtype feature column integer-encoded.

        Columns of any other dtype are returned unchanged.
        """
        X, y = self.select_feature_n_target()
        # NOTE(review): only columns whose dtype compares equal to 'object' are
        # encoded; text stored under a dedicated pandas string dtype would be
        # skipped -- confirm against the pandas version in use.
        # NOTE(review): scikit-learn documents LabelEncoder as an encoder for
        # target labels; OrdinalEncoder is its feature-side counterpart.
        for col in X.columns:
            if X[col].dtype == 'object':
                # Cast to str before encoding (presumably so mixed or missing
                # values can be sorted by the encoder -- confirm).
                as_text = X[col].astype('str')
                X[col] = LabelEncoder().fit_transform(as_text)
        return X, y

    def scale_trainingset(self):
        """Return ``(X_scaled, y)`` with the encoded features standardised.

        The scaler is fitted on the complete feature table. Note that
        ``split_dataset`` does not call this method: training and evaluation
        run on the unscaled output of ``transformer``.
        """
        X, y = self.transformer()
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        # fit_transform returns a plain array; restore the column labels.
        X_new = pd.DataFrame(X_scaled, columns=X.columns)
        return X_new, y

    def split_dataset(self):
        """Split the encoded data and return ``(X_train, x_test, y_train, y_test)``.

        ``self.random_value`` seeds the shuffle and ``self.split_percentage``
        sets the size of the test partition.
        """
        X, y = self.transformer()
        X_train, x_test, y_train, y_test = train_test_split(
            X, y, random_state=self.random_value, test_size=self.split_percentage
        )
        return X_train, x_test, y_train, y_test

    def train_model(self, X_train=None, y_train=None):
        """Fit a ``LinearRegression`` and return it.

        Parameters
        ----------
        X_train, y_train : optional
            Training data to fit on. When either is omitted the training
            partition of a fresh ``split_dataset()`` call is used, which is
            the behaviour of calling ``train_model()`` with no arguments.
        """
        if X_train is None or y_train is None:
            X_train, _, y_train, _ = self.split_dataset()
        model = LinearRegression()
        model.fit(X_train, y_train)
        return model

    def evaluate(self):
        """Return the mean absolute error of a freshly trained model on the test split.

        The data is split exactly once and the model is fitted on the training
        half of that same split. Splitting separately for training and for
        testing is only equivalent when ``random_value`` is a fixed seed; with
        ``random_value=None`` the two shuffles would differ and test rows could
        end up in the training data.
        """
        X_train, x_test, y_train, y_test = self.split_dataset()
        model = self.train_model(X_train, y_train)
        pred = model.predict(x_test)
        return mean_absolute_error(y_test, pred)
       
    
def model_predict(model, sampledata):
    """Return a copy of ``sampledata`` with a ``'predictions'`` column appended.

    Parameters
    ----------
    model
        Any fitted estimator exposing ``predict``.
    sampledata
        DataFrame of feature rows, handed to ``model.predict`` as-is.

    Returns
    -------
    A new DataFrame holding the original columns plus ``'predictions'``.
    The caller's frame is left untouched, so the same feature frame can be
    passed in again without the predictions column being fed back to the
    model as an extra feature.
    """
    pred = model.predict(sampledata)
    result = sampledata.copy()
    result['predictions'] = pred
    return result

In [94]:
# --- Run configuration ---
filename = 'AB_NYC_2019.csv'  # CSV file that contains the dataset
target = 'price'              # dependent variable the model tries to predict
random_value = 0              # fixed seed so the train/test split is reproducible
split_percentage = 0.3        # fraction of the dataset held out as the test set
drop_columns = [              # columns removed before modelling
    'id',
    'name',
    'host_id',
    'host_name',
    'last_review',
    'reviews_per_month',
]

In [95]:
# Bundle the run configuration into one pipeline object.
PriceML = Model(
    filename=filename,
    target=target,
    random_value=random_value,
    split_percentage=split_percentage,
    drop_columns=drop_columns,
)

In [96]:
# Load the raw CSV through the pipeline object and preview the first rows.
df = PriceML.read_to_dataframe()
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [86]:
# Missing-value count for every column of the raw frame.
df.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [97]:
# Keep the target Series under its own name: binding it to `target` would
# overwrite the column-name string defined in the configuration cell, and a
# later re-run of the Model(...) cell would then receive a Series instead.
features, target_values = PriceML.select_feature_n_target()
print('----features----')
print(features.head())
print(' ')
print('-----target-------')
target_values.head()

----features----
  neighbourhood_group neighbourhood  latitude  longitude        room_type  \
0            Brooklyn    Kensington  40.64749  -73.97237     Private room   
1           Manhattan       Midtown  40.75362  -73.98377  Entire home/apt   
2           Manhattan        Harlem  40.80902  -73.94190     Private room   
3            Brooklyn  Clinton Hill  40.68514  -73.95976  Entire home/apt   
4           Manhattan   East Harlem  40.79851  -73.94399  Entire home/apt   

   minimum_nights  number_of_reviews  calculated_host_listings_count  \
0               1                  9                               6   
1               1                 45                               2   
2               3                  0                               1   
3               1                270                               1   
4              10                  9                               1   

   availability_365  
0               365  
1               355  
2               365  

0    149
1    225
2    150
3     89
4     80
Name: price, dtype: int64

In [98]:
# Integer-encode the object-dtype (text) feature columns; columns of any
# other dtype come back unchanged.
transformed_features, transformed_target = PriceML.transformer()
print('----transformed_features----')
transformed_features.head()

----transformed_features----


Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,1,108,40.64749,-73.97237,1,1,9,6,365
1,2,127,40.75362,-73.98377,0,1,45,2,355
2,2,94,40.80902,-73.9419,1,3,0,1,365
3,1,41,40.68514,-73.95976,0,1,270,1,194
4,2,61,40.79851,-73.94399,0,10,9,1,0


In [89]:
# scale_trainingset() returns the target column unchanged next to the scaled
# features. Bind it to its own name so the `target` column-name string from
# the configuration cell is not overwritten with a Series.
scaled_features, target_unscaled = PriceML.scale_trainingset()
print('----Scaled_features----')
scaled_features.head()

----Scaled_features----


Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,-0.917828,0.012762,-1.493849,-0.437652,0.909359,-0.293996,-0.320414,-0.034716,1.91625
1,0.441222,0.289156,0.452436,-0.684639,-0.924247,-0.293996,0.487665,-0.156104,1.840275
2,0.441222,-0.190897,1.468399,0.222497,0.909359,-0.196484,-0.522433,-0.186451,1.91625
3,-0.917828,-0.961892,-0.803398,-0.16445,-0.924247,-0.293996,5.538156,-0.186451,0.617065
4,0.441222,-0.67095,1.27566,0.177216,-0.924247,0.144807,-0.320414,-0.186451,-0.856865


In [106]:
# Partition the encoded data into training and test portions.
X_train, X_test, y_train, y_test = PriceML.split_dataset()

print('---Trainset for features----')
print(X_train.shape, X_test.shape, end='\n\n')  # second newline is the blank separator line
print('-----Trainingset for target-----')
print(y_train.head())

---Trainset for features----
(34226, 9) (14669, 9)

-----Trainingset for target-----
13115    180
10214    100
577      110
40078     61
33301    165
Name: price, dtype: int64


In [107]:
# Fit a LinearRegression on the training partition; the pipeline reloads,
# encodes and splits the CSV internally rather than reusing X_train/y_train.
model = PriceML.train_model()

In [101]:
# Mean absolute error on the held-out test partition, in the units of the
# target column. evaluate() fits its own model; it does not reuse `model`.
print(f'Mean absolute error: {PriceML.evaluate()}')

Mean absolute error: 75.62451849582862


In [109]:
# Hand model_predict its own copy of the test features so X_test stays
# feature-only and this cell can be re-run safely.
result = model_predict(model, X_test.copy())
result.head()

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365,predictions
43813,2,197,40.7243,-74.0111,0,3,0,1,42,241.969579
32734,2,144,40.72555,-73.99283,0,1,5,1,75,231.897505
25276,1,214,40.71687,-73.95012,0,5,5,3,31,190.97192
36084,1,190,40.64036,-74.00822,1,1,13,5,141,131.379977
17736,1,13,40.6837,-73.93325,0,2,4,1,0,155.947258
