# Imports and Display Options

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np

In [2]:
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_columns', 20000)

# Dataframes

In [3]:
df = pd.read_csv("listings.csv")
df2 = pd.read_csv("listings.csv.gz")               

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
#features used in model
features = ['id', 'bedrooms', 'bathrooms', 'amenities', 'price']
#model dataframe
model_df = df2[features]
#model data, remove empty amenities
model_data = model_df[(model_df['amenities'] != '{}') & (model_df['bedrooms'] != 0) & (model_df['bathrooms'] != 0)]

# Helper functions

In [23]:
model_data.head()

Unnamed: 0,id,bedrooms,bathrooms,amenities,price,new_amenities,Groups,New_data
0,6,3.0,2.0,"{TV,Internet,Wifi,""Air conditioning"",Kitchen,""...",$295.00,"[TV, Internet, Wifi, Air conditioning, Kitchen...",TV Wifi Air conditioning Kitchen Free parking ...,"[TV, Wifi, Air conditioning, Kitchen, Free par..."
1,5570,2.0,1.0,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Free par...","$2,050.00","[TV, Cable TV, Internet, Wifi, Kitchen, Free p...",TV Wifi Kitchen Free parking on premises Heati...,"[TV, Wifi, Kitchen, Free parking on premises, ..."
2,38245,1.0,1.0,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",$75.00,"[TV, Cable TV, Internet, Wifi, Air conditionin...",TV Wifi Air conditioning Heating Dryer Smoke d...,"[TV, Wifi, Air conditioning, Heating, Dryer, S..."
3,39516,1.0,1.0,"{Internet,Wifi,Pool,Kitchen,""Pets live on this...",$74.00,"[Internet, Wifi, Pool, Kitchen, Pets live on t...",Wifi Kitchen Heating Smoke detector Carbon mon...,"[Wifi, Kitchen, Heating, Smoke detector, Carbo..."
4,52286,2.0,2.0,"{TV,""Cable TV"",Wifi,Kitchen,""Free parking on p...",$550.00,"[TV, Cable TV, Wifi, Kitchen, Free parking on ...",TV Wifi Kitchen Free parking on premises Washe...,"[TV, Wifi, Kitchen, Free parking on premises, ..."


### Function that changes the amenities data in the dataframe from json to a list

In [5]:
def json_to_list(df):
    new_data = []
    data = df['amenities'].values
    for num, amen in enumerate(data):
        new_data.append([word.strip('"') for word in amen.strip("{").strip("}").split(",")])
    df['new_amenities'] = new_data
    return df

In [6]:
model_data = json_to_list(model_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Function that figures out the unique amenities

In [7]:
def unique_amenities(df, column):
    uniques = []
    for data in df[column]:
        for d in data:
            if d in uniques:
                continue
            else:
                uniques.append(d)
    return uniques

In [8]:
amenities = unique_amenities(model_data, 'new_amenities')

In [11]:
amenities

['TV',
 'Internet',
 'Wifi',
 'Air conditioning',
 'Kitchen',
 'Free parking on premises',
 'Pets allowed',
 'Free street parking',
 'Heating',
 'Family/kid friendly',
 'Washer',
 'Dryer',
 'Smoke detector',
 'Carbon monoxide detector',
 'First aid kit',
 'Fire extinguisher',
 'Essentials',
 'Shampoo',
 '24-hour check-in',
 'Hangers',
 'Hair dryer',
 'Iron',
 'Laptop friendly workspace',
 'Baby monitor',
 'Outlet covers',
 'Bathtub',
 'Changing table',
 'High chair',
 'Children’s books and toys',
 'Babysitter recommendations',
 'Crib',
 'Children’s dinnerware',
 'Hot water',
 'Luggage dropoff allowed',
 'Other',
 'Cable TV',
 'Elevator',
 'Microwave',
 'Coffee maker',
 'Refrigerator',
 'Dishwasher',
 'Dishes and silverware',
 'Cooking basics',
 'Oven',
 'Stove',
 'BBQ grill',
 'Patio or balcony',
 'Long term stays allowed',
 'Waterfront',
 'Beachfront',
 'Pets live on this property',
 'translation missing: en.hosting_amenity_50',
 'Self check-in',
 'Keypad',
 'Wide hallways',
 'Well-li

### Function that gathers the counts of every amenity

In [12]:
def amen_counts(amen_lst, df, column):
    counts = [0 for num in range(len(amen_lst))]
    amen_data = df[column].values
    for num, data in enumerate(amen_data):
        for d in data:
            num = amen_lst.index(d)
            counts[num] += 1
    return counts

In [13]:
counts = amen_counts(amenities, model_data, 'new_amenities')

### Function that returns top 20 amenities 

In [14]:
def top_20(counts):
    largest = sorted(counts)[-20:]
    count_indexes = []
    for l in largest:
        count_indexes.append(counts.index(l))
        amens = []
    for c in count_indexes:
        amens.append(amenities[c])
    return largest, amens      

In [15]:
largest, top = top_20(counts)

In [16]:
largest, top

([5562,
  5810,
  6656,
  7070,
  7308,
  7945,
  8500,
  9066,
  9229,
  9287,
  9365,
  9566,
  9797,
  10114,
  10176,
  10499,
  10811,
  11084,
  11289,
  11560],
 ['Microwave',
  'Refrigerator',
  'Air conditioning',
  'Fire extinguisher',
  'Hot water',
  'Free parking on premises',
  'Laptop friendly workspace',
  'Iron',
  'Hair dryer',
  'Dryer',
  'Washer',
  'Shampoo',
  'Carbon monoxide detector',
  'Hangers',
  'Heating',
  'TV',
  'Kitchen',
  'Smoke detector',
  'Essentials',
  'Wifi'])

### Function that create columns with only the top amenities

In [17]:
def reduce_dimensions(df, column, top):
    all_data = []
    all_strings = []
    amens = df[column].values
    bedrooms = df['bedrooms'].values
    bathrooms = df['bathrooms'].values
    for num, data in enumerate(amens):
        new_data = []
        new_string = ""
        for d in data:
            if d in top:
                new_data.append(d)
                new_string += d + " "
            else:
                continue
        new_string += str(bedrooms[num])
        new_string += str(bathrooms[num])
        all_data.append(new_data)
        all_strings.append(new_string)
    df['Groups'] = all_strings
    df['New_data'] = all_data
    return df

In [18]:
model_data = reduce_dimensions(model_data, "new_amenities", top)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
model_data = model_data.dropna()

### Grouping system

In [20]:
encoder = OrdinalEncoder()

In [21]:
groups = encoder.fit_transform(model_data[['Groups']].values)

In [22]:
model_data['OrdinalGroups'] = groups

In [23]:
def optimal_pricing(df):
    groups = [word for word in df['OrdinalGroups'].values]
    ids = [word for word in df['id'].values]
    prices = [word for word in df['id'].values]
    for g in groups:
        data = []
        max_df = df[df['OrdinalGroups'] == g]
        max_price = max_df['price'].max()
        group_ids = [word for word in max_df['id'].values]
        indexes = []
        for i in group_ids:
            indexes.append(ids.index(i))
        for index in indexes:
            prices[index] = max_price
    df['price'] = prices
    return df

In [24]:
df = optimal_pricing(model_data)

In [25]:
def string_to_int(df, column):
    new_data = []
    data = df[column].values
    for d in data:
        d = d.strip("$")
        d = d.replace(",", "")
        d = d.split(".")[0]
        d = int(d)
        new_data.append(d)
    new_data = np.array(new_data)
    df[column] = new_data
    return df

In [26]:
df = string_to_int(df, "price")

In [27]:
def onehotencoder(df, column, amen_lst):
    for amen in amen_lst:
        new_data = []
        data = df[column].values
        for d in data:
            if amen in d:
                new_data.append(1)
            else:
                new_data.append(0)
        new_data = np.array(new_data)
        df[amen] = new_data
    return df

In [28]:
df = onehotencoder(df, "New_data",  top)

# Model

In [29]:
features = top + ["bedrooms", "bathrooms"]
target = 'price'
draft_features = df[features]
draft_target = df[target]

In [30]:
draft_features.shape, draft_target.shape

((11769, 22), (11769,))

In [31]:
X_train, X_test, y_train, y_test = train_test_split(draft_features, draft_target, train_size = .8, test_size = .2, random_state= 42)

In [32]:
X_train.shape, y_train.shape

((9415, 22), (9415,))

In [33]:
model = LinearRegression()
model.fit(draft_features, draft_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [34]:
def model_coefs():
    coefs = model.coef_
    positive_c = []
    negative_c = []
    positive_f = []
    negative_f = []
    for num, coef in enumerate(coefs):
        if coef > 0:
            positive_c.append(coef)
            positive_f.append(features[num])
        else:
            negative_c.append(coef)
            negative_f.append(features[num])
    print("Positives")
    for num, p in enumerate(positive_c):
        print(f"Feature: {positive_f[num]}, Coef: {p}")
    print("Negatives")
    for num, p in enumerate(negative_c):
        print(f"Feature: {negative_f[num]}, Coef: {p}")
    return

In [35]:
model_coefs()

Positives
Feature: Microwave, Coef: 14.47389031524099
Feature: Air conditioning, Coef: 10.296484933707688
Feature: Laptop friendly workspace, Coef: 2.0750380014746814
Feature: Hair dryer, Coef: 13.723780472538975
Feature: Washer, Coef: 61.38119855364762
Feature: Carbon monoxide detector, Coef: 28.49524508031815
Feature: Heating, Coef: 27.41358673641551
Feature: TV, Coef: 7.588903608757145
Feature: bedrooms, Coef: 128.0552528277956
Feature: bathrooms, Coef: 69.79291458526092
Negatives
Feature: Refrigerator, Coef: -2.3677015298022166
Feature: Fire extinguisher, Coef: -18.451445823887568
Feature: Hot water, Coef: -39.27699999085979
Feature: Free parking on premises, Coef: -24.863846517545678
Feature: Iron, Coef: -4.120649524803768
Feature: Dryer, Coef: -19.491921189397313
Feature: Shampoo, Coef: -4.3133885983305555
Feature: Hangers, Coef: -14.729056502341004
Feature: Kitchen, Coef: -0.9808039146240726
Feature: Smoke detector, Coef: -21.67907313358683
Feature: Essentials, Coef: -6.60124441