## Import Req'd Libraries 

In [1]:
import pandas as pd
import os
import seaborn as sns

In [2]:
#Getting all the file_names in a given directory
def get_file_names(folder):
    #Listing entries present in given folder
    entries = os.listdir(folder)
    for i in entries:
        if 'csv' not in i:
            entries.remove(i)
    return sorted(entries, reverse=True)

#Saving the file
def save_file(root, name_of_file, my_dataframe):
    #Test if save directory exists
    try:
        my_dataframe.to_csv(root+'processed_data/'+ name_of_file, index=False)
    #Otherwise make the directory and then save
    except:
        os.mkdir(root+'processed_data')
        my_dataframe.to_csv(root+'processed_data/'+ name_of_file, index=False)

In [3]:
def initialization(filepath):
    df = pd.read_csv(filepath, 
                     usecols = ['id','last_scraped', 'host_is_superhost', 
                            'latitude','longitude', 'property_type',
                           'room_type','accommodates','bathrooms',
                           'bedrooms','beds','amenities', 'price',
                            'instant_bookable','cancellation_policy'])
    df["last_scraped"] = pd.to_datetime(df["last_scraped"])
    df["price"] = df["price"].apply(lambda x: x.replace('$','').replace(',', '').replace('.00', '')).astype("int")
    return df

## Concatenating All Data Together

In [107]:
# get list of filenames
root = '../'
name = get_file_names(root+'original_dataset')
accum_kept = 0
accum_removed = 0

# combine all files into one
print("COMBINING INTO ONE DATAFRAME...")
frames = []
for i in name:
    print("--Processing: ", i)
    filepath = root+'original_dataset/'+i
    frames.append(initialization(filepath))
full_df = pd.concat(frames, sort=False)

initial_samples = len(full_df)
print("Initial # of Samples: ", len(full_df))

print(" ")

# Drop missing values
print("DELETING SAMPLES WITH MISSING VALUES...")
full_df = full_df.dropna()
samples1 = len(full_df)
print("Number of Samples Removed: ", initial_samples-samples1)

# reset index
full_df = df.reset_index(drop=True)

COMBINING INTO ONE DATAFRAME...
--Processing:  19_08_08_listings.csv
--Processing:  19_07_08_listings.csv
--Processing:  19_06_04_listings.csv
--Processing:  19_05_06_listings.csv
--Processing:  19_04_08_listings.csv
--Processing:  19_03_07_listings.csv
--Processing:  19_02_04_listings.csv
--Processing:  19_01_13_listings.csv
--Processing:  18_12_06_listings.csv
--Processing:  18_11_04_listings.csv
--Processing:  18_10_06_listings.csv
--Processing:  18_09_08_listings.csv
Initial # of Samples:  236507
 
DELETING SAMPLES WITH MISSING VALUES...
Number of Samples Removed:  559


## Numericalize Categorical Data

#### Mapping for Categorical Features

In [108]:
cancellation_policies = {
    "flexible": 1,
    "moderate": 5,
    "strict_14_with_grace_period": 14,
    "super_strict_30": 60,
    "super_strict_60": 120
}
room_types = {
    "Entire home/apt": 0,
    "Private room": 0.5,
    "Shared room": 1
}
true_false = {
    't': 1,
    'f': 0
}

In [109]:
# convert t/f into 1/0
print("NUMERICALIZING TRUE/FALSE...")
for i in ["host_is_superhost", "instant_bookable"]:
    full_df[i] = full_df[i].map(true_false)
    print(full_df[i].value_counts())

print(" ")

# cancellation policy
print("NUMERICALIZING CANCELLATION POLICY...")
print(full_df["cancellation_policy"].value_counts())
full_df["cancellation_policy"] = full_df["cancellation_policy"].map(cancellation_policies)

print(" ")

# room_type
print("NUMERICALIZING ROOM TYPE...")
print(full_df["room_type"].value_counts())
full_df["room_type"] = full_df["room_type"].map(room_types)

NUMERICALIZING TRUE/FALSE...
0.0    176235
1.0     60160
Name: host_is_superhost, dtype: int64
0    147847
1     88660
Name: instant_bookable, dtype: int64
 
NUMERICALIZING CANCELLATION POLICY...
strict_14_with_grace_period    94224
moderate                       71084
flexible                       70711
super_strict_30                  345
super_strict_60                  143
Name: cancellation_policy, dtype: int64
 
NUMERICALIZING ROOM TYPE...
Entire home/apt    152547
Private room        79799
Shared room          4161
Name: room_type, dtype: int64


In [111]:
corr = full_df.corrwith(full_df["price"])

In [112]:
corr

id                     0.011635
host_is_superhost      0.041779
latitude              -0.271832
longitude              0.021700
room_type             -0.506367
accommodates           0.523217
bathrooms              0.295164
bedrooms               0.435255
beds                   0.422576
price                  1.000000
instant_bookable      -0.076184
cancellation_policy    0.160786
dtype: float64