In [None]:
from __future__ import print_function
import json
import zipfile
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy import ndimage
import descartes
import geopandas as gpd
import seaborn as sns
import googlemaps
import gmaps
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.preprocessing import MultiLabelBinarizer
porter = PorterStemmer()
lancaster=LancasterStemmer()

sns.set_style('darkgrid')
from shapely.geometry import Point, Polygon

%matplotlib inline

In [None]:
# desc: plot histogram according to the param Series and a name 
def histplot(data, name=None, title=None, kde_flag=True, hist_kws=None):
    """Draw a histogram of ``data`` on a new 6x4 figure.

    Parameters
    ----------
    data : values forwarded unchanged to ``sns.distplot``.
    name : str, optional
        Prefix for the x-axis label ("<name> histogram"). When omitted the
        label is just "histogram" instead of raising on ``None + str``.
    title : str, optional
        Figure title; left untouched when omitted.
    kde_flag : bool
        Forwarded as ``kde=`` to ``sns.distplot``.
    hist_kws : dict, optional
        Forwarded as ``hist_kws=`` to ``sns.distplot``.

    Returns nothing on purpose, so that a call on the last line of a cell
    does not echo an Axes repr.
    """
    plt.figure(figsize = (6,4))
    # Guard the concatenation: `name` defaults to None.
    xlabel = "histogram" if name is None else name + " histogram"
    axes_props = {'xlabel': xlabel}
    if title is not None:
        axes_props['title'] = title
    sns.distplot(data, kde=kde_flag, hist_kws=hist_kws).set(**axes_props)
    
# desc: use RE to filter (clean) text features for further processing (ie. stemming)
def clean_text(list_t):
    """Return a new list with every string in ``list_t`` normalised.

    Each string has its apostrophes deleted, every remaining non-letter
    character turned into a space, runs of whitespace collapsed to a single
    space, and the result lower-cased.
    """
    def _scrub(raw):
        # apostrophes are dropped outright so "dog's" becomes "dogs", not "dog s"
        without_apostrophes = re.sub("\'", "", raw)
        letters_only = re.sub("[^a-zA-Z]"," ", without_apostrophes)
        # split()/join squeezes repeated whitespace and trims both ends
        return ' '.join(letters_only.split()).lower()

    return [_scrub(entry) for entry in list_t]

# desc: find outliers from data according to a threshold, return their indices
def findOutlierIndex(data, threshold):
    """Locate the entries of ``data`` whose absolute z-score exceeds ``threshold``.

    Returns the value of ``np.where`` applied to the boolean mask, i.e. a
    tuple of index arrays holding the positions of the flagged entries.
    """
    scores = stats.zscore(data)
    exceeds = np.absolute(scores) > threshold
    return np.where(exceeds)


In [None]:
# load data from zip file
# Every member of the archive is parsed in turn; `d` ends up holding the
# parsed JSON of the last member listed.
d = None
data = None
with zipfile.ZipFile("train.json.zip", "r") as archive:
    for member in archive.namelist():
        raw_bytes = archive.read(member)
        d = json.loads(raw_bytes.decode("utf-8"))

# `data` keeps the frame as loaded; `rental_train` is an independent working copy.
data = pd.DataFrame.from_dict(d)
rental_train = data.copy()

# <b>Exploratory data analysis (use the training dataset to perform EDA)</b>
*   Plot histograms for the following numeric columns: Price, Latitude & Longitude. 
*   Plot hour-wise listing trend and find out the top 5 busiest hours of postings. 
*   Visualization to show the proportion of target variable values. 


In [None]:
print(rental_train.shape)
numOfRow = rental_train.shape[0]

# clean price: keep rows strictly between the 0.5th and 99.5th percentiles.
# The bounds are truncated with the built-in int(); the np.int alias used
# previously was deprecated in NumPy 1.20 and removed in 1.24.
price_front_percentile = np.percentile(rental_train.price, 0.5)
price_end_percentile = np.percentile(rental_train.price, 99.5)
price_in_range = (rental_train['price'] < int(price_end_percentile)) & (rental_train['price'] > int(price_front_percentile))

# Series of the surviving prices
price_clean = rental_train.loc[price_in_range, 'price']

# clean lat&lon: bounding box used to decide which coordinates are kept
lat_up = 40.95
lat_down = 40.5
lon_left = -74.1
lon_right = -73.8

lat_in_box = (lat_up >= rental_train['latitude']) & (rental_train['latitude'] >= lat_down)
lon_in_box = (lon_right >= rental_train['longitude']) & (rental_train['longitude'] >= lon_left)

lat_clean = rental_train[lat_in_box]
lon_clean = rental_train[lon_in_box]
lat_lon_clean = rental_train[lat_in_box & lon_in_box]
print("number of records cleaned out by lat: ", numOfRow-lat_clean.shape[0])
print("number of records cleaned out by lon: ", numOfRow-lon_clean.shape[0])
print("number of records cleaned out by lat and lon: ", numOfRow-lat_lon_clean.shape[0])


# reduce the single-axis frames to the one column each was filtered on
lat_clean = lat_clean['latitude']
lon_clean = lon_clean['longitude']




In [None]:
# 1. Plot histograms for the following numeric columns: Price, Latitude & Longitude.

# Raw prices, with everything above 15000 piled into the right-most bin.
clipped_prices = np.clip(rental_train["price"], 0, 15000)
raw_price_ax = sns.distplot(clipped_prices, bins=333, color='b', hist_kws=dict(alpha=0.5))
raw_price_ax.set(xlabel = "price histogram", title='Price histogram before clean, but put right outliers in one bin')

# Prices after trimming both tails.
histplot(price_clean, name='price', title='Price histogram after clean out front and end 0.5%')

# Coordinates restricted to the lat/lon bounding box.
for coord in ('latitude', 'longitude'):
    histplot(lat_lon_clean[coord], coord)

In [None]:
# 2.Plot hour-wise listing trend and find out the top 5 busiest hours of postings. 
rental_train['created'] = pd.to_datetime(rental_train['created'])
rental_train['created_hour'] = rental_train['created'].dt.hour

# kde_flag has to be passed by keyword: the third positional parameter of
# histplot is `title`, so a bare False would become the title.
histplot(rental_train['created_hour'], 'hour', kde_flag=False)

# Tally postings per hour. Slot h-1 holds the count for hour h, so hour 0
# wraps around to the last slot (index -1 == 23). Counting once per distinct
# hour replaces the earlier per-row iloc loop and gives identical totals.
counts = np.zeros(24)
for hour, n_postings in rental_train['created_hour'].value_counts().items():
    counts[hour - 1] += n_postings
counts = pd.Series(counts)

In [None]:
# Relabel the 24 slots as 1..24 before ranking them.
n_slots = len(counts)
counts.index = np.arange(1, n_slots+1)
print("the top 5 busiest hours of postings:\ntime counts")
top_five_hours = counts.nlargest(5)
print(top_five_hours)

In [None]:
# 3.Visualization to show the proportion of target variable values. 
rental_train_group = rental_train.groupby('interest_level')

# number of listings per interest level, as a one-column frame for the pie chart
level_sizes = rental_train_group.size()
interests = level_sizes.to_frame(name='count')
interests.plot.pie(y='count', figsize=(8, 10), fontsize=20, autopct='%1.1f%%')
plt.title('PieChart of Interest Level', fontsize=20)

# same counts as a bar chart, one bar per interest level
plt.figure(figsize=(6, 4))
index = interests.index.tolist()
value = level_sizes.tolist()
y_pos = np.arange(len(index))
plt.bar(y_pos, value)
plt.xticks(y_pos, index, fontsize=15)
plt.ylabel('counts')
plt.title('Barplot of Interest Level', fontsize=20)

In [None]:
#*4.Extract prices of each interest level and visualize them accordingly

# before excluding price outliers
print("Price before excluding price outliers:\n",rental_train_group.agg({'price':['min','max','mean']}),"\n")

# after cleaning: keep rows strictly between the 0.5th and 99.5th price percentiles.
# The bounds are truncated with the built-in int(); the np.int alias used
# previously was deprecated in NumPy 1.20 and removed in 1.24.
price_front_percentile = np.percentile(rental_train.price, 0.5)
price_end_percentile = np.percentile(rental_train.price, 99.5)
price_floor = int(price_front_percentile)
price_ceiling = int(price_end_percentile)

# NOTE: here price_clean is the full filtered DataFrame (all columns), because
# interest_level is needed for the grouping below.
price_clean = rental_train[(rental_train['price'] < price_ceiling) & (rental_train['price'] > price_floor)]
print("Price after excluding price outliers:\n",price_clean.groupby('interest_level').agg({'price':['min','max','mean']}))

# separated histograms of prices after cleaning
high = price_clean[price_clean['interest_level']=='high']
low = price_clean[price_clean['interest_level']=='low']
medium = price_clean[price_clean['interest_level']=='medium']

# the three histograms are overlaid on the same axes; alpha differs per level
sns.distplot(high['price'], color='b', kde=False, hist_kws=dict(alpha=1))
sns.distplot(low['price'], color='g', kde=False, hist_kws=dict(alpha=0.1))
sns.distplot(medium['price'], color='r', kde=False, hist_kws=dict(alpha=0.3)).legend(['high','low','medium'])

# plt.figure(figsize = (6,4))
# sns.boxplot(x='interest_level', y='price', data=rental_train)

In [None]:
# Visualize each record on a google map of NYC
import gmaps

# The key is read from a local file so it never appears in the notebook.
# strip() removes the trailing newline that readline() keeps; the `with`
# block closes the file, so no explicit close is needed.
with open('API_key.txt') as f:
    api_key = f.readline().strip()

gmaps_key = googlemaps.Client(key=api_key)

# keep only the coordinate columns of the bounding-box-filtered listings
lat_lon_clean = lat_lon_clean[['latitude', 'longitude']]

# plot a reproducible 5% sample of the listings
sample_coordinates = lat_lon_clean.sample(frac=0.05, replace=False, random_state=1)

gmaps.configure(api_key=api_key)

lat_lon_layer = gmaps.symbol_layer(
    sample_coordinates, fill_color='green', stroke_color='green', scale=2
)
fig = gmaps.figure()
fig.add_layer(lat_lon_layer)
fig


# <b> Dealing with missing values, outliers</b>
*    Find out the number of missing values in each variable. 
*    Find out the number of outliers in each variable. Plot visualizations to
    demonstrate them. You can either remove the outliers or provide a
    short argument as to why outlier detection is not meaningful for that attribute. 
*    Can we safely drop the missing values? If not, how will you deal with them? 


In [None]:
'''
    1.Find out the number of missing values in each variable.
    2.Find out the number of outliers in each variable. 
      Plot visualizations to demonstrate them.
      You can either remove the outliers or provide a short argument as to why outlier detection is not meaningful for that attribute.
    3.Can we safely drop the missing values? If not, how will you deal with them?
'''

# zero bath/bedroom value would not be considered as "missing", 
# and possible outliers are reasonable therefore not dropped
bath_bed_missing = rental_train[(rental_train['bathrooms'] == 0.0) | (rental_train['bedrooms'] == 0.0)]
print("Number of missing values in either zero bathrooms or zero bedrooms:", len(bath_bed_missing))

# missing value of buildingID would not be dropped, as they can be recovered, or they are somewhat irrelevant 
# (and no numerical outliers)
buildingID_missing = rental_train[rental_train['building_id'] == "0"]
print("Number of missing values in buildingID:", len(buildingID_missing))

# created_hour: possible outliers are reasonable therefore not dropped.
# Missing entries must be detected with isna(): an element-wise `== None`
# comparison is False for every row and would always report zero.
created_hour_missing = rental_train[rental_train['created_hour'].isna()]
print("Number of missing values in created_hour:", len(created_hour_missing))

# outliers & missing values in lat/longitude could be fixed by street address therefore not dropped
lat_lon_missing = rental_train[(rental_train['latitude']==0.0) | (rental_train['longitude']==0.0)]
print("Number of missing values in either zero latitude or zero longitude:", len(lat_lon_missing))

#sns.scatterplot(x='latitude', y='longitude', data=rental_train).set(title='lat/longitude scatterplot')
plt.scatter(rental_train['latitude'], rental_train['longitude'],marker='.')
plt.title('lat/longitude scatterplot')

# ManagerID: a value of '0' is counted as missing
# (and no numerical outliers)
# (considering to extract names from these IDs)
managerID_missing = rental_train[rental_train['manager_id'] == '0']
print("Number of missing values in managerID:", len(managerID_missing))
#print(rental_train.groupby('manager_id').count())

# price: a value of 0 is counted as missing; 
# the outliers could be dropped as intuitively, they would contribute very little to the classifier
price_missing = rental_train[rental_train['price'] == 0]
print("Number of missing values in price:", len(price_missing))
print("Number of price outliers:", numOfRow-len(price_clean))

# Other missing values(and possible outliers) in text (ie, address, description) would be further processed
# so that they could be decided to drop or not. 
# (Mostly not, as shown in the next step that no value would not be considered as missing)



In [None]:
# Following are the statistical(according to Z-score) outliers of lat, lon, and price 
# findOutlierIndex returns the np.where tuple of *positions*, so rows are
# selected with .iloc (the .ix indexer was removed in pandas 1.0).
for column, label, threshold in (
    ('latitude', 'Latitude', 1.5),
    ('longitude', 'Longitude', 1.5),
    ('price', 'Price', 0.5),
):
    Outlier_indexlist = findOutlierIndex(rental_train[column], threshold)
    print("\n{} outliers (threshold {}):\ntotal number: ".format(label, threshold), len(Outlier_indexlist[0]), "\nIndex     value")
    print(rental_train.iloc[Outlier_indexlist[0]][column])

# After the loop Outlier_indexlist holds the price outliers; show their index labels.
rental_train.iloc[Outlier_indexlist[0]].index

In [None]:
# Outlier_indexlist holds row *positions* (output of np.where), so they are
# translated to index labels positionally before dropping; the .ix indexer
# used previously was removed in pandas 1.0.
# NOTE: run this cell once per outlier computation - the positions refer to
# rental_train as it was before the drop.
rental_train = rental_train.drop(rental_train.index[Outlier_indexlist[0]])

# <b> Feature extraction from images and text</b>
*  Extract features from the images and transform it into data that’s ready to be
    used in the model for classification.
*  Extract features from the text data and transform it into data that’s ready to be
    used in the model for classification. 


In [None]:
# 1.Extract features from the images and transform it into data that’s ready to be used in the model for classification.

# Get number of photos of each rental posting 
def count_num_photos(photo_list):
    """Return how many entries ``photo_list`` contains."""
    n_photos = len(photo_list)
    return n_photos

# one photo count per listing, stored next to the raw photo list
photo_counts = rental_train['photos'].map(count_num_photos)
rental_train['num_photos'] = photo_counts
rental_train.head(5)

# Other approaches are in Logo_extraction.ipynb and are briefly discussed in the report.

In [None]:
# 2.Extract features from the text data and transform it into data that’s ready to be used in the model for classification. 

# normalise every feature tag of every listing (see clean_text)
data['features'] = data['features'].apply(clean_text)

ps = PorterStemmer()
def porter_stemmer(list_t):
    """Return the Porter stem of every string in ``list_t``, in order."""
    return [ps.stem(token) for token in list_t]

data['stemmed_features'] = data['features'].apply(porter_stemmer)

# Count how many times each stem occurs over all listings.
# The column is iterated directly instead of data["stemmed_features"][i]:
# an integer key on a Series is a label lookup, so the indexed form only
# works when the index is 0..n-1 (or through deprecated positional fallback).
dic = {}
for stems in data['stemmed_features']:
    for stem in stems:
        dic[stem] = dic.get(stem, 0) + 1

# keep only the stems that occur more than 5000 times
dic = {key:val for key, val in dic.items() if val > 5000}
values = dic.values()

In [None]:
#plt.boxplot(values)

In [None]:
keys = list(dic.keys())

# Add one 0/1 column per frequent stem: 1 when the listing's stemmed features
# contain that stem, 0 otherwise.
# Each column is assigned whole. The earlier data[i][index] = 1 form was
# chained assignment, which may write to a temporary copy and also used a
# positional counter as an index label.
for key in keys:
    # key=key binds the current stem into the lambda
    data[key] = data['stemmed_features'].apply(lambda stems, key=key: int(key in stems))

In [None]:
# features extracted from text, stored as binary values in the last 15 columns 
# NOTE(review): the number of indicator columns equals the number of stems that
# pass the frequency filter - confirm it is still 15 for the current data.
data

In [None]:
# all column labels of `data` (axis 1), taken as a full slice of the column Index
features = data.columns[0:]