## Data Import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries in the input directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import json
import warnings
from collections import Counter
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
warnings.filterwarnings('ignore')

In [None]:
# Raw listings as read from disk; kept untouched so later cells can still
# reach the original columns.
df_original = pd.read_json('../input/train.json')
# Working copy indexed by listing_id. set_index moves the column into the
# index in one step (the positional `axis` form of drop() is rejected by
# pandas >= 2.0).
df = df_original.copy().set_index('listing_id')
# Encode the target as an ordered integer: low = -1, medium = 0, high = 1.
df['interest_level'] = df['interest_level'].map({'medium':0, 'low' : -1, 'high': 1})
df.head()

## Data Cleaning
-  __Drop: created, display_address, street_address, latitude, longitude__


In [None]:
# Remove the timestamp, address and coordinate columns from the working frame.
# `columns=` is used because pandas >= 2.0 no longer accepts `axis` positionally.
df = df.drop(columns=['created', 'display_address', 'street_address', 'latitude', 'longitude'])
df.head()

- __latitude, longitude, price, bedrooms, description__

In [None]:
def price_beds(df):
    '''Replace ``bedrooms`` and ``price`` with a standardized price-per-bedroom column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing numeric ``bedrooms`` and ``price`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``bedrooms`` and ``price`` and with a trailing
        ``price/bedrooms`` column scaled by ``preprocessing.scale``.
        The frame passed in is not modified.
    '''
    # Bedrooms are shifted by one before dividing -- presumably so that
    # listings with 0 bedrooms do not divide by zero.
    beds = np.array(df['bedrooms']) + 1
    price = np.array(df['price'])
    # drop() returns a new frame, so adding the column below does not touch
    # the caller's data. `columns=` is required by pandas >= 2.0.
    out = df.drop(columns=['bedrooms', 'price'])
    out['price/bedrooms'] = preprocessing.scale(price / beds)
    return out

def des_length(df):
    '''Replace ``description`` with a standardized description-length column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``description`` column whose values support ``len``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``description`` and with a trailing ``des_length``
        column (character count scaled by ``preprocessing.scale``).
        The frame passed in is not modified.
    '''
    lengths = df['description'].map(len)
    # drop() returns a new frame; `columns=` is required by pandas >= 2.0.
    out = df.drop(columns='description')
    out['des_length'] = preprocessing.scale(lengths)
    return out

In [None]:
# Run the two feature builders in sequence: description length first,
# then price per bedroom.
df = df.pipe(des_length).pipe(price_beds)
df.head()

- __photos__

In [None]:
# Keep only the number of photos attached to each listing.
df['Photo Number'] = df['photos'].apply(len)
# `columns=` is used because pandas >= 2.0 no longer accepts `axis` positionally.
df = df.drop(columns='photos')
df.head()

- __features__

In [None]:
import string
# Translation table that deletes every character in string.punctuation.
table = str.maketrans('','',string.punctuation)
# For each listing: upper-case every feature and strip punctuation
# ('Features List'), then join the cleaned features with single spaces
# ('Features String').
df['Features List'] = df['features'].apply(lambda y : [x.upper().translate(table) for x in y])
df['Features String'] = df['Features List'].apply(' '.join)
# `columns=` is used because pandas >= 2.0 no longer accepts `axis` positionally.
df = df.drop(columns='features')
df.head()

In [None]:
# Create one zero-filled placeholder column for each of the most frequent
# cleaned feature strings.
N_TOP_FEATURES = 24

# Every cleaned feature of every listing, in row order.
biglist = [w for feats in df['Features List'] for w in feats]
# Counter tallies in a single pass and keeps first-seen order, so `bigdic`
# and `feature` match what a manual "append if unseen, then count" scan
# would produce, without the quadratic list searches.
bigdic = dict(Counter(biglist))
feature = list(bigdic)

df_feature_count = pd.DataFrame(bigdic, index = ['Count']).T
add_ones = list(df_feature_count.sort_values(by = ['Count'],ascending = False)[:N_TOP_FEATURES].index)
for i in add_ones:
    # Column name is the feature text with spaces turned into underscores.
    df['f_' + '_'.join(i.split(' '))] = 0

In [None]:
# Replace the zero placeholders with True/False flags.
# The check is a substring test against the listing's merged feature string,
# so a feature also counts as present when it appears inside a longer one.
for i in add_ones:
    column_name = 'f_' + '_'.join(i.split(' '))
    print(column_name)
    # Iterate the column directly; looking each row up through df.iloc
    # builds a throwaway Series per row and is far slower.
    df[column_name] = [i in merged for merged in df['Features String']]

In [None]:
# The helper text columns are no longer needed once the flags exist.
# `columns=` is used because pandas >= 2.0 no longer accepts `axis` positionally.
df = df.drop(columns=['Features List', 'Features String'])
# Move the target to the last column, keeping the other columns in order.
all_features = df.columns.tolist()
all_features.remove('interest_level')
all_features = all_features + ['interest_level']
df = df[all_features]
df.head()

- __bathrooms, features normalization__

In [None]:
# Standardize each feature-flag column plus bathrooms and photo count,
# one column at a time.
flag_columns = ['f_' + name.replace(' ', '_') for name in add_ones]
norm_list = flag_columns + ['bathrooms', 'Photo Number']
for column in norm_list:
    df[column] = preprocessing.scale(df[column])

In [None]:
# Preview the first rows after scaling.
df.head()

- __manager expected level, building expected level (computed after the train/test split)__

In [None]:
def _expected_level(x_train, x_test, id_col, fallback):
    '''Return (train, test) Series holding the mean training label of each row's ``id_col`` group.'''
    group_means = x_train.groupby(id_col)['interest_level'].mean()
    train_level = x_train[id_col].map(group_means)
    # Ids that never occur in the training rows have no group mean; use the fallback.
    test_level = x_test[id_col].map(group_means).fillna(fallback)
    return train_level, test_level


def manager_building_level(x_train, x_test):
    '''Add mean-target encodings for manager and building, then drop the raw columns.

    For every ``manager_id`` (and, separately, ``building_id``) the mean
    ``interest_level`` over the training rows is computed. Training rows get
    their own group's mean; test rows get the mean of the matching training
    group, or the overall training mean when the id does not occur in training.

    NOTE(review): a training row's encoding includes that row's own label.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Frames containing ``manager_id``, ``building_id`` and ``interest_level``.
        Neither frame is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of the train and test frames with trailing ``manager_level`` and
        ``building_level`` columns and without ``manager_id``,
        ``interest_level`` and ``building_id``.
    '''
    # Computed once instead of once per test row.
    overall_mean = x_train['interest_level'].mean()

    m_train, m_test = _expected_level(x_train, x_test, 'manager_id', overall_mean)
    b_train, b_test = _expected_level(x_train, x_test, 'building_id', overall_mean)

    # assign() returns new frames, so the caller's data is left untouched.
    x_train = x_train.assign(manager_level=m_train, building_level=b_train)
    x_test = x_test.assign(manager_level=m_test, building_level=b_test)

    # `columns=` is required: pandas >= 2.0 rejects a positional `axis`.
    raw_columns = ['manager_id', 'interest_level', 'building_id']
    return x_train.drop(columns=raw_columns), x_test.drop(columns=raw_columns)

- __Train Test Split__

In [None]:
from sklearn.model_selection import train_test_split
# x deliberately still contains 'interest_level': manager_building_level reads
# it to build the manager/building encodings and then drops it.
x = df
y = df['interest_level']
# Hold out 33% of the rows. No random_state is passed here.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

# Add manager_level / building_level and drop the id and target columns.
x_train, x_test = manager_building_level(x_train, x_test) 

In [None]:
# Preview the first rows of the training features.
x_train.head()

## Model Selection

In [None]:
#Model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import auc
from sklearn import metrics
from sklearn.model_selection import KFold

### Logistic Regression

In [None]:
# Tune the regularization strength C by 3-fold cross-validated log loss.
# parameters holds [best mean log loss so far, C that achieved it].
parameters = [float('inf'), None]
# Mean log loss per candidate, in the same order as c_range.
scores = []
c_range = [0.0001, 0.001, 0.0025, 0.005, 0.01, 0.05, 0.1, 1]
# One seeded splitter shared by every candidate, so all values of C are
# scored on the same folds and the search is repeatable.
kf = KFold(n_splits = 3, shuffle = True, random_state = 0)
for c in c_range:
    score = []
    for train_index, test_index in kf.split(x):
        x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # The encodings are rebuilt inside each fold from that fold's training rows.
        x_train, x_test = manager_building_level(x_train, x_test)

        lr = LogisticRegression(C = c, multi_class='multinomial', solver='lbfgs').fit(x_train, y_train)
        pred_proba = lr.predict_proba(x_test)
        score.append(log_loss(y_test, pred_proba))

    mean_score = sum(score)/len(score)
    scores.append(mean_score)
    if mean_score < parameters[0]:
        parameters = [mean_score, c]

In [None]:
# Plot mean log loss against C. The C values are placed at evenly spaced
# positions and used as tick labels.
positions = np.arange(len(c_range))
# The annotation is taken from the computed scores, so it always points at
# whichever C actually achieved the lowest mean log loss.
best_idx = int(np.argmin(scores))
plt.plot(positions, scores)
plt.xticks(positions, c_range)
plt.xlabel("C")
plt.ylabel("Log loss")
plt.title("Tuning the Regularization for Logistic Regression")
# The label is offset in points rather than data units so it stays visible
# whatever the range of the scores.
plt.annotate('Best C=%s' % c_range[best_idx], xy=(best_idx, scores[best_idx]),
             xytext=(40, 40), textcoords='offset points',
             arrowprops=dict(facecolor='red'))
plt.show()

### K Nearest Neighbor(KNN) Classifier

In [None]:
# Zero-filled results table: one row per metric / weighting variant (the
# '_w' suffix marks the distance-weighted rows), one column per k.
knn_variants = ['euclidean','manhattan','minkowski','euclidean_w','manhattan_w','minkowski_w']
k_values = [50, 100, 150, 200, 250, 300, 350]
result_log = pd.DataFrame(
    np.zeros((len(knn_variants), len(k_values))),
    index=knn_variants,
    columns=k_values,
)

In [None]:
# Fill result_log with the 3-fold mean log loss for every combination of
# k, distance metric and weighting scheme.
# One seeded splitter shared by all combinations, so they are scored on the
# same folds and the search is repeatable.
# NOTE: KNeighborsClassifier defaults to p=2, for which 'minkowski' is the
# Euclidean distance, so the minkowski rows should mirror the euclidean ones.
kf = KFold(n_splits = 3, shuffle = True, random_state = 0)
for i in [50, 100, 150, 200, 250, 300, 350]:
    print(i)
    for w in ['euclidean','manhattan','minkowski']:

        score = []
        score_w = []
        for train_index, test_index in kf.split(x):
            x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # The encodings are rebuilt inside each fold from that fold's training rows.
            x_train, x_test = manager_building_level(x_train, x_test)

            KNN = KNeighborsClassifier(n_neighbors  = i, weights = 'uniform', metric = w).fit(x_train, y_train)
            KNN_w = KNeighborsClassifier(n_neighbors  = i, weights = 'distance', metric = w).fit(x_train, y_train)
            pred_proba_KNN = KNN.predict_proba(x_test)
            pred_proba_KNN_w = KNN_w.predict_proba(x_test)

            score.append(log_loss(y_test, pred_proba_KNN))
            score_w.append(log_loss(y_test, pred_proba_KNN_w))

        # Average once, after all folds have been scored.
        mean_score = sum(score)/len(score)
        mean_score_w = sum(score_w)/len(score_w)

        result_log.loc[w,i] = mean_score
        result_log.loc[w+'_w',i] = mean_score_w

In [None]:
# Display the table of mean log losses (rows: metric / weighting, columns: k).
result_log

In [None]:
# One line per metric / weighting variant: mean log loss as a function of k.
fig = plt.figure(figsize = (10, 8))
k_axis = [50, 100, 150, 200, 250, 300, 350]
for variant in result_log.index:
    losses = result_log.loc[variant,:]
    plt.plot(k_axis, losses, label = variant)
plt.legend()
plt.title('Log Loss',fontdict = {'fontsize': 15})
plt.xlabel('K')
plt.ylabel('Log Loss')

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Candidate values for the random-forest grid search; each list feeds the
# RandomForestClassifier argument of the same name (without the trailing underscore).
n_estimators_ =  [200, 400, 600]
max_depth_ =  [7,8,9]
min_samples_split_ = [100, 150, 200]
min_samples_leaf_ = [25, 50, 75]

In [None]:
# Exhaustive grid search over the four random-forest settings, scored by
# 3-fold mean log loss. `result` is filled in grid order: n_estimators
# outermost, then max_depth, min_samples_split, and min_samples_leaf
# changing fastest.
result = []
# One seeded splitter shared by all combinations, so they are scored on the
# same folds and the search is repeatable.
kf = KFold(n_splits = 3, shuffle = True, random_state = 0)
for est in n_estimators_:
    print(est)
    # product() iterates with its last argument changing fastest, which is
    # the same order as three nested loops.
    for dep, spl, lea in product(max_depth_, min_samples_split_, min_samples_leaf_):

        score = []
        for train_index, test_index in kf.split(x):
            x_train, x_test = x.iloc[train_index, :], x.iloc[test_index, :]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            # The encodings are rebuilt inside each fold from that fold's training rows.
            x_train, x_test = manager_building_level(x_train, x_test)

            RF = RandomForestClassifier(n_estimators=est, max_depth = dep, min_samples_split = spl, min_samples_leaf = lea, criterion = 'entropy').fit(x_train, y_train)

            pred_proba_RF = RF.predict_proba(x_test)
            score.append(log_loss(y_test, pred_proba_RF))
        mean_score = sum(score)/len(score)
        result.append(mean_score)

In [None]:
# Arrange the flat grid-search results as a table:
# rows are "n_estimators_max_depth", columns are "min_samples_split_min_samples_leaf".
# The labels and the shape are derived from the grid lists themselves, so the
# table cannot fall out of step with the grid that produced `result`.
index = ['%s_%s' % (est, dep) for est in n_estimators_ for dep in max_depth_]
columns = ['%s_%s' % (spl, lea) for spl in min_samples_split_ for lea in min_samples_leaf_]
aray_log = np.array(result).reshape(len(index), len(columns))
df_log = pd.DataFrame(np.array(result).reshape(len(index), len(columns)))
df_log.index = index
df_log.columns = columns
df_log

In [None]:
# Heatmap of the random-forest grid-search log losses, with each cell's
# value (rounded to 4 decimals) written on top of it.
fig = plt.figure(figsize=(20,10))
ax1 = fig.add_subplot(1, 2, 1)
im1 = ax1.imshow(aray_log)
cbar = fig.colorbar(im1, ax=ax1)

split_leaf_labels = ['100_25','100_50','100_75','150_25','150_50','150_75','200_25','200_50','200_75']
est_depth_labels = ['200_7','200_8','200_9','400_7','400_8','400_9','600_7','600_8','600_9']
ax1.set_xticks(np.arange(9))
ax1.set_yticks(np.arange(9))
ax1.set_xticklabels(split_leaf_labels, fontsize=9)
ax1.set_yticklabels(est_depth_labels, fontsize=13)

# imshow puts the array's row index on the y axis and its column index on the x axis.
for (row, col), loss in np.ndenumerate(aray_log):
    ax1.text(col, row, round(loss, 4), ha="center", va="center", color="w")

ax1.set_xlabel('min_split/min_leaf', fontsize=16)
ax1.set_ylabel('n_estimaters/max depth', fontsize=16)
ax1.set_title('Log Loss for Random Forest Grid Search', fontsize=16)

In [None]:
# Fresh hold-out split, then fit each model with its chosen settings and
# report log loss on the test and training rows.
x = df
y = df['interest_level']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

x_train, x_test = manager_building_level(x_train, x_test)

lr = LogisticRegression(C = 0.0025, multi_class='multinomial', solver='lbfgs')
rf = RandomForestClassifier(n_estimators=400,max_depth=9, min_samples_split=100,min_samples_leaf=25,criterion='entropy')
knn = KNeighborsClassifier(n_neighbors  = 130, metric = 'manhattan')

for position, (label, model) in enumerate([('LR', lr), ('RF', rf), ('kNN', knn)]):
    if position:
        # Blank line between the reports of consecutive models.
        print()
    model.fit(x_train, y_train)
    preds = model.predict_proba(x_test)
    print('log_loss of %s (test): %.4f' % (label, log_loss(y_test, preds)))
    preds_train = model.predict_proba(x_train)
    print('log_loss of %s (train): %.4f' % (label, log_loss(y_train, preds_train)))

## Graphs at Data Cleaning

In [None]:
# Bar chart of how many listings fall in each interest level, with each
# bar labelled by its share of all listings.
fig = plt.figure(figsize = (8, 6))

low, medium, high = [sum(df_original['interest_level'] == level) for level in ('low', 'medium', 'high')]
count = [low, medium, high]
total = sum(count)
bars = plt.bar([1,2,3], count, width = 0.4)
for bar, n_listings in zip(bars, count):
    share = n_listings/total
    # The label is centred on the bar, slightly below its top edge.
    plt.gca().text(bar.get_x() + bar.get_width()/2, bar.get_height() -1500, f'{share*100:.2f}%',
             ha='center', color='w', fontsize=13)
plt.xticks([1,2,3], ['Low','Medium','High'], fontsize = 13)
plt.title('Distribution of Target Variable', fontsize = 15)

In [None]:
# Grouped bar chart: interest-level counts for each creation month (4, 5, 6),
# with each bar labelled by its share within that month.
fig = plt.figure(figsize = (10, 8))

df_original['created'] = pd.to_datetime(df_original['created'], format = '%Y-%m-%d %H:%M:%S')

barwidth = 0.3
for i in [4,5,6]:

    # Interest levels of the listings created in month i.
    month_levels = df_original[df_original['created'].dt.month == i]['interest_level']
    low, medium, high = [sum(month_levels == level) for level in ('low', 'medium', 'high')]
    count = [low,medium,high]
    total = sum(count)
    # Shift each month's bars sideways so the three months sit next to each other.
    bars = plt.bar([w + (i-5)*barwidth for w in [1,2,3]],count,width = barwidth, label = '2016.'+str(i))
    for bar, n_listings in zip(bars, count):
        share = n_listings/total
        plt.gca().text(bar.get_x() + bar.get_width()/2, bar.get_height() - 400, f'{share*100:.2f}%',
                 ha='center', color='w', fontsize=12)


plt.xticks([1,2,3], ['Low','Medium','High'])
plt.legend()
plt.title('Distribution by Created Time', fontsize = 16)


In [None]:
def lat_long(l):
    '''Keep only rows inside a fixed latitude/longitude box and standardize both columns.

    Parameters
    ----------
    l : pandas.DataFrame
        Frame containing numeric ``latitude`` and ``longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows with 40.55 < latitude < 40.95 and
        -74.05 < longitude < -73.80 (bounds exclusive), whose ``latitude``
        and ``longitude`` have been scaled by ``preprocessing.scale``.
        The frame passed in is not modified.
    '''
    inside_box = (
        (l['latitude'] < 40.95) & (l['latitude'] > 40.55)
        & (l['longitude'] < -73.80) & (l['longitude'] > -74.05)
    )
    # Explicit copy: the scaled columns are written into an independent frame
    # instead of into a slice of the caller's data.
    l = l.loc[inside_box, :].copy()
    l['latitude'] = preprocessing.scale(l['latitude'])
    l['longitude'] = preprocessing.scale(l['longitude'])
    return l

# Scatter the (scaled) listing coordinates in three side-by-side panels,
# one per interest level.
# NOTE: df_original is overwritten with the filtered, scaled frame, so this
# cell is meant to run once per kernel session; a second run would apply the
# coordinate bounds to the already-scaled values.
df_original = lat_long(df_original)

high, medium, low = [
    df_original.loc[df_original['interest_level']==level, :]
    for level in ('high', 'medium', 'low')
]

plt.figure(figsize=(17,3.5))

panels = [
    (131, high, 'blue', 'high'),
    (132, medium, 'red', 'medium'),
    (133, low, 'orange', 'low'),
]
for position, subset, colour, level in panels:
    plt.subplot(position)
    plt.scatter(subset['longitude'], subset['latitude'], 5, c=colour, label=level)
    plt.title(level)

plt.show()

In [None]:
# Rebuild the cleaned feature text on the raw frame and join all listings
# into one long string for the word cloud. Uses the punctuation-stripping
# `table` created in the features cell.
df_original['Features List'] = df_original['features'].apply(
    lambda feats: [feat.upper().translate(table) for feat in feats]
)
df_original['Features String'] = df_original['Features List'].apply(' '.join)
biglist = ' '.join(df_original['Features String'])

from wordcloud import WordCloud

wordcloud = WordCloud(background_color='white', collocations=True)
wordcloud = wordcloud.generate(biglist)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Key Words in Features', fontsize=16, y=1.05)
plt.show()