In [1]:
from __future__ import print_function
import json
import zipfile
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy import ndimage
import descartes
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn import metrics
from math import cos, asin, sqrt, pi
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# import geopandas as gpd
import seaborn as sns
import re
import sys
import seaborn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.preprocessing import MultiLabelBinarizer
porter = PorterStemmer()
lancaster=LancasterStemmer()

sns.set_style('darkgrid')
from shapely.geometry import Point, Polygon

%matplotlib inline



def clean_text(list_t):
    temp = []
    for text in list_t:
        # remove backslash-apostrophe 
        text = re.sub("\'", "", text) 
        # remove everything except alphabets 
        text = re.sub("[^a-zA-Z]"," ",text) 
        # remove whitespaces 
        text = ' '.join(text.split()) 
        # convert text to lowercase 
        text = text.lower() 
        temp.append(text)
    
    return temp

def clean_description(text):
    text = re.sub('<.*?>', '', text)
    text = re.sub('\w+_\w+', '', text)
    text = re.sub(r'[^\w]', ' ', text)
    text = text.lower()
    text = ' '.join(text.split())
    return text

from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

def findOutlierIndex(data, threshold):
    z = np.abs(stats.zscore(data))
    index_outliers = np.where(z > threshold)
    return index_outliers

In [2]:
# load data from zip file
d = None  
data = None  
with zipfile.ZipFile("train.json.zip", "r") as z:
   for filename in z.namelist():    
      with z.open(filename) as f:  
         data = f.read()  
         d = json.loads(data.decode("utf-8"))
        
data = pd.DataFrame.from_dict(d)

# remove = []

# remove += findOutlierIndex(data['latitude'], 0.95)[0].tolist()

# # data = data.drop(data.ix[Outlier_indexlist].index)
# # print(len(Outlier_indexlist[0]))

# remove += findOutlierIndex(data['longitude'], 0.95)[0].tolist()
# # data = data.drop(data.ix[Outlier_indexlist].index)
# # print(len(Outlier_indexlist[0]))
# remove += findOutlierIndex(data['price'], 0.95)[0].tolist()

# data = data.drop(data.ix[np.unique(remove)].index)
# # # print(len(Outlier_indexlist[0]))

# # data = data[(data['latitude']!=0.0) & (data['longitude']!=0.0)]

# # data = data[data['price'] != 0]

# data = data.reset_index(drop=True)


price_front_percentile = np.percentile(data.price, 1)
price_end_percentile = np.percentile(data.price, 99)
data = data[(data['price'] < np.int(price_end_percentile)) & (data['price'] > np.int(price_front_percentile))]

latitude_front_percentile = np.percentile(data.latitude, 1)
latitude_end_percentile = np.percentile(data.latitude, 99)
data = data[(data['latitude'] < np.float(latitude_end_percentile)) & (data['latitude'] > np.float(latitude_front_percentile))]


longitude_front_percentile = np.percentile(data.longitude, 1)
longitude_end_percentile = np.percentile(data.longitude, 99)
data = data[(data['longitude'] < np.float(longitude_end_percentile)) & (data['longitude'] > np.float(longitude_front_percentile))]






rental_train = data.copy()
data

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level
4,1.0,1,8579a0b0d54db803821a35a4a615e97a,2016-06-16 05:55:27,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,145 Borinquen Place,"[Dining Room, Pre-War, Laundry in Building, Di...",40.7108,7170325,-73.9539,a10db4590843d78c784171a107bdacb4,[https://photos.renthop.com/2/7170325_3bb5ac84...,2400,145 Borinquen Place,medium
6,1.0,2,b8e75fc949a6cd8225b455648a951712,2016-06-01 05:44:33,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,East 44th,"[Doorman, Elevator, Laundry in Building, Dishw...",40.7513,7092344,-73.9722,955db33477af4f40004820b4aed804a0,[https://photos.renthop.com/2/7092344_7663c19a...,3800,230 East 44th,low
9,1.0,2,cd759a988b8f23924b5a2058d5ab2b49,2016-06-14 15:19:59,**FLEX 2 BEDROOM WITH FULL PRESSURIZED WALL**L...,East 56th Street,"[Doorman, Elevator, Laundry in Building, Laund...",40.7575,7158677,-73.9625,c8b10a317b766204f08e613cef4ce7a0,[https://photos.renthop.com/2/7158677_c897a134...,3495,405 East 56th Street,medium
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,medium
15,1.0,0,bfb9405149bfff42a92980b594c28234,2016-06-28 03:50:23,Over-sized Studio w abundant closets. Availabl...,East 34th Street,"[Doorman, Elevator, Fitness Center, Laundry in...",40.7439,7225292,-73.9743,2c3b41f588fbb5234d8a1e885a436cfa,[https://photos.renthop.com/2/7225292_901f1984...,2795,340 East 34th Street,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123998,1.0,0,0,2016-04-02 01:29:32,This studio apartment is located in Hell's Kit...,West 49th Street,"[Fireplace, Pre-War, Dogs Allowed, Cats Allowed]",40.7640,6812513,-73.9917,8b53ccf4338806ab1be3dd0267711649,[https://photos.renthop.com/2/6812513_ff385b2e...,2175,465 West 49th Street,low
124000,1.0,3,92bbbf38baadfde0576fc496bd41749c,2016-04-05 03:58:33,There is 700 square feet of recently renovated...,W 171 Street,"[Elevator, Dishwasher, Hardwood Floors]",40.8433,6824800,-73.9396,a61e21da3ba18c7a3d54cfdcc247e1f8,[https://photos.renthop.com/2/6824800_0682be16...,2800,620 W 171 Street,low
124002,1.0,2,5565db9b7cba3603834c4aa6f2950960,2016-04-02 02:25:31,"2 bedroom apartment with updated kitchen, rece...",Broadway,"[Common Outdoor Space, Cats Allowed, Dogs Allo...",40.8198,6813268,-73.9578,8f90e5e10e8a2d7cf997f016d89230eb,[https://photos.renthop.com/2/6813268_1e6fcc32...,2395,3333 Broadway,medium
124008,1.0,2,3c0574a740154806c18bdf1fddd3d966,2016-04-19 02:47:33,Wonderful Bright Chelsea 2 Bedroom apartment o...,West 21st Street,"[Pre-War, Laundry in Unit, Dishwasher, No Fee,...",40.7448,6892816,-74.0017,c3cd45f4381ac371507090e9ffabea80,[https://photos.renthop.com/2/6892816_1a8d087a...,4195,350 West 21st Street,medium


# <b> Feature extraction from images and text</b>
*  Extract features from the images and transform it into data that’s ready to be
    used in the model for classification.
*  Extract features from the text data and transform it into data that’s ready to be
    used in the model for classification. 


In [3]:
# 1.Extract features from the images and transform it into data that’s ready to be used in the model for classification.

# Get number of photos of each rental posting 
def count_num_photos(photo_list):
    return len(photo_list)

rental_train['num_photos'] = rental_train['photos'].apply(count_num_photos)
# rental_train.head()

# Other approaches are in the Logo_extraction.ipynb and is briefly discussed in report.

In [4]:
rental_train = rental_train.reset_index(drop=True)

In [5]:
# 2.Extract features from the text data and transform it into data that’s ready to be used in the model for classification. 

data = rental_train

data['features'] = data['features'].apply(lambda x : clean_text(x))

ps = PorterStemmer()
def porter_stemmer(list_t):
    temp = []
    for i in list_t:
        temp.append(ps.stem(i))
    return temp

data['stemmed_features'] = data['features'].apply(porter_stemmer)

dic = {}
for i in range(len(data)):
    for j in data["stemmed_features"][i]:
        if j in dic:
            dic[j]+=1
        else:
            dic.setdefault(j,1)
            
dic = {key:val for key, val in dic.items() if val > 20000}
values = dic.values()

In [6]:
keys = list(dic.keys())
keys_for_class = keys

for i in keys:
    data[i] = 0
    
for index,words in enumerate(data['stemmed_features']):
    for i in words:
        if i in keys:
            data[i][index] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [7]:
data['cleaned_description'] = data['description'].apply(clean_description)

for index, plot in enumerate(data['cleaned_description']):
    temp = plot.split()
    temp_list = ''
    for i in temp:
        if i not in stop_words:
            temp_list = temp_list + ' ' + i
    data['cleaned_description'][index] = temp_list


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [8]:
data['cleaned_description_porter'] = ''

def stemSentence_porter(sentence):
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def token_plot_porter(data):
    for index,plot in enumerate(data['cleaned_description']):
        data['cleaned_description_porter'][index] = stemSentence_porter(plot)
    return data

data = token_plot_porter(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [9]:
dic = {}
for i in data['cleaned_description_porter']:
    token_words=word_tokenize(i)
    for j in range(len(token_words)-1):
        temp = token_words[j] + ' ' +token_words[j+1]
        if temp not in dic:
            dic.update({temp:1})
        else:
            dic[temp] += 1

In [10]:
features = list(data.axes[1][:])

dic = dict((k, v) for k, v in dic.items() if v >= 13000 and v < 16000)
dic_filtered = {k: v for k, v in dic.items() if k not in features}
dic_filtered = {k: v for k, v in sorted(dic_filtered.items(), key=lambda item: item[1], reverse = True)}
dic_filtered

{'stainless steel': 15288}

In [11]:
keys = list(dic_filtered.keys())
for i in keys:
    data[i] = 0

In [12]:
for index,sentence in enumerate(data['cleaned_description_porter']):
    token_words=sentence.split(' ')
    for i in range( len(token_words) - 1 ):
        temp = token_words[i] + ' ' + token_words[i+1]
#         print(temp)
        if temp in keys:
            data[temp][index] = 1
data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,interest_level,num_photos,stemmed_features,hardwood floor,dogs allow,cats allow,elev,cleaned_description,cleaned_description_porter,stainless steel
0,1.0,1,8579a0b0d54db803821a35a4a615e97a,2016-06-16 05:55:27,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,145 Borinquen Place,"[dining room, pre war, laundry in building, di...",40.7108,7170325,-73.9539,...,medium,12,"[dining room, pre war, laundry in build, dishw...",1,1,1,0,spacious 1 bedroom 1 bathroom williamsburg ap...,spaciou 1 bedroom 1 bathroom williamsburg apar...,0
1,1.0,2,b8e75fc949a6cd8225b455648a951712,2016-06-01 05:44:33,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,East 44th,"[doorman, elevator, laundry in building, dishw...",40.7513,7092344,-73.9722,...,low,6,"[doorman, elev, laundry in build, dishwash, ha...",1,0,0,1,brand new gut renovated true 2 bedroomfind ho...,brand new gut renov true 2 bedroomfind home ce...,0
2,1.0,2,cd759a988b8f23924b5a2058d5ab2b49,2016-06-14 15:19:59,**FLEX 2 BEDROOM WITH FULL PRESSURIZED WALL**L...,East 56th Street,"[doorman, elevator, laundry in building, laund...",40.7575,7158677,-73.9625,...,medium,6,"[doorman, elev, laundry in build, laundry in u...",1,0,0,1,flex 2 bedroom full pressurized wall looking ...,flex 2 bedroom full pressur wall look perfect ...,1
3,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,...,medium,5,[],0,0,0,0,brand new 3 bedroom 1 5 bath apartmentenjoy f...,brand new 3 bedroom 1 5 bath apartmentenjoy fo...,0
4,1.0,0,bfb9405149bfff42a92980b594c28234,2016-06-28 03:50:23,Over-sized Studio w abundant closets. Availabl...,East 34th Street,"[doorman, elevator, fitness center, laundry in...",40.7439,7225292,-73.9743,...,low,4,"[doorman, elev, fitness cent, laundry in build]",0,0,0,1,sized studio w abundant closets available imm...,size studio w abund closet avail immedi rent s...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46408,1.0,0,0,2016-04-02 01:29:32,This studio apartment is located in Hell's Kit...,West 49th Street,"[fireplace, pre war, dogs allowed, cats allowed]",40.7640,6812513,-73.9917,...,low,1,"[fireplac, pre war, dogs allow, cats allow]",0,1,1,0,studio apartment located hell kitchen apartme...,studio apart locat hell kitchen apart locat vo...,0
46409,1.0,3,92bbbf38baadfde0576fc496bd41749c,2016-04-05 03:58:33,There is 700 square feet of recently renovated...,W 171 Street,"[elevator, dishwasher, hardwood floors]",40.8433,6824800,-73.9396,...,low,5,"[elev, dishwash, hardwood floor]",1,0,0,1,700 square feet recently renovated space beau...,700 squar feet recent renov space beauti one b...,1
46410,1.0,2,5565db9b7cba3603834c4aa6f2950960,2016-04-02 02:25:31,"2 bedroom apartment with updated kitchen, rece...",Broadway,"[common outdoor space, cats allowed, dogs allo...",40.8198,6813268,-73.9578,...,medium,5,"[common outdoor spac, cats allow, dogs allow, ...",0,1,1,1,2 bedroom apartment updated kitchen recently ...,2 bedroom apart updat kitchen recent instal ha...,0
46411,1.0,2,3c0574a740154806c18bdf1fddd3d966,2016-04-19 02:47:33,Wonderful Bright Chelsea 2 Bedroom apartment o...,West 21st Street,"[pre war, laundry in unit, dishwasher, no fee,...",40.7448,6892816,-74.0017,...,medium,5,"[pre war, laundry in unit, dishwash, no fe, ou...",0,0,0,0,wonderful bright chelsea 2 bedroom apartment ...,wonder bright chelsea 2 bedroom apart quiet tr...,0


In [13]:
new_features = keys_for_class + keys
# new_features = keys_for_class
# new_features
# keys
new_features

['hardwood floor', 'dogs allow', 'cats allow', 'elev', 'stainless steel']

In [14]:
# test_data = test_data[test_data['building_id'] != "0"]


# test_data = test_data[test_data['created_hour'] != None]

# remove = []

# remove += findOutlierIndex(test_data['latitude'], 0.99)[0].tolist()

# # data = data.drop(data.ix[Outlier_indexlist].index)
# # print(len(Outlier_indexlist[0]))

# remove += findOutlierIndex(test_data['longitude'], 0.99)[0].tolist()
# # data = data.drop(data.ix[Outlier_indexlist].index)
# # print(len(Outlier_indexlist[0]))
# remove += findOutlierIndex(test_data['price'], 0.99)[0].tolist()

# test_data = test_data.drop(test_data.ix[np.unique(remove)].index)
# # # print(len(Outlier_indexlist[0]))

# # test_data = test_data[(test_data['latitude']!=0.0) & (test_data['longitude']!=0.0)]

# # test_data = test_data[test_data['price'] != 0]

# test_data = test_data.reset_index(drop=True)


In [15]:
subway = pd.read_csv("subway.csv")
def distance_for_twopoints(lat1, lon1, lat2, lon2):
    p = pi/180     #Pi/180
    a = 0.5 - cos((lat2 - lat1) * p)/2 + cos(lat1 * p) * cos(lat2 * p) * (1 - cos((lon2 - lon1) * p)) / 2
    return 12742 * asin(sqrt(a)) #2*R*asin...

vec_distance_for_two = np.vectorize(distance_for_twopoints)

def distance(location):
    distance = vec_distance_for_two(location[0],location[1],subway['Station Latitude'],subway['Station Longitude']);
    min_dis = min(distance)
    return min_dis

In [16]:

# subway["Station Latitude"]
# subway["Station Longitude"]
data['location'] = data[['latitude','longitude']].values.tolist()
data['distance'] = data['location'].apply(distance)



# data

In [17]:
data['room'] = data['bedrooms']+data['bathrooms']
data['room'] = data['room'].replace(0,1)
data['price_over_room'] = data['price']/data['room']

In [191]:
features = data.axes[1][0:]
selected_features = data[['bathrooms','bedrooms','latitude','longitude','price','distance']].axes[1][:].tolist() + new_features
target_added = selected_features.append(features[14])
dt_set = data[selected_features]
# dt_set.interest_level = data.interest_level
# dt_set["interest_level"] = dt_set["interest_level"].apply(lambda x: 0 if x=="low" else 1 if x=="medium" else 2)
dt_set

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,hardwood floor,dogs allow,cats allow,elev,stainless steel,interest_level
0,1.0,1,40.7108,-73.9539,2400,1,1,1,0,0,medium
1,1.0,2,40.7513,-73.9722,3800,1,0,0,1,0,low
2,1.0,2,40.7575,-73.9625,3495,1,0,0,1,1,medium
3,1.5,3,40.7145,-73.9425,3000,0,0,0,0,0,medium
4,1.0,0,40.7439,-73.9743,2795,0,0,0,1,0,low
...,...,...,...,...,...,...,...,...,...,...,...
46408,1.0,0,40.7640,-73.9917,2175,0,1,1,0,0,low
46409,1.0,3,40.8433,-73.9396,2800,1,0,0,1,1,low
46410,1.0,2,40.8198,-73.9578,2395,0,1,1,1,0,medium
46411,1.0,2,40.7448,-74.0017,4195,0,0,0,0,0,medium


In [192]:
x = dt_set.iloc[:,:-1]
y = dt_set.iloc[:,-1]
# x_new = SelectKBest(chi2, k=25).fit_transform(x, y)

# train_x, test_x, train_y, test_y = train_test_split(x, y)

In [193]:
# from sklearn.metrics import classification_report, accuracy_score, make_scorer

# def classification_report_with_accuracy_score(y_true, y_pred):

#     print(classification_report(y_true, y_pred)) # print classification report
#     return accuracy_score(y_true, y_pred) # return accuracy score

# # Nested CV with parameter optimization
# nested_score = cross_val_score(clf, X=x, y=y,scoring=make_scorer(classification_report_with_accuracy_score))
# print(nested_score) 

In [194]:
clf = Pipeline([
  ('normalization', MinMaxScaler(feature_range = (0,1))),
  ('feature_selection', SelectKBest(f_classif, k=len(dt_set.axes[1])-1)),
#   ('classification', RandomForestClassifier()),
  ('logistic regression', LogisticRegression(multi_class = 'multinomial', solver='newton-cg'))
])
#     clf.fit(x, y)
scores = cross_val_score(clf, x, y, cv=5)
# target_names = ['high', 'low', 'medium']
# print(classification_report(y, clf.predict(x), target_names=target_names))

print(scores.mean())
# 0.702626366964721
# 0.7023462874157692

0.7023462874157692


In [22]:
scores

array([0.7066681 , 0.69923516, 0.70128191, 0.70211161, 0.69952596])

In [23]:
# for i in range(1,30):
#     clf = Pipeline([
#       ('normalization', MinMaxScaler(feature_range = (0,1))),
#       ('feature_selection', SelectKBest(f_classif, k=i)),
#     #   ('classification', RandomForestClassifier()),
#       ('logistic regression', LogisticRegression(multi_class = 'multinomial', solver='newton-cg'))
#     ])
#     #     clf.fit(x, y)
#     scores = cross_val_score(clf, x, y, cv=5)

#     print(i,scores.mean())

In [24]:
d = None  
test_data = None  
with zipfile.ZipFile("test.json.zip", "r") as z:
   for filename in z.namelist():    
      with z.open(filename) as f:  
         test_data = f.read()  
         d = json.loads(test_data.decode("utf-8"))
        
test_data = pd.DataFrame.from_dict(d)

In [25]:
test_data['num_photos'] = test_data['photos'].apply(count_num_photos)
test_data['features'] = test_data['features'].apply(lambda x : clean_text(x))

ps = PorterStemmer()
def porter_stemmer(list_t):
    temp = []
    for i in list_t:
        temp.append(ps.stem(i))
    return temp

test_data['stemmed_features'] = test_data['features'].apply(porter_stemmer)

In [26]:
for i in new_features:
    test_data[i] = 0

In [27]:
for index,words in enumerate(test_data['stemmed_features']):
    for i in words:
        if i in keys_for_class:
            test_data[i][index] = 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [28]:
test_data['cleaned_description'] = test_data['description'].apply(clean_description)

for index, plot in enumerate(test_data['cleaned_description']):
    temp = plot.split()
    temp_list = ''
    for i in temp:
        if i not in stop_words:
            temp_list = temp_list + ' ' + i
    test_data['cleaned_description'][index] = temp_list

    
test_data['cleaned_description_porter'] = ''

def stemSentence_porter(sentence):
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def token_plot_porter(data):
    for index,plot in enumerate(data['cleaned_description']):
        data['cleaned_description_porter'][index] = stemSentence_porter(plot)
    return data

test_data = token_plot_porter(test_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [29]:
for index,sentence in enumerate(test_data['cleaned_description_porter']):
    token_words=sentence.split(' ')
    for i in range( len(token_words) - 1 ):
        temp = token_words[i] + ' ' + token_words[i+1]
#         print(temp)
        if temp in keys:
            test_data[temp][index] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [30]:
test_data['location'] = test_data[['latitude','longitude']].values.tolist()
test_data['distance'] = test_data['location'].apply(distance)

In [31]:
test_data['room'] = test_data['bedrooms']+test_data['bathrooms']
test_data['room'] = test_data['room'].replace(0,1)
test_data['price_over_room'] = test_data['price']/test_data['room']

In [32]:
test_data

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,dogs allow,cats allow,elev,stainless steel,cleaned_description,cleaned_description_porter,location,distance,room,price_over_room
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,Suffolk Street,"[elevator, laundry in building, laundry in uni...",40.7185,7142618,-73.9865,...,0,0,1,0,large awesome terrace accessible via bedroom ...,larg awesom terrac access via bedroom live roo...,"[40.7185, -73.9865]",0.081603,2.0,1475.000000
1,1.0,2,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,Thompson Street,"[pre war, dogs allowed, cats allowed]",40.7278,7210040,-74.0000,...,1,1,0,1,prime soho bleecker houston newly renovated s...,prime soho bleecker houston newli renov stainl...,"[40.7278, -74.0]",0.360366,3.0,950.000000
2,1.0,0,0,2016-06-17 01:23:39,Spacious studio in Prime Location. Cleanbuildi...,Sullivan Street,"[pre war, dogs allowed, cats allowed]",40.7260,7174566,-74.0026,...,1,1,0,0,spacious studio prime location cleanbuilding ...,spaciou studio prime locat cleanbuild hand man...,"[40.726, -74.0026]",0.099244,1.0,2295.000000
3,1.0,2,f9c826104b91d868e69bd25746448c0c,2016-06-21 05:06:02,For immediate access call Bryan.<br /><br />Bo...,Jones Street,"[hardwood floors, dogs allowed, cats allowed]",40.7321,7191391,-74.0028,...,1,1,0,0,immediate access call bryan bond new york rea...,immedi access call bryan bond new york real es...,"[40.7321, -74.0028]",0.147271,3.0,966.666667
5,1.0,1,81062936e12ee5fa6cd2b965698e17d5,2016-06-16 07:24:27,Beautiful TRUE 1 bedroom in a luxury building ...,Exchange Place,"[roof deck, doorman, elevator, fitness center,...",40.7054,7171695,-74.0095,...,1,1,1,1,beautiful true 1 bedroom luxury building fina...,beauti true 1 bedroom luxuri build financi dis...,"[40.7054, -74.0095]",0.161565,2.0,1627.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124003,1.0,1,bd863d28a6b119ac3bc72d5f27b07f24,2016-04-26 16:09:55,BRAND NEW TO MARKET 1BDR \r107TH AND LEXINGTON...,150 EAST 107TH STREET,[],40.7925,6928108,-73.9454,...,0,0,0,0,brand new market 1bdr 107th lexington seconds...,brand new market 1bdr 107th lexington second 6...,"[40.7925, -73.9454]",0.274295,2.0,850.000000
124005,1.0,2,9174b75c0cd978eb0e5aa93afbad754b,2016-04-21 05:06:19,Convertible 2BR apartment features a brand new...,E 33rd St.,"[doorman, elevator, laundry in building, dishw...",40.7456,6906674,-73.9797,...,1,1,1,0,convertible 2br apartment features brand new ...,convert 2br apart featur brand new marbl bathr...,"[40.7456, -73.9797]",0.207183,3.0,1398.333333
124006,1.0,0,0,2016-04-20 01:31:52,"Let's get you in to see this $2,400/mo, recent...",Lexington Avenue,"[dogs allowed, cats allowed]",40.7416,6897967,-73.9829,...,1,1,0,1,let get see 2 400 mo recently renovated spaci...,let get see 2 400 mo recent renov spaciou stud...,"[40.7416, -73.9829]",0.199808,1.0,2400.000000
124007,2.0,2,c90c010e5505365676538e64d02aa1e0,2016-04-08 02:26:45,CooperCooper.com :: Web ID #171357; Access 100...,Park Avenue,"[doorman, elevator, cats allowed, dogs allowed]",40.7485,6842183,-73.9800,...,1,1,1,0,coopercooper com web id 171357 access 1000s u...,coopercoop com web id 171357 access 1000 uniqu...,"[40.7485, -73.98]",0.320835,4.0,1723.750000


In [33]:
features = test_data.axes[1][0:]
selected_features = test_data[['bathrooms','bedrooms','latitude','longitude','price','distance']].axes[1][:].tolist() + new_features
# target_added = selected_features.append(features[14])
dt_set_test = test_data[selected_features]
# dt_set.interest_level = data.interest_level
# dt_set_test["interest_level"] = dt_set_test["interest_level"].apply(lambda x: 0 if x=="low" else 1 if x=="medium" else 2)
dt_set_test

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,distance,hardwood floor,dogs allow,cats allow,elev,stainless steel
0,1.0,1,40.7185,-73.9865,2950,0.081603,1,0,0,1,0
1,1.0,2,40.7278,-74.0000,2850,0.360366,0,1,1,0,1
2,1.0,0,40.7260,-74.0026,2295,0.099244,0,1,1,0,0
3,1.0,2,40.7321,-74.0028,2900,0.147271,1,1,1,0,0
5,1.0,1,40.7054,-74.0095,3254,0.161565,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
124003,1.0,1,40.7925,-73.9454,1700,0.274295,0,0,0,0,0
124005,1.0,2,40.7456,-73.9797,4195,0.207183,1,1,1,1,0
124006,1.0,0,40.7416,-73.9829,2400,0.199808,0,1,1,0,1
124007,2.0,2,40.7485,-73.9800,6895,0.320835,0,1,1,1,0


In [181]:
clf = Pipeline([
  ('normalization', MinMaxScaler(feature_range = (0,1))),
  ('feature_selection', SelectKBest(f_classif, k=5)),
#   ('classification', RandomForestClassifier()),
  ('logistic regression', LogisticRegression(multi_class = 'multinomial', solver='newton-cg'))
])
# clf.fit(x, y)

In [124]:
result = clf.predict_proba(dt_set_test)
result

NotFittedError: This MinMaxScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [36]:
submit = {'listing_id':test_data['listing_id'], 'high': result[:,0], 'medium':result[:,2], 'low':result[:,1]}

In [37]:
# pd.DataFrame(submit).to_csv('submission.csv',index = False)
check = pd.DataFrame(submit)
check

Unnamed: 0,listing_id,high,medium,low
0,7142618,0.094008,0.322841,0.583150
1,7210040,0.060479,0.177376,0.762146
2,7174566,0.086461,0.208710,0.704828
3,7191391,0.088425,0.316956,0.594619
5,7171695,0.046119,0.155857,0.798024
...,...,...,...,...
124003,6928108,0.135556,0.250007,0.614437
124005,6906674,0.039640,0.223813,0.736548
124006,6897967,0.080928,0.202684,0.716389
124007,6842183,0.006910,0.067595,0.925495


In [38]:
# dt_set.to_csv("train.csv")
# dt_set_test.to_csv("test.csv")
aaa =clf.predict(dt_set_test)
aaa[aaa=='high'].shape
# np.where(aaa==2)
# y

(6,)

In [39]:
for i in range(x.shape[1]):
    selector = SelectKBest(f_classif, k=i)
    selector.fit(x, y)
    # Get columns to keep and create new dataframe with those only
    cols = selector.get_support(indices=True)
    features_df_new = x.iloc[:,cols]
    print(i,features_df_new.axes[1][:])

0 Index([], dtype='object')
1 Index(['price'], dtype='object')
2 Index(['price', 'hardwood floor'], dtype='object')
3 Index(['price', 'hardwood floor', 'dogs allow'], dtype='object')
4 Index(['price', 'hardwood floor', 'dogs allow', 'cats allow'], dtype='object')
5 Index(['bathrooms', 'price', 'hardwood floor', 'dogs allow', 'cats allow'], dtype='object')
6 Index(['bathrooms', 'bedrooms', 'price', 'hardwood floor', 'dogs allow',
       'cats allow'],
      dtype='object')
7 Index(['bathrooms', 'bedrooms', 'price', 'hardwood floor', 'dogs allow',
       'cats allow', 'stainless steel'],
      dtype='object')
8 Index(['bathrooms', 'bedrooms', 'latitude', 'price', 'hardwood floor',
       'dogs allow', 'cats allow', 'stainless steel'],
      dtype='object')
9 Index(['bathrooms', 'bedrooms', 'latitude', 'price', 'hardwood floor',
       'dogs allow', 'cats allow', 'elev', 'stainless steel'],
      dtype='object')
10 Index(['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price',
       '

In [40]:
# pd.DataFrame(submit).to_csv('submission.csv',index = False)
# x.shape[1]

In [182]:
y = y.apply(lambda x: 0 if x=="low" else 1 if x=="medium" else 2)


In [183]:
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [187]:
from sklearn.metrics import f1_score
y_true = np.array(y_test)
y_scores = np.array(clf.fit(X_train,y_train).predict(X_test))
f1_score(y_true, y_scores, average='micro')

0.7051016890727335