In [260]:
from __future__ import print_function
import json
import zipfile
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy import ndimage
import descartes
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn import metrics
from math import cos, asin, sqrt, pi
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# import geopandas as gpd
import seaborn as sns
import re
import sys
import seaborn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.preprocessing import MultiLabelBinarizer
porter = PorterStemmer()
lancaster=LancasterStemmer()

sns.set_style('darkgrid')
from shapely.geometry import Point, Polygon

%matplotlib inline



def clean_text(list_t):
    temp = []
    for text in list_t:
        # remove backslash-apostrophe 
        text = re.sub("\'", "", text) 
        # remove everything except alphabets 
        text = re.sub("[^a-zA-Z]"," ",text) 
        # remove whitespaces 
        text = ' '.join(text.split()) 
        # convert text to lowercase 
        text = text.lower() 
        temp.append(text)
    
    return temp

def clean_description(text):
    text = re.sub('<.*?>', '', text)
    text = re.sub('\w+_\w+', '', text)
    text = re.sub(r'[^\w]', ' ', text)
    text = text.lower()
    text = ' '.join(text.split())
    return text

from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))



In [2]:
# desc: plot histogram according to the param Series and a name 
def histplot(data, name=None, title=None, kde_flag=True, hist_kws=None):
    plt.figure(figsize = (6,4))
    sns.distplot(data, kde=kde_flag, hist_kws=hist_kws).set(xlabel = name +" histogram", title = title)
    
# desc: use RE to filter (clean) text features for further processing (ie. stemming)  
def clean_text(list_t):
    temp = []
    for text in list_t:
        # remove backslash-apostrophe 
        text = re.sub("\'", "", text) 
        # remove everything except alphabets 
        text = re.sub("[^a-zA-Z]"," ",text) 
        # remove whitespaces 
        text = ' '.join(text.split()) 
        # convert text to lowercase 
        text = text.lower() 
        temp.append(text)
    
    return temp

# desc: find outliers from data according to a threshold, return their indeces
def findOutlierIndex(data, threshold):
    z = np.abs(stats.zscore(data))
    index_outliers = np.where(z > threshold)
    return index_outliers


In [3]:
# load data from zip file
d = None  
data = None  
with zipfile.ZipFile("train.json.zip", "r") as z:
   for filename in z.namelist():    
      with z.open(filename) as f:  
         data = f.read()  
         d = json.loads(data.decode("utf-8"))
        
data = pd.DataFrame.from_dict(d)

data = data[(data['latitude']!=0.0) & (data['longitude']!=0.0)]


data = data[data['price'] != 0]


data = data.reset_index(drop=True)

rental_train = data.copy()

# <b> Feature extraction from images and text</b>
*  Extract features from the images and transform it into data that’s ready to be
    used in the model for classification.
*  Extract features from the text data and transform it into data that’s ready to be
    used in the model for classification. 


In [4]:
# 1.Extract features from the images and transform it into data that’s ready to be used in the model for classification.

# Get number of photos of each rental posting 
def count_num_photos(photo_list):
    return len(photo_list)

rental_train['num_photos'] = rental_train['photos'].apply(count_num_photos)
rental_train.head()

# Other approaches are in the Logo_extraction.ipynb and is briefly discussed in report.

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level,num_photos
0,1.0,1,8579a0b0d54db803821a35a4a615e97a,2016-06-16 05:55:27,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,145 Borinquen Place,"[Dining Room, Pre-War, Laundry in Building, Di...",40.7108,7170325,-73.9539,a10db4590843d78c784171a107bdacb4,[https://photos.renthop.com/2/7170325_3bb5ac84...,2400,145 Borinquen Place,medium,12
1,1.0,2,b8e75fc949a6cd8225b455648a951712,2016-06-01 05:44:33,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,East 44th,"[Doorman, Elevator, Laundry in Building, Dishw...",40.7513,7092344,-73.9722,955db33477af4f40004820b4aed804a0,[https://photos.renthop.com/2/7092344_7663c19a...,3800,230 East 44th,low,6
2,1.0,2,cd759a988b8f23924b5a2058d5ab2b49,2016-06-14 15:19:59,**FLEX 2 BEDROOM WITH FULL PRESSURIZED WALL**L...,East 56th Street,"[Doorman, Elevator, Laundry in Building, Laund...",40.7575,7158677,-73.9625,c8b10a317b766204f08e613cef4ce7a0,[https://photos.renthop.com/2/7158677_c897a134...,3495,405 East 56th Street,medium,6
3,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,medium,5
4,1.0,0,bfb9405149bfff42a92980b594c28234,2016-06-28 03:50:23,Over-sized Studio w abundant closets. Availabl...,East 34th Street,"[Doorman, Elevator, Fitness Center, Laundry in...",40.7439,7225292,-73.9743,2c3b41f588fbb5234d8a1e885a436cfa,[https://photos.renthop.com/2/7225292_901f1984...,2795,340 East 34th Street,low,4


In [5]:
rental_train = rental_train.reset_index(drop=True)

In [6]:
# 2.Extract features from the text data and transform it into data that’s ready to be used in the model for classification. 

data = rental_train

data['features'] = data['features'].apply(lambda x : clean_text(x))

ps = PorterStemmer()
def porter_stemmer(list_t):
    temp = []
    for i in list_t:
        temp.append(ps.stem(i))
    return temp

data['stemmed_features'] = data['features'].apply(porter_stemmer)

dic = {}
for i in range(len(data)):
    for j in data["stemmed_features"][i]:
        if j in dic:
            dic[j]+=1
        else:
            dic.setdefault(j,1)
            
dic = {key:val for key, val in dic.items() if val > 5000}
values = dic.values()

In [7]:
#plt.boxplot(values)

In [8]:
keys = list(dic.keys())
keys_for_class = keys

for i in keys:
    data[i] = 0
    
for index,words in enumerate(data['stemmed_features']):
    for i in words:
        if i in keys:
            data[i][index] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [9]:
# features extracted from text, stored as binary values in the last 15 columns 
data

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,hardwood floor,dogs allow,cats allow,doorman,elev,no fe,laundry in unit,fitness cent,roof deck,outdoor spac
0,1.0,1,8579a0b0d54db803821a35a4a615e97a,2016-06-16 05:55:27,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,145 Borinquen Place,"[dining room, pre war, laundry in building, di...",40.7108,7170325,-73.9539,...,1,1,1,0,0,0,0,0,0,0
1,1.0,2,b8e75fc949a6cd8225b455648a951712,2016-06-01 05:44:33,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,East 44th,"[doorman, elevator, laundry in building, dishw...",40.7513,7092344,-73.9722,...,1,0,0,1,1,1,0,0,0,0
2,1.0,2,cd759a988b8f23924b5a2058d5ab2b49,2016-06-14 15:19:59,**FLEX 2 BEDROOM WITH FULL PRESSURIZED WALL**L...,East 56th Street,"[doorman, elevator, laundry in building, laund...",40.7575,7158677,-73.9625,...,1,0,0,1,1,0,1,0,0,0
3,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0,bfb9405149bfff42a92980b594c28234,2016-06-28 03:50:23,Over-sized Studio w abundant closets. Availabl...,East 34th Street,"[doorman, elevator, fitness center, laundry in...",40.7439,7225292,-73.9743,...,0,0,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49335,1.0,3,92bbbf38baadfde0576fc496bd41749c,2016-04-05 03:58:33,There is 700 square feet of recently renovated...,W 171 Street,"[elevator, dishwasher, hardwood floors]",40.8433,6824800,-73.9396,...,1,0,0,0,1,0,0,0,0,0
49336,1.0,2,5565db9b7cba3603834c4aa6f2950960,2016-04-02 02:25:31,"2 bedroom apartment with updated kitchen, rece...",Broadway,"[common outdoor space, cats allowed, dogs allo...",40.8198,6813268,-73.9578,...,0,1,1,1,1,1,0,0,0,0
49337,1.0,1,67997a128056ee1ed7d046bbb856e3c7,2016-04-26 05:42:03,No Brokers Fee * Never Lived 1 Bedroom 1 Bathr...,210 Brighton 15th St,"[dining room, elevator, pre war, laundry in bu...",40.5765,6927093,-73.9554,...,1,1,1,0,1,1,1,0,0,0
49338,1.0,2,3c0574a740154806c18bdf1fddd3d966,2016-04-19 02:47:33,Wonderful Bright Chelsea 2 Bedroom apartment o...,West 21st Street,"[pre war, laundry in unit, dishwasher, no fee,...",40.7448,6892816,-74.0017,...,0,0,0,0,0,1,1,0,0,1


In [10]:
data['cleaned_description'] = data['description'].apply(clean_description)

for index, plot in enumerate(data['cleaned_description']):
    temp = plot.split()
    temp_list = ''
    for i in temp:
        if i not in stop_words:
            temp_list = temp_list + ' ' + i
    data['cleaned_description'][index] = temp_list


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [11]:
data['cleaned_description_porter'] = ''

def stemSentence_porter(sentence):
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def token_plot_porter(data):
    for index,plot in enumerate(data['cleaned_description']):
        data['cleaned_description_porter'][index] = stemSentence_porter(plot)
    return data

data = token_plot_porter(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [12]:
dic = {}
for i in data['cleaned_description_porter']:
    token_words=word_tokenize(i)
    for j in range(len(token_words)-1):
        temp = token_words[j] + ' ' +token_words[j+1]
        if temp not in dic:
            dic.update({temp:1})
        else:
            dic[temp] += 1

In [13]:
features = list(data.axes[1][:])

dic = dict((k, v) for k, v in dic.items() if v >= 7000 and v <=16000)
dic_filtered = {k: v for k, v in dic.items() if k not in features}
dic_filtered = {k: v for k, v in sorted(dic_filtered.items(), key=lambda item: item[1], reverse = True)}
dic_filtered

{'stainless steel': 15948,
 'steel applianc': 13556,
 'new york': 10404,
 'high ceil': 9336,
 'closet space': 8448,
 'live room': 7825,
 'real estat': 7777,
 'call text': 7710,
 'washer dryer': 7477,
 'apart featur': 7006}

In [14]:
keys = list(dic_filtered.keys())
for i in keys:
    data[i] = 0

In [15]:
for index,sentence in enumerate(data['cleaned_description_porter']):
    token_words=sentence.split(' ')
    for i in range( len(token_words) - 1 ):
        temp = token_words[i] + ' ' + token_words[i+1]
#         print(temp)
        if temp in keys:
            data[temp][index] = 1
data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,stainless steel,steel applianc,new york,high ceil,closet space,live room,real estat,call text,washer dryer,apart featur
0,1.0,1,8579a0b0d54db803821a35a4a615e97a,2016-06-16 05:55:27,Spacious 1 Bedroom 1 Bathroom in Williamsburg!...,145 Borinquen Place,"[dining room, pre war, laundry in building, di...",40.7108,7170325,-73.9539,...,0,0,0,0,1,0,0,0,0,1
1,1.0,2,b8e75fc949a6cd8225b455648a951712,2016-06-01 05:44:33,BRAND NEW GUT RENOVATED TRUE 2 BEDROOMFind you...,East 44th,"[doorman, elevator, laundry in building, dishw...",40.7513,7092344,-73.9722,...,0,0,0,0,0,0,0,1,0,0
2,1.0,2,cd759a988b8f23924b5a2058d5ab2b49,2016-06-14 15:19:59,**FLEX 2 BEDROOM WITH FULL PRESSURIZED WALL**L...,East 56th Street,"[doorman, elevator, laundry in building, laund...",40.7575,7158677,-73.9625,...,1,1,0,0,1,0,0,1,0,0
3,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,...,0,1,0,0,0,0,0,0,1,1
4,1.0,0,bfb9405149bfff42a92980b594c28234,2016-06-28 03:50:23,Over-sized Studio w abundant closets. Availabl...,East 34th Street,"[doorman, elevator, fitness center, laundry in...",40.7439,7225292,-73.9743,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49335,1.0,3,92bbbf38baadfde0576fc496bd41749c,2016-04-05 03:58:33,There is 700 square feet of recently renovated...,W 171 Street,"[elevator, dishwasher, hardwood floors]",40.8433,6824800,-73.9396,...,1,1,1,0,0,0,1,1,0,0
49336,1.0,2,5565db9b7cba3603834c4aa6f2950960,2016-04-02 02:25:31,"2 bedroom apartment with updated kitchen, rece...",Broadway,"[common outdoor space, cats allowed, dogs allo...",40.8198,6813268,-73.9578,...,0,0,1,0,0,0,0,0,0,0
49337,1.0,1,67997a128056ee1ed7d046bbb856e3c7,2016-04-26 05:42:03,No Brokers Fee * Never Lived 1 Bedroom 1 Bathr...,210 Brighton 15th St,"[dining room, elevator, pre war, laundry in bu...",40.5765,6927093,-73.9554,...,1,1,0,0,1,0,0,0,0,1
49338,1.0,2,3c0574a740154806c18bdf1fddd3d966,2016-04-19 02:47:33,Wonderful Bright Chelsea 2 Bedroom apartment o...,West 21st Street,"[pre war, laundry in unit, dishwasher, no fee,...",40.7448,6892816,-74.0017,...,0,0,0,0,0,0,1,0,1,0


In [16]:
new_features = keys_for_class + keys

In [17]:
d = None  
test_data = None  
with zipfile.ZipFile("test.json.zip", "r") as z:
   for filename in z.namelist():    
      with z.open(filename) as f:  
         test_data = f.read()  
         d = json.loads(test_data.decode("utf-8"))
        
test_data = pd.DataFrame.from_dict(d)

In [18]:
# test_data = test_data[test_data['building_id'] != "0"]


# test_data = test_data[test_data['created_hour'] != None]


test_data = test_data[(test_data['latitude']!=0.0) & (test_data['longitude']!=0.0)]


test_data = test_data[test_data['price'] != 0]


test_data = test_data.reset_index(drop=True)

In [19]:
test_data['num_photos'] = test_data['photos'].apply(count_num_photos)
test_data['features'] = test_data['features'].apply(lambda x : clean_text(x))

ps = PorterStemmer()
def porter_stemmer(list_t):
    temp = []
    for i in list_t:
        temp.append(ps.stem(i))
    return temp

test_data['stemmed_features'] = test_data['features'].apply(porter_stemmer)

In [20]:
for i in new_features:
    test_data[i] = 0

In [21]:
for index,words in enumerate(test_data['stemmed_features']):
    for i in words:
        if i in keys_for_class:
            test_data[i][index] = 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [22]:
test_data['cleaned_description'] = test_data['description'].apply(clean_description)

for index, plot in enumerate(test_data['cleaned_description']):
    temp = plot.split()
    temp_list = ''
    for i in temp:
        if i not in stop_words:
            temp_list = temp_list + ' ' + i
    test_data['cleaned_description'][index] = temp_list

    
test_data['cleaned_description_porter'] = ''

def stemSentence_porter(sentence):
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def token_plot_porter(data):
    for index,plot in enumerate(data['cleaned_description']):
        data['cleaned_description_porter'][index] = stemSentence_porter(plot)
    return data

test_data = token_plot_porter(test_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
for index,sentence in enumerate(test_data['cleaned_description_porter']):
    token_words=sentence.split(' ')
    for i in range( len(token_words) - 1 ):
        temp = token_words[i] + ' ' + token_words[i+1]
#         print(temp)
        if temp in keys:
            test_data[temp][index] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [25]:
subway = pd.read_csv("subway.csv")
def distance_for_twopoints(lat1, lon1, lat2, lon2):
    p = pi/180     #Pi/180
    a = 0.5 - cos((lat2 - lat1) * p)/2 + cos(lat1 * p) * cos(lat2 * p) * (1 - cos((lon2 - lon1) * p)) / 2
    return 12742 * asin(sqrt(a)) #2*R*asin...

vec_distance_for_two = np.vectorize(distance_for_twopoints)

def distance(location):
    distance = vec_distance_for_two(location[0],location[1],subway['Station Latitude'],subway['Station Longitude']);
    min_dis = min(distance)
    return min_dis

In [26]:

# subway["Station Latitude"]
# subway["Station Longitude"]
data['location'] = data[['latitude','longitude']].values.tolist()
data['distance'] = data['location'].apply(distance)

test_data['location'] = test_data[['latitude','longitude']].values.tolist()
test_data['distance'] = test_data['location'].apply(distance)

# data

In [179]:
features = data.axes[1][0:]
selected_features = list(features[0:2]) + [features[7] ,features[9] ,features[12] ,features[15]] + keys_for_class + keys
target_added = selected_features.append(features[14])
dt_set = data[selected_features]
# dt_set.interest_level = data.interest_level
dt_set["interest_level"] = dt_set["interest_level"].apply(lambda x: 0 if x=="low" else 1 if x=="medium" else 2)
dt_set

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,num_photos,dining room,pre war,laundry in build,dishwash,...,steel applianc,new york,high ceil,closet space,live room,real estat,call text,washer dryer,apart featur,interest_level
0,1.0,1,40.7108,-73.9539,2400,12,1,1,1,1,...,0,0,0,1,0,0,0,0,1,1
1,1.0,2,40.7513,-73.9722,3800,6,0,0,1,1,...,0,0,0,0,0,0,1,0,0,0
2,1.0,2,40.7575,-73.9625,3495,6,0,0,1,1,...,1,0,0,1,0,0,1,0,0,1
3,1.5,3,40.7145,-73.9425,3000,5,0,0,0,0,...,1,0,0,0,0,0,0,1,1,1
4,1.0,0,40.7439,-73.9743,2795,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49335,1.0,3,40.8433,-73.9396,2800,5,0,0,0,1,...,1,1,0,0,0,1,1,0,0,0
49336,1.0,2,40.8198,-73.9578,2395,5,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
49337,1.0,1,40.5765,-73.9554,1850,3,1,1,1,1,...,1,0,0,1,0,0,0,0,1,1
49338,1.0,2,40.7448,-74.0017,4195,5,0,1,0,1,...,0,0,0,0,0,1,0,1,0,1


In [180]:

# dt_set["bedrooms"] = scaler.fit_transform(np.array(dt_set["bedrooms"]).reshape(-1,1))
# dt_set["latitude"] = scaler.fit_transform(np.array(dt_set["latitude"]).reshape(-1,1))
# dt_set["longitude"] = scaler.fit_transform(np.array(dt_set["longitude"]).reshape(-1,1))
# dt_set["price"] = scaler.fit_transform(np.array(dt_set["price"]).reshape(-1,1))
# dt_set["num_photos"] = scaler.fit_transform(np.array(dt_set["num_photos"]).reshape(-1,1))
dt_set

Unnamed: 0,bathrooms,bedrooms,latitude,longitude,price,num_photos,dining room,pre war,laundry in build,dishwash,...,steel applianc,new york,high ceil,closet space,live room,real estat,call text,washer dryer,apart featur,interest_level
0,1.0,1,40.7108,-73.9539,2400,12,1,1,1,1,...,0,0,0,1,0,0,0,0,1,1
1,1.0,2,40.7513,-73.9722,3800,6,0,0,1,1,...,0,0,0,0,0,0,1,0,0,0
2,1.0,2,40.7575,-73.9625,3495,6,0,0,1,1,...,1,0,0,1,0,0,1,0,0,1
3,1.5,3,40.7145,-73.9425,3000,5,0,0,0,0,...,1,0,0,0,0,0,0,1,1,1
4,1.0,0,40.7439,-73.9743,2795,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49335,1.0,3,40.8433,-73.9396,2800,5,0,0,0,1,...,1,1,0,0,0,1,1,0,0,0
49336,1.0,2,40.8198,-73.9578,2395,5,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
49337,1.0,1,40.5765,-73.9554,1850,3,1,1,1,1,...,1,0,0,1,0,0,0,0,1,1
49338,1.0,2,40.7448,-74.0017,4195,5,0,1,0,1,...,0,0,0,0,0,1,0,1,0,1


In [283]:
x = dt_set.iloc[:,:30]
y = dt_set.iloc[:,30]
# x_new = SelectKBest(chi2, k=25).fit_transform(x, y)

train_x, test_x, train_y, test_y = train_test_split(x, y)

In [284]:
# features = test_data.axes[1][0:]
# selected_features = list(features[0:2]) + [features[7] ,features[9] ,features[12] ,features[14]] + keys_for_class + keys
# target_added = selected_features
# dt_set_test = data[selected_features]
# # dt_set.interest_level = data.interest_level
# dt_set_test

In [285]:
# logistic = linear_model.LogisticRegression()

# pca = decomposition.PCA()

# normalization = MinMaxScaler(feature_range = (0,1)
                             
# # rf = RandomForestClassifier()

# pipe = Pipeline(steps=[('random forest',rf),('normalization', normalization),('pca', pca), ('logistic', logistic)])

In [286]:
param_grid = [
    {'classifier' : [LogisticRegression(multi_class = 'multinomial')],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['newton-cg']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : list(range(6,32,5))}
]

In [287]:
# for i in range(1,31):
clf = Pipeline([
  ('normalization', MinMaxScaler(feature_range = (0,1))),
  ('feature_selection', SelectKBest(chi2, k=10)),
#   ('classification', RandomForestClassifier()),
  ('logistic regression', LogisticRegression(multi_class = 'multinomial', solver='newton-cg'))
])
clf.fit(train_x, train_y)
scores = cross_val_score(clf, test_x, test_y, cv=5)

scores.mean()

0.6920145926226186

In [252]:
from sklearn.model_selection import cross_val_score
# mul_lr = LogisticRegression(multi_class = 'multinomial', solver='newton-cg').fit(x,y)
# scores = cross_val_score(mul_lr, dt_set.iloc[:,:30], dt_set.iloc[:,30], cv=5)
scores = cross_val_score(clf, test_x, test_y, cv=5)

scores.mean()

0.6948520470206729

In [237]:
# train_x, test_x, train_y, test_y = train_test_split(
#     dt_set.iloc[train,:28], dt_set.iloc[train,28], test_size=0.1, random_state=0)

# train_x, test_x, train_y, test_y = train_test_split(
#     dt_set.iloc[:,:30], dt_set.iloc[:,30], test_size=0.1, random_state = 0)

# Train multinomial logistic regression
# mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(train_x, train_y)

In [239]:
# lr = linear_model.LogisticRegression()
# lr.fit(train_x, train_y)

# Train multinomial logistic regression model
# mul_lr = LogisticRegression(multi_class = 'multinomial', solver='newton-cg').fit(train_x, train_y)

# mul_lr.predict_proba(new_observation)

# mul_lr.score(test_x,test_y)

# print(lr.score(test_y, lr.predict(test_x)))

# print(mul_lr.score(train_y, mul_lr.predict(train_x))
# print(mul_lr.score(test_y, mul_lr.predict(test_x)))
# 0.6876012965964343 without solver='newton-cg'
# 0.6896272285251216 default multi_class 
# 0.6961102106969206 multi_class = 'multinomial'
# 0.6896272285251216 Normalized
# 0.6896272285251216 unnormalized

In [None]:
# kf = KFold(n_splits=10)
# for train, test in kf.split(dt_set):
# #     clf = LogisticRegression(random_state=0).fit(dt_set.iloc[train,:28], dt_set.iloc[train,28])
#     print("%s %s" % (train, test))
# #     https://scikit-learn.org/stable/modules/cross_validation.html

In [None]:
# # Code source: Gaël Varoquaux
# # Modified for documentation by Jaques Grobler
# # License: BSD 3 clause

# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.linear_model import LogisticRegression
# from sklearn import datasets

# # import some data to play with

# X = dt_set.iloc[train,:28]  # we only take the first two features.
# Y = dt_set.iloc[train,28]

# logreg = LogisticRegression(C=1e5)

# # Create an instance of Logistic Regression Classifier and fit the data.
# logreg.fit(X, Y)

# # Plot the decision boundary. For that, we will assign a color to each
# # point in the mesh [x_min, x_max]x[y_min, y_max].
# x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
# y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
# h = .02  # step size in the mesh
# xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

# # Put the result into a color plot
# Z = Z.reshape(xx.shape)
# plt.figure(1, figsize=(4, 3))
# plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# # Plot also the training points
# plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
# plt.xlabel('Sepal length')
# plt.ylabel('Sepal width')

# plt.xlim(xx.min(), xx.max())
# plt.ylim(yy.min(), yy.max())
# plt.xticks(())
# plt.yticks(())

# plt.show()
