In [1]:
# import pyspark modules
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
import pandas as pd

from pyspark.sql import SparkSession

# Build (or reuse) a local-mode SparkSession with explicit executor/driver sizing.
# NOTE(review): the app name refers to a breast-cancer analysis, while this notebook
# loads listing data from data/train.json — presumably copied from another notebook; confirm.
spark = (
    SparkSession.builder
    .master("local")
    .appName("wisc_breast_cancer_analysis")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.cores.max", "4")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Fetch the active SparkContext and wrap it in a SQLContext, which later cells
# use to create Spark DataFrames.
sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

In [2]:
# pandas is already imported in the first cell; this repeated import is redundant but harmless.
import pandas as pd
# Read the training set from JSON into a pandas DataFrame.
# The path is relative to the notebook's working directory.
train_data_pd = pd.read_json("data/train.json")

In [3]:
# Display the loaded pandas DataFrame (a bare last expression is rendered as the cell output).
train_data_pd

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,medium
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,low
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street,high
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street,low
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street,low
100014,2.0,4,38a913e46c94a7f46ddf19b756a9640c,2016-04-19 04:24:47,,West 18th Street,[],40.7429,6894514,-74.0028,b209e2c4384a64cc307c26759ee0c651,[https://photos.renthop.com/2/6894514_9abb8592...,7995,350 West 18th Street,medium
100016,1.0,2,3ba49a93260ca5df92fde024cb4ca61f,2016-04-27 03:19:56,Stunning unit with a great location and lots o...,West 107th Street,"[prewar, elevator, Dogs Allowed, Cats Allowed,...",40.8012,6930771,-73.9660,01287194f20de51872e81f660def4784,[https://photos.renthop.com/2/6930771_7e3622b6...,3600,210 West 107th Street,low
100020,2.0,1,0372927bcb6a0949613ef5bf893bbac7,2016-04-13 06:01:42,"This huge sunny ,plenty of lights 1 bed/2 bath...",West 21st Street,"[Doorman, Elevator, Pre-War, Terrace, Laundry ...",40.7427,6867392,-73.9957,e6472c7237327dd3903b3d6f6a94515a,[https://photos.renthop.com/2/6867392_b18283f6...,5645,155 West 21st Street,low
100026,1.0,1,a7efbeb58190aa267b4a9121cd0c88c0,2016-04-20 02:36:35,<p><a website_redacted,Hamilton Terrace,"[Cats Allowed, Dogs Allowed, Elevator, Laundry...",40.8234,6898799,-73.9457,c1a6598437b7db560cde66e5a297a53f,[https://photos.renthop.com/2/6898799_3759be4c...,1725,63 Hamilton Terrace,medium
100027,2.0,4,0,2016-04-02 02:58:15,This is a spacious four bedroom with every bed...,522 E 11th,"[Dishwasher, Hardwood Floors]",40.7278,6814332,-73.9808,23a01ea7717b38875f5b070282d1b9d2,[https://photos.renthop.com/2/6814332_e19a8552...,5800,522 E 11th,low


In [4]:
# Convert the pandas DataFrame into a Spark DataFrame through the SQLContext
# created in the setup cell.
train_data_df = sqlCtx.createDataFrame(train_data_pd)

In [12]:
# Echo the Spark DataFrame; its repr lists each column name with its Spark type.
train_data_df

DataFrame[bathrooms: double, bedrooms: bigint, building_id: string, created: string, description: string, display_address: string, features: array<string>, latitude: double, listing_id: bigint, longitude: double, manager_id: string, photos: array<string>, price: bigint, street_address: string, interest_level: string]

In [13]:
# Expose the DataFrame's underlying RDD of Row objects for row-level map operations.
train_data_rdd = train_data_df.rdd

In [21]:
def feature_processor(row):
    """Return the values of ``row`` as a plain list, in field order.

    Args:
        row: A ``pyspark.sql.Row`` (a tuple subclass) or any other iterable
            of field values.

    Returns:
        list: One entry per field, in the row's positional order.
    """
    # Row subclasses tuple, so positional iteration yields every value in
    # field order. This avoids the private ``__fields__`` attribute (not set
    # on Rows built without field names) and by-name lookups, which return
    # the first match when two columns share a name.
    return list(row)

In [22]:
# Apply feature_processor to every Row and bring back only the first result for inspection.
train_data_rdd.map(feature_processor).take(1)

[[1.5,
  3,
  '53a5b119ba8f7b61d4e010512e0dfc85',
  '2016-06-24 07:54:24',
  "A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy These Following Apartment Features As You Rent Here? Modern Designed Bathroom w/ a Deep Spa Soaking Tub? Room to Room AC/Heat? Real Oak Hardwood Floors? Rain Forest Shower Head? SS steel Appliances w/ Chef Gas Cook Oven & LG Fridge? washer /dryer in the apt? Cable Internet Ready? Granite Counter Top Kitchen w/ lot of cabinet storage spaceIt's Just A Few blocks To L Train<br /><br />Don't miss out!<br /><br />We have several great apartments in the immediate area.<br /><br />For additional information 687-878-2229<p><a  website_redacted ",
  'Metropolitan Avenue',
  [],
  40.7145,
  7211212,
  -73.9425,
  '5ba989232d0489da1b5f2c45f6688adc',
  ['https://photos.renthop.com/2/7211212_1ed4542ec81621d70d1061aa833e669c.jpg',
   'https://photos.renthop.com/2/7211212_7dfc41dced69245065df83d08eed4a00.jpg',
   'https://photos.renthop.com/2/7211212_c17853c4b869af6f53af08b0f58

In [13]:
import re

In [57]:
def addressProcessor(address):
    """Normalise an address string for comparison.

    The address is split on single spaces, tokens made up only of digits are
    discarded, and the remaining tokens are re-joined with spaces. The result
    is lower-cased, stripped of surrounding whitespace, and has every "."
    removed.

    NOTE(review): every all-digit token is dropped, not just a leading house
    number, so "346 E 19 Street" becomes "e street" — confirm this is intended.

    Args:
        address: The raw address string.

    Returns:
        str: The normalised address.
    """
    kept = [tok for tok in address.split(" ") if not re.match("^[0-9]+$", tok)]
    joined = " ".join(kept)
    return joined.lower().strip().replace(".", "")

In [58]:
# Scratch check: splitting on a single space keeps "12th" and "12" as separate tokens.
"a jda 12th 12".split(" ")

['a', 'jda', '12th', '12']

In [59]:
# Sanity check: the input has no all-digit token, so it comes back unchanged
# (it is already lower-case and contains no ".").
addressProcessor("a jda 12th")

'a jda 12th'

In [61]:
# Normalise every display address, keyed by the same record keys as the input.
# NOTE(review): `trainData` is not defined in any cell visible here — presumably the raw
# JSON loaded as a dict of column name -> {record key -> value}; confirm and define it
# before this cell so the notebook survives Restart & Run All.
display_address = trainData.get("display_address")
resultDict = {
    key: addressProcessor(display_address[key])
    for key in display_address
}
resultDict

{'4': 'borinquen place',
 '6': 'east 44th',
 '9': 'east 56th street',
 '10': 'metropolitan avenue',
 '15': 'east 34th street',
 '16': 'east 16th street',
 '18': 'east 13th street',
 '19': 'york avenue',
 '23': 'e street',
 '32': 'hicks street',
 '33': 'e 80th st',
 '36': 'w st',
 '38': 'west 42nd street',
 '39': 'thompson st',
 '42': 'e street',
 '43': 'division avenue',
 '44': 'e street',
 '46': '18th street',
 '49': 'w street',
 '61': 'main street',
 '66': '2nd ave',
 '67': 'west 20th street',
 '69': 'west 56th street',
 '74': 'first ave',
 '78': 'e 79th st',
 '80': 'second ave',
 '82': 'cliff street',
 '83': 'east 81st street',
 '84': 'e street',
 '85': 'w st',
 '87': 'spring st',
 '88': 'w street',
 '89': 'west 109th street',
 '92': 'suffolk street',
 '93': '42nd st',
 '101': 'east 19th street',
 '102': 'tenth avenue',
 '104': 'e street',
 '106': 'president st',
 '107': 'e 25th st',
 '111': 'harman',
 '113': 'west end avenue',
 '115': 'amsterdam avenue',
 '117': 'east 12th street',

In [64]:
# Look up the "latitude" entry of trainData and display it in full.
# NOTE(review): `trainData` is not defined in any visible cell — confirm where it is loaded.
trainData.get("latitude")

{'4': 40.7108,
 '6': 40.7513,
 '9': 40.7575,
 '10': 40.7145,
 '15': 40.7439,
 '16': 40.7348,
 '18': 40.7302,
 '19': 40.7769,
 '23': 40.7346,
 '32': 40.699,
 '33': 40.7723,
 '36': 40.753,
 '38': 40.761,
 '39': 40.7277,
 '42': 40.7633,
 '43': 40.7073,
 '44': 40.7528,
 '46': 40.736,
 '49': 40.783,
 '61': 40.7621,
 '66': 40.746,
 '67': 40.7456,
 '69': 40.7685,
 '74': 40.7319,
 '78': 40.7709,
 '80': 40.7352,
 '82': 40.7084,
 '83': 40.7735,
 '84': 40.7708,
 '85': 40.7585,
 '87': 40.7223,
 '88': 40.7743,
 '89': 40.8031,
 '92': 40.7185,
 '93': 40.645,
 '101': 40.6455,
 '102': 40.7633,
 '104': 40.771,
 '106': 40.6678,
 '107': 40.7398,
 '111': 40.6956,
 '113': 40.7751,
 '115': 40.7931,
 '117': 40.7287,
 '118': 40.7528,
 '121': 40.7852,
 '122': 40.7074,
 '133': 40.7024,
 '136': 40.7754,
 '139': 40.7656,
 '140': 40.7381,
 '145': 40.7075,
 '146': 40.7959,
 '151': 40.7641,
 '152': 40.826,
 '158': 40.7474,
 '161': 40.6765,
 '170': 40.7638,
 '171': 40.7396,
 '181': 40.7617,
 '182': 40.6633,
 '186': 40

In [66]:
# Look up the "longitude" entry of trainData and display it in full.
# NOTE(review): `trainData` is not defined in any visible cell — confirm where it is loaded.
trainData.get("longitude")

{'4': -73.9539,
 '6': -73.9722,
 '9': -73.9625,
 '10': -73.9425,
 '15': -73.9743,
 '16': -73.9865,
 '18': -73.9826,
 '19': -73.9467,
 '23': -73.9811,
 '32': -73.9943,
 '33': -73.951,
 '36': -73.9959,
 '38': -73.999,
 '39': -74.0,
 '42': -73.9596,
 '43': -73.9665,
 '44': -73.9709,
 '46': -73.986,
 '49': -73.9828,
 '61': -73.9486,
 '66': -73.9754,
 '67': -74.0053,
 '69': -73.9895,
 '74': -73.9817,
 '78': -73.9496,
 '80': -73.9832,
 '82': -74.0048,
 '83': -73.9509,
 '84': -73.9576,
 '85': -73.9913,
 '87': -73.9966,
 '88': -73.9875,
 '89': -73.9653,
 '92': -73.9865,
 '93': -73.9984,
 '101': -73.961,
 '102': -73.9932,
 '104': -73.9553,
 '106': -73.9398,
 '107': -73.9811,
 '111': -73.9227,
 '113': -73.9886,
 '115': -73.9715,
 '117': -73.981,
 '118': -73.9709,
 '121': -73.949,
 '122': -74.0069,
 '133': -73.9279,
 '136': -73.9509,
 '139': -73.9582,
 '140': -73.9917,
 '145': -74.0079,
 '146': -73.9742,
 '151': -73.9592,
 '152': -73.9518,
 '158': -73.9566,
 '161': -73.9524,
 '170': -73.994,
 '17

In [62]:
import collections


def has_duplicates(list_of_values):
    """Report whether any value occurs more than once.

    Args:
        list_of_values: An iterable of hashable items.

    Returns:
        bool: True if at least one item appears two or more times,
        False otherwise (including for an empty iterable).
    """
    counts = collections.Counter(list_of_values)
    # dict.itervalues() exists only on Python 2; .values() works on both
    # Python 2 and Python 3.
    return any(count > 1 for count in counts.values())

In [15]:
# Collect the values of the "street_address" entry into a list (the keys are dropped).
# NOTE(review): `trainData` is not defined in any visible cell — confirm where it is loaded.
streetList = list(trainData.get("street_address").values())

In [17]:
# Echo the full list of street addresses.
streetList

['145 Borinquen Place',
 '230 East 44th',
 '405 East 56th Street',
 '792 Metropolitan Avenue',
 '340 East 34th Street',
 '145 East 16th Street',
 '410 East 13th Street',
 '1661 York Avenue',
 '346 E 19 Street',
 '94 Hicks Street',
 '420 E 80th St.\r',
 '360 W 34 St.',
 '620 West 42nd Street',
 '174 Thompson St.',
 '360 E 65 Street',
 '63A Division Avenue',
 '235 E 46 Street',
 '170 East 18th Street',
 '308 W 77 Street',
 '576 Main Street',
 '655 2nd Ave',
 '460 West 20th Street',
 '500 West 56th Street',
 '252 First Ave',
 '510 E 79th St',
 '317 Second Ave',
 'Cliff Street',
 '407 East 81st Street',
 '242 E 75 Street',
 '350 W 43 St.',
 '55 Spring St',
 '244 W 64 Street',
 '225 West 109th Street',
 '99 Suffolk Street',
 '830 42nd St',
 '165 East 19th Street',
 '697 Tenth Avenue',
 '347 E 76 Street',
 '1475 President St.',
 '219 E 25th St.',
 '98 Harman',
 '101 West End Avenue',
 '706 Amsterdam Avenue',
 '504 East 12th Street',
 '235 East 46th Street',
 '215 E 96th St.',
 '2 Gold Street

In [8]:
# Look up the "photos" entry of trainData and display it in full.
# NOTE(review): `trainData` is not defined in any visible cell — confirm where it is loaded.
trainData.get("photos")

{'4': ['https://photos.renthop.com/2/7170325_3bb5ac84a5a10227b17b273e79bd77b4.jpg',
  'https://photos.renthop.com/2/7170325_a29a17a771ee6af213966699b05c8ea2.jpg',
  'https://photos.renthop.com/2/7170325_149a898e8760cac1cad56e30cfe98baa.jpg',
  'https://photos.renthop.com/2/7170325_f74a43d781bcc3c5588e61dd47de81ba.jpg',
  'https://photos.renthop.com/2/7170325_e677d9d249ac99abe01aa5454c6e9f59.jpg',
  'https://photos.renthop.com/2/7170325_960ea0e180bf2f15467b68b455db6172.jpg',
  'https://photos.renthop.com/2/7170325_cbc1b8437155dbf7f5d63b3a0b5a45a3.jpg',
  'https://photos.renthop.com/2/7170325_9a9f2adc2ce922e1d5394727efdf64bb.jpg',
  'https://photos.renthop.com/2/7170325_aae2a39d536103eebb282775fab1c315.jpg',
  'https://photos.renthop.com/2/7170325_cd290d0051b9f08e3482195dcbf6b5a6.jpg',
  'https://photos.renthop.com/2/7170325_a2b599da7880eea1edd10c4b04250dc1.jpg',
  'https://photos.renthop.com/2/7170325_6b83fa82d662bcb09733ac3a8a107113.jpg'],
 '6': ['https://photos.renthop.com/2/7092344_7