In [1]:
import pandas as pd
import category_encoders as ce
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [2]:
# Read Resources/cleaned_data.csv into the working DataFrame.
listings_csv = "Resources/cleaned_data.csv"
df = pd.read_csv(listings_csv)

In [3]:
# Display the freshly loaded frame to plan the cleaning steps.
df
# NOTES
# - Drop the 'Unnamed: 0' column.
# - 'Beds' and 'Square Footage' have NAs that will need to be replaced.
# - 'Property type' and 'City' hold text and must be converted to numeric values.
# - 'Zip Code' may or may not be relevant.

Unnamed: 0.1,Unnamed: 0,Price,Beds,Bathrooms,Square Footage,Property type,City,Zip Code
0,0,453200,6,4.0,2168.0,- Foreclosure,Jamaica,11433.0
1,1,119000,0,1.0,450.0,- Condo for sale,New York,10019.0
2,2,1295000,4,3.0,2598.0,- For sale by owner,Brooklyn,11209.0
3,3,65000,1,1.0,700.0,- Condo for sale,Bronx,10473.0
4,4,379000,2,3.0,1800.0,- Condo for sale,New York,10022.0
...,...,...,...,...,...,...,...,...
784,792,318000,2,1.0,,- Condo for sale,Oakland Gardens,11364.0
785,793,915000,4,4.0,2400.0,- House for sale,Staten Island,10309.0
786,794,355000,2,1.0,850.0,- Condo for sale,Brooklyn,11221.0
787,795,1880000,4,5.0,2600.0,- New construction,Flushing,11360.0


In [4]:
# Drop every row whose 'Zip Code' is missing (NaN).
index_names = df.index[df["Zip Code"].isnull()]
df.drop(index=index_names, inplace=True)

In [5]:
# List the distinct property-type labels present in the data.
df.loc[:, "Property type"].unique()

array(['- Foreclosure', '- Condo for sale', '- For sale by owner',
       '- House for sale', '- Multi-family home for sale',
       '- Apartment for sale', '- New construction',
       '- Townhouse for sale', '- Auction', '- Off market: Zestimate',
       '- Coming soon'], dtype=object)

In [6]:
# List the distinct city names present in the data.
df.loc[:, "City"].unique()

array(['Jamaica', 'New York', 'Brooklyn', 'Bronx', 'Staten Island',
       'Bayside', 'Howard Beach', 'Elmhurst', 'Astoria', 'Corona',
       'Flushing', 'Middle Village', 'Forest Hills', 'South Ozone Park',
       'Saint Albans', 'Laurelton', 'New Hyde Park',
       'South Richmond Hill', 'Little Neck', 'Far Rockaway',
       'East Elmhurst', 'Kew Gardens', 'Springfield Gardens',
       'Ozone Park', 'Queens Village', 'Oakland Gardens', 'Whitestone',
       'Bellerose', 'Ridgewood', 'Rego Park', 'Hollis', 'Woodside',
       'Jackson Heights', 'Floral Park', 'Richmond Hill',
       'Long Island City', 'Rockaway Park', 'Cambria Heights', 'Maspeth',
       'Woodhaven', 'Fresh Meadows'], dtype=object)

In [7]:
# Convert both text columns to pandas' categorical dtype in a single cast.
df = df.astype({"Property type": "category", "City": "category"})

In [8]:
# Replace each categorical column with its integer category codes.
# NOTE(review): the one-hot encoder applied later therefore sees integer codes,
# so the generated column names carry codes rather than the original text labels.
property_type_codes = df["Property type"].cat.codes
city_codes = df["City"].cat.codes
df["Property type"] = property_type_codes
df["City"] = city_codes

In [9]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
df = df.drop(columns=["Unnamed: 0"])

In [10]:
# Build a one-hot encoder for the two categorical columns and apply it:
# every category of 'City' and 'Property type' becomes its own indicator column.
encoder = ce.OneHotEncoder(
    cols=["City", "Property type"],
    handle_unknown="return_nan",
    return_df=True,
    use_cat_names=True,
)
df = encoder.fit_transform(df)

In [11]:
# 'Zip Code' is not used as a feature, so remove it.
df = df.drop(columns=["Zip Code"])

In [12]:
# Fill missing square footage with the column mean.
# NOTE(review): the mean is computed over every row, including rows that later
# end up in the test split — consider imputing from the training split only.
sqft = df["Square Footage"]
df["Square Footage"] = sqft.fillna(sqft.mean())

# Standardize

In [16]:
# Separate the predictors from the target. The target is reshaped into a
# single-column 2-D array because it is passed to StandardScaler further down.
target_column = "Price"
X = df.drop(columns=[target_column])
y = df[target_column].values.reshape(-1, 1)

In [17]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [19]:
# Standardize features and target. Each scaler learns its statistics from the
# training split only and is then reused unchanged on the test split.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [24]:
# Inspect the second training row after standardization.
X_train_scaled[1, :]

array([-0.56108982, -0.88464351,  0.27209392, -0.12415417,  1.24333974,
       -0.24568714, -0.68109389, -0.40734408, -0.08745389, -0.12415417,
       -0.2184347 , -0.06172134, -0.04360207, -0.04360207, -0.18805174,
        2.09225979, -0.48094533, -0.4041725 , -0.55762467, -0.15264656,
       -0.11602387, -0.09787004, -0.10731409, -0.09787004, -0.13907681,
       -0.08745389, -0.09787004, -0.06172134, -0.06172134, -0.04360207,
       -0.04360207,  0.        , -0.12415417, -0.06172134, -0.04360207,
       -0.09787004, -0.04360207, -0.06172134, -0.09787004, -0.11602387,
       -0.09787004, -0.04360207, -0.04360207, -0.04360207, -0.04360207,
       -0.04360207, -0.04360207, -0.04360207, -0.04360207, -0.06172134,
       -0.06172134, -0.04360207, -0.06172134, -0.04360207, -0.04360207])

In [21]:
# Display the standardized training target.
# The target was reshaped to a single column before the split, so this should be
# a 2-D array with one value per training row.
# NOTE(review): the saved output below does not look like that, and this cell's
# execution counter is out of order — re-run from a fresh kernel to confirm.
y_train_scaled

array([-0.14519828])

In [None]:
# Scatter plot of price against square footage.
ax = sns.scatterplot(data=df, x="Square Footage", y="Price")

# Normalization

In [None]:
# Scale each numeric column to unit L2 norm. Wrapping a column in a list makes it
# a single row, so normalize() divides every value by the norm of the whole column.
# Each result is a 2-D array with one row.
square_normalized, price_normalized, beds_normalized, bath_normalized = (
    sklearn.preprocessing.normalize([df[column_name]], norm="l2")
    for column_name in ("Square Footage", "Price", "Beds", "Bathrooms")
)

In [None]:
# Write the normalized values back over the original columns; each normalized
# array has a single row, hence the [0].
df = df.assign(
    **{
        "Square Footage": square_normalized[0],
        "Price": price_normalized[0],
        "Beds": beds_normalized[0],
        "Bathrooms": bath_normalized[0],
    }
)

In [None]:
# Rebuild the predictors and the target from the current state of the frame.
X = df.drop(columns=["Price"])
y = df.loc[:, "Price"]

In [None]:
# Same 67/33 split with the same seed, now on the rebuilt X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Fit an ordinary least-squares model on the training split and predict the
# held-out prices (fit() returns the fitted estimator itself).
regr = LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

In [None]:
# Print every predicted price on its own line.
print(*y_pred, sep="\n")

In [None]:
# Drop rows whose 'Square Footage' value is at least 0.9.
# NOTE(review): after the L2 normalization above, a value this large means a single
# listing dominates the column — presumably an outlier; confirm the threshold.
index_names = df.index[df["Square Footage"] >= 0.9]
df.drop(index=index_names, inplace=True)

In [None]:
# Re-draw the price vs. square-footage scatter plot from the filtered frame.
ax = sns.scatterplot(data=df, x="Square Footage", y="Price")

In [None]:
# Display the frame as it stands after the normalization and row-filtering steps above.
df