In [5]:
import pandas as pd

# Load the raw listings CSV and print a first-look overview of its contents.
file_path = "NY-House-Dataset.csv"
try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print(f"File '{file_path}' not found.")
    # Stop here: without a DataFrame every statement below would fail with a
    # misleading NameError on `df` instead of the real cause.
    raise

print("\nFirst few rows of the dataset:")
print(df.head())

print("\nSummary statistics of the dataset:")
print(df.describe())

print("\nInformation about the dataset:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()


Dataset loaded successfully.

First few rows of the dataset:
                 TYPE      PRICE  BEDS       BATH  PROPERTYSQFT  \
0      Condo for sale     315000     2   2.000000        1400.0   
1      Condo for sale  195000000     7  10.000000       17545.0   
2      House for sale     260000     4   2.000000        2015.0   
3      Condo for sale      69000     3   1.000000         445.0   
4  Townhouse for sale   55000000     7   2.373861       14175.0   

                     STATE                                       MAIN_ADDRESS  \
0       New York, NY 10022             2 E 55th St Unit 803New York, NY 10022   
1       New York, NY 10019  Central Park Tower Penthouse-217 W 57th New Yo...   
2  Staten Island, NY 10312            620 Sinclair AveStaten Island, NY 10312   
3      Manhattan, NY 10022         2 E 55th St Unit 908W33Manhattan, NY 10022   
4       New York, NY 10065                      5 E 64th StNew York, NY 10065   

  ADMINISTRATIVE_AREA_LEVEL_2  LOCALITY      SUBL

- **Overview**: 
  - The dataset contains 4801 entries and 13 columns.
  - Features include house type, price, bedrooms, bathrooms, property square footage, and address details.

- **Summary Statistics**:
  - Prices range from 2,494 to 2,147,484,000.
  - The average house has 3.36 bedrooms, 2.37 bathrooms, and 2,184.21 square feet.

- **Interesting Relations**:
  - Explore correlations between price and numerical features.
  - Analyze geographical distribution of house prices using latitude and longitude.
  - Investigate the impact of distance from an iconic spot on house prices.


- I would like to predict house prices based on features such as:

  - Number of bedrooms
  - Number of bathrooms
  - Property square footage
  - Possibly, the distance from an iconic spot in New York City.

In [7]:
# Report missing values, then drop duplicate rows, rows with any missing value,
# and listings priced above the cap.
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Listings priced above this value are treated as outliers and removed.
MAX_PRICE = 10000000

# Build the cleaned frame as one non-mutating chain so re-running the cell is
# safe. The trailing .copy() makes the result an independent frame, so adding
# columns to it later does not raise SettingWithCopyWarning.
df = (
    df.drop_duplicates()
    .dropna()
    .loc[lambda frame: frame['PRICE'] <= MAX_PRICE]
    .copy()
)

print("Shape of the cleaned dataset:", df.shape)

Missing values in each column:
TYPE                           0
PRICE                          0
BEDS                           0
BATH                           0
PROPERTYSQFT                   0
STATE                          0
MAIN_ADDRESS                   0
ADMINISTRATIVE_AREA_LEVEL_2    0
LOCALITY                       0
SUBLOCALITY                    0
STREET_NAME                    0
LATITUDE                       0
LONGITUDE                      0
dtype: int64
Shape of the cleaned dataset: (4438, 13)


In [14]:
from sklearn.model_selection import train_test_split
import numpy as np

# Haversine formula to calculate distance between two points given their coordinates
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points using the haversine formula.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.
            Every argument may be a scalar or a NumPy/pandas array-like; the
            computation uses NumPy ufuncs only, so inputs broadcast element-wise.

    Returns:
        The distance scaled by an Earth radius of 6371 (i.e. kilometres), with
        the same shape as the broadcast inputs.
    """
    R = 6371  # Mean Earth radius in kilometres
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# Add a distance-to-landmark feature, then split features/target into train and test sets.
iconic_landmark_coords = (40.7580, -73.9855)  # Times Square coordinates
landmark_lat, landmark_lon = iconic_landmark_coords

# haversine is composed of NumPy ufuncs, so whole columns can be passed at once
# instead of calling it row by row through DataFrame.apply. assign() returns a
# new frame rather than writing a column into a filtered slice, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(
    DISTANCE_TO_ICONIC_LANDMARK=haversine(
        df['LATITUDE'], df['LONGITUDE'], landmark_lat, landmark_lon
    )
)

feature_columns = ['BEDS', 'BATH', 'PROPERTYSQFT', 'LATITUDE', 'LONGITUDE', 'DISTANCE_TO_ICONIC_LANDMARK']
X = df[feature_columns]
y = df['PRICE']

TEST_FRACTION = 0.2  # share of rows held out for evaluation
SPLIT_SEED = 42      # fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (3550, 6)
Shape of X_test: (888, 6)
Shape of y_train: (3550,)
Shape of y_test: (888,)
