# Data modeling
***we are going to find the effect of zip code and number of bedrooms to rental prices***  

In [2]:
import patsy

***Patsy***  
Patsy helps you write cleaner and more concise code for statistical modeling. Instead of manually handling the complexities of creating design matrices and transformations, you can use simple and intuitive formulas.

In [4]:
import statsmodels

***Statsmodels***  
statsmodels is a Python module that provides classes and functions for the estimation of many different statistical models, as well as for conducting statistical tests, and statistical data exploration.

In [5]:
import pandas as pd
import numbers as np

In [20]:
df = pd.read_csv('D:\Python Machine lerning blueprint_Projects\Application-to-identify-underpriced-apartments\preprocessed_data')

In [21]:
df

Unnamed: 0.1,Unnamed: 0,url,address,neibhourehood,rent,no_of_beds,no_of_bathrooms
0,0,https://www.renthop.com/listings/201-stanton-s...,201StantonStreet,"LowerEastSide,DowntownManhattan,Manhattan",5295,2,1.0
1,1,https://www.renthop.com/listings/bedford-ave-m...,"1655BedfordAvenue,Apt2B","CrownHeights,CentralBrooklyn,Brooklyn",3800,2,1.0
2,2,https://www.renthop.com/listings/95-wall-stree...,"95WallStreet,Apt1001","FinancialDistrict,DowntownManhattan,Manhattan",3914,0,1.0
3,3,https://www.renthop.com/listings/775-columbus-...,"775ColumbusAvenue,Apt07A","ManhattanValley,UpperWestSide,UpperManhattan,M...",4820,1,1.0
4,4,https://www.renthop.com/listings/10-hanover-sq...,"10HanoverSquare,Apt04R","FinancialDistrict,DowntownManhattan,Manhattan",3965,0,1.0
...,...,...,...,...,...,...,...
16799,16799,https://www.renthop.com/listings/232-e-54th-st...,"232E54thSt,Apt6D","MidtownEast,MidtownManhattan,Manhattan",4018,0,1.0
16800,16800,https://www.renthop.com/listings/148-west-68th...,148West68thStreet,"LincolnSquare,UpperWestSide,UpperManhattan,Man...",2650,0,1.0
16801,16801,https://www.renthop.com/listings/37-wall-stree...,"37WallStreet,Apt8M","FinancialDistrict,DowntownManhattan,Manhattan",3700,1,1.0
16802,16802,https://www.renthop.com/listings/east-78th-str...,East78thStreet,"UpperEastSide,UpperManhattan,Manhattan",2695,0,1.0


In [22]:
zip_data = pd.read_csv('D:\Python Machine lerning blueprint_Projects\Application-to-identify-underpriced-apartments\zip_code.csv')

In [23]:
zip_data

Unnamed: 0,0
0,
1,
2,
3,
4,10005.0
...,...
16799,
16800,
16801,
16802,10075.0


In [25]:
df['zip_code'] = zip_data

In [26]:
# dropping null rows of zipcode from df

data = df.dropna(subset=['zip_code'])

In [27]:
data

Unnamed: 0.1,Unnamed: 0,url,address,neibhourehood,rent,no_of_beds,no_of_bathrooms,zip_code
4,4,https://www.renthop.com/listings/10-hanover-sq...,"10HanoverSquare,Apt04R","FinancialDistrict,DowntownManhattan,Manhattan",3965,0,1.0,10005.0
6,6,https://www.renthop.com/listings/120-w-21st/15...,"120W.21st,Apt1506","Chelsea,MidtownManhattan,Manhattan",5369,1,1.0,10011.0
7,7,https://www.renthop.com/listings/113-mulberry-...,113MulberryStreet,"LittleItaly,DowntownManhattan,Manhattan",5295,3,1.0,10013.0
10,10,https://www.renthop.com/listings/31-19-37th-st...,"31-1937thSt,Apt3A","Astoria,NorthwesternQueens,Queens",4000,2,1.0,11101.0
11,11,https://www.renthop.com/listings/10-hanover-sq...,"10HanoverSquare,Apt16P","FinancialDistrict,DowntownManhattan,Manhattan",3885,0,1.0,10005.0
...,...,...,...,...,...,...,...,...
16792,16792,https://www.renthop.com/listings/east-56th-str...,East56thStreet,"SuttonPlace,MidtownEast,MidtownManhattan,Manha...",3700,1,1.0,10022.0
16793,16793,https://www.renthop.com/listings/2nd-avenue-26...,2ndAvenue/26thst,"RoseHill,KipsBay,MidtownManhattan,Manhattan",3000,1,1.0,10016.0
16794,16794,https://www.renthop.com/listings/huntington-st...,HuntingtonStreet,"CarrollGardens,SouthBrooklyn,Brooklyn",5750,2,1.0,11231.0
16798,16798,https://www.renthop.com/listings/175-e-96th-st...,"175E96thSt,Apt27H","EastHarlem,Harlem,UpperManhattan,Manhattan",5100,1,1.0,10029.0


In [28]:
data.columns

Index(['Unnamed: 0', 'url', 'address', 'neibhourehood', 'rent', 'no_of_beds',
       'no_of_bathrooms', 'zip_code'],
      dtype='object')

In [31]:
import statsmodels.api as sm

In [33]:
# function = ' dependent variable ~ predictor ' comes with patsy
f = ' rent ~ zip_code + no_of_beds'

# we have made the function but now we need to extract the x and y vector for our formula according to the used variable
y, x = patsy.dmatrices(f, data, return_type='dataframe') # here we pass the complete df, it automatically get the columns 

# model construction using OLS() and fitting data with fit()
results = sm.OLS(y, x).fit()
results.summary()

0,1,2,3
Dep. Variable:,rent,R-squared:,0.176
Model:,OLS,Adj. R-squared:,0.175
Method:,Least Squares,F-statistic:,782.6
Date:,"Wed, 08 Jan 2025",Prob (F-statistic):,7.5e-309
Time:,16:37:15,Log-Likelihood:,-71858.0
No. Observations:,7350,AIC:,143700.0
Df Residuals:,7347,BIC:,143700.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3608.1202,219.284,16.454,0.000,3178.261,4037.979
zip_code,-0.0740,0.020,-3.711,0.000,-0.113,-0.035
no_of_beds,1848.3824,46.793,39.501,0.000,1756.655,1940.110

0,1,2,3
Omnibus:,10075.972,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3215154.332
Skew:,7.848,Prob(JB):,0.0
Kurtosis:,104.253,Cond. No.,47000.0


***Explaination of above table***  

first of all the above ml model is of linear regression  



# Forecasting

In [37]:
x.head

<bound method NDFrame.head of        Intercept  zip_code  no_of_beds
4            1.0   10005.0         0.0
6            1.0   10011.0         1.0
7            1.0   10013.0         3.0
10           1.0   11101.0         2.0
11           1.0   10005.0         0.0
...          ...       ...         ...
16792        1.0   10022.0         1.0
16793        1.0   10016.0         1.0
16794        1.0   11231.0         2.0
16798        1.0   10029.0         1.0
16802        1.0   10075.0         0.0

[7350 rows x 3 columns]>

In [58]:
# now for prediction we need to again forma a matrix using dmatrices 
test_values = {'zip_code': 10002, 'no_of_beds': 3}
test_df = pd.DataFrame([test_values])
test_data = patsy.dmatrix('zip_code + no_of_beds', test_df, return_type='dataframe')

In [59]:
predicted_rent = results.predict(test_data)

In [60]:
print(predicted_rent)

0    8412.625166
dtype: float64


i am taking these input from the renthop website with the model trained with the data of date 24/12/25

In [63]:
def predict_rent(): 
    zip_code = int(input("Enter zip code: "))
    no_of_beds = int(input("Enter number of beds: "))
    # Create DataFrame with input values 
    new_data = {'zip_code': zip_code, 'no_of_beds': no_of_beds}
    new_df = pd.DataFrame([new_data])
    # Creating design matrix for new data (including intercept)
    new_X = patsy.dmatrix('zip_code + no_of_beds', new_df, return_type='dataframe')
    # Make prediction 
    predicted_rent = results.predict(new_X)
    print(f"Predicted rent: {predicted_rent[0]:.2f}")

In [66]:
predict_rent()

Predicted rent: 6472.72
