In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from mlxtend.preprocessing import TransactionEncoder

In [3]:
rent = pd.read_csv("House Rent.csv")
rent.head()

Unnamed: 0,squarefoot,location,num_past_occupants,rent,building_type
0,1160,Suburb,2,5315.70072,Apartment
1,4072,Downtown,4,21588.912282,House
2,3392,Suburb,2,13223.827804,House
3,766,Downtown,6,5259.09067,Apartment
4,4726,Rural,6,18101.249014,House


In [5]:
rent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   squarefoot          5000 non-null   int64  
 1   location            5000 non-null   object 
 2   num_past_occupants  5000 non-null   int64  
 3   rent                5000 non-null   float64
 4   building_type       5000 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 195.4+ KB


In [None]:
rent = pd.get_dummies(rent, columns = ['location'],drop_first = True)

In [12]:
rent.head()

Unnamed: 0,squarefoot,num_past_occupants,rent,building_type,location_Rural,location_Suburb
0,1160,2,5315.70072,Apartment,False,True
1,4072,4,21588.912282,House,False,False
2,3392,2,13223.827804,House,False,True
3,766,6,5259.09067,Apartment,False,False
4,4726,6,18101.249014,House,True,False


In [14]:
bool_columns = ['location_Rural', 'location_Suburb']
rent[bool_columns] = rent[bool_columns].astype(int)

In [15]:
rent.head()

Unnamed: 0,squarefoot,num_past_occupants,rent,building_type,location_Rural,location_Suburb
0,1160,2,5315.70072,Apartment,0,1
1,4072,4,21588.912282,House,0,0
2,3392,2,13223.827804,House,0,1
3,766,6,5259.09067,Apartment,0,0
4,4726,6,18101.249014,House,1,0


In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
rent['squarefoot']= scaler.fit_transform(rent[['squarefoot']])

In [31]:
X = rent[['squarefoot','location_Rural', 'location_Suburb' ]]
y = rent['rent']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [33]:
model = LinearRegression()
model.fit(X_train, y_train)

### Predictions

In [34]:
y_pred = model.predict(X_test)

In [None]:
y_pred

In [38]:
from sklearn.metrics import mean_squared_error, r2_score

In [39]:
MSE = mean_squared_error(y_pred, y_test)
MSE

5653192.099922772

In [40]:
R2 = r2_score(y_pred, y_test)
R2

0.8832534797011372

In [42]:
X_train_sm = sm.add_constant(X_train) # Adding a constant for the intercept
model_sm = sm.OLS(y_train, X_train_sm).fit()
print(model_sm.summary())

                            OLS Regression Results                            
Dep. Variable:                   rent   R-squared:                       0.898
Model:                            OLS   Adj. R-squared:                  0.898
Method:                 Least Squares   F-statistic:                 1.169e+04
Date:                Tue, 06 Aug 2024   Prob (F-statistic):               0.00
Time:                        11:55:27   Log-Likelihood:                -36719.
No. Observations:                4000   AIC:                         7.345e+04
Df Residuals:                    3996   BIC:                         7.347e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const            1.591e+04     63.999    2

In [43]:
correlation = X.corr()
correlation

Unnamed: 0,squarefoot,location_Rural,location_Suburb
squarefoot,1.0,0.008092,0.004813
location_Rural,0.008092,1.0,-0.494233
location_Suburb,0.004813,-0.494233,1.0
