##**AMAAN JAVED SYED**
** ** ** **
##**TREUE TECHNOLOGIES INTERNSHIP FINAL TASK**

#### **The aim of this project is to create a model that can predict house prices by taking into account various factors outlined in the dataset. The model will be constructed using the Linear Regression algorithm.**

In [1]:
#importing important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
#importing the dataset
df=pd.read_csv("/content/drive/MyDrive/Datasets/BHP.csv")

In [3]:
#displaying the top 5 values from dataset
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [5]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [6]:
#checking for null/missing values in dataset
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [7]:
#handling the missing values
df.dropna(inplace=True)

In [8]:
df.isna().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [9]:
#checking the rows and columns of dataset
df.shape

(7496, 9)

In [10]:
#dropping the columns as they are not closely related to the price prediction
df=df.drop(['availability','location','society'],axis=1)

In [11]:
df.head()

Unnamed: 0,area_type,size,total_sqft,bath,balcony,price
0,Super built-up Area,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,4 Bedroom,2600,5.0,3.0,120.0
3,Super built-up Area,3 BHK,1521,3.0,1.0,95.0
5,Super built-up Area,2 BHK,1170,2.0,1.0,38.0
11,Plot Area,4 Bedroom,2785,5.0,3.0,295.0


In [12]:
#converting the size column to size_bhk column
df['size_bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))

In [13]:
df.head()

Unnamed: 0,area_type,size,total_sqft,bath,balcony,price,size_bhk
0,Super built-up Area,2 BHK,1056,2.0,1.0,39.07,2
1,Plot Area,4 Bedroom,2600,5.0,3.0,120.0,4
3,Super built-up Area,3 BHK,1521,3.0,1.0,95.0,3
5,Super built-up Area,2 BHK,1170,2.0,1.0,38.0,2
11,Plot Area,4 Bedroom,2785,5.0,3.0,295.0,4


In [14]:
#dropping the size & area_type columns
df=df.drop(['size','area_type'],axis=1)

In [15]:
df.head()

Unnamed: 0,total_sqft,bath,balcony,price,size_bhk
0,1056,2.0,1.0,39.07,2
1,2600,5.0,3.0,120.0,4
3,1521,3.0,1.0,95.0,3
5,1170,2.0,1.0,38.0,2
11,2785,5.0,3.0,295.0,4


In [16]:
#converting the values in total squarefeet column to integers
def convert_sqft_into_int(x):
    token = x.split('-')
    if len(token) == 2:
        return (float(token[0]) + float(token[1])) / 2
    try:
        return float(x)
    except:
        return None

In [17]:
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_into_int)

In [18]:
#checking for null values after the conversion
df.isna().sum()

total_sqft    15
bath           0
balcony        0
price          0
size_bhk       0
dtype: int64

In [19]:
#dropping the null values
df.dropna(inplace=True)

In [20]:
df.head()

Unnamed: 0,total_sqft,bath,balcony,price,size_bhk
0,1056.0,2.0,1.0,39.07,2
1,2600.0,5.0,3.0,120.0,4
3,1521.0,3.0,1.0,95.0,3
5,1170.0,2.0,1.0,38.0,2
11,2785.0,5.0,3.0,295.0,4


In [21]:
#calculating the price per square feet as it will play a major role in predicting the price
df['price_per_sqft']=df['price'] * 100000 / df['total_sqft']

In [22]:
df.head()

Unnamed: 0,total_sqft,bath,balcony,price,size_bhk,price_per_sqft
0,1056.0,2.0,1.0,39.07,2,3699.810606
1,2600.0,5.0,3.0,120.0,4,4615.384615
3,1521.0,3.0,1.0,95.0,3,6245.890861
5,1170.0,2.0,1.0,38.0,2,3247.863248
11,2785.0,5.0,3.0,295.0,4,10592.459605


In [23]:
#dropping the balcony column
df=df.drop(['balcony'],axis=1)

###**OUTLIER TREATMENT**

In [24]:
#checking for outliers in newly created price per sq. feet column
df['price_per_sqft'].describe()

count      7481.000000
mean       5992.058235
std        9160.840277
min         371.428571
25%        4279.131007
50%        5319.148936
75%        6696.629213
max      672727.272727
Name: price_per_sqft, dtype: float64

In [25]:
#function to calculate outliers
def outlier_treatment(col):
    sorted(col)
    Q1,Q3 = np.percentile(col , [25,75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range,upper_range

In [26]:
#calculating the lower and upper range
lower_range,upper_range = outlier_treatment(df['price_per_sqft'])
print("Lower Range:",lower_range)
print("Upper Range:",upper_range)

Lower Range: 652.883697879296
Upper Range: 10322.876522845458


In [27]:
#outliers of lower range
lower_values = df[df["price_per_sqft"].values < lower_range]
lower_values

Unnamed: 0,total_sqft,bath,price,size_bhk,price_per_sqft
674,35000.0,3.0,130.0,3,371.428571
11748,2400.0,3.0,12.0,3,500.0


In [28]:
#outliers of upper range
upper_values = df[df["price_per_sqft"].values > upper_range]
upper_values

Unnamed: 0,total_sqft,bath,price,size_bhk,price_per_sqft
11,2785.0,5.0,295.0,4,10592.459605
18,2770.0,4.0,290.0,3,10469.314079
22,2800.0,5.0,380.0,4,13571.428571
57,1500.0,2.0,185.0,2,12333.333333
62,5700.0,5.0,650.0,4,11403.508772
...,...,...,...,...,...
13180,3124.0,6.0,349.0,4,11171.574904
13233,1903.0,2.0,293.0,3,15396.741986
13262,1140.0,1.0,185.0,2,16228.070175
13290,4050.0,2.0,450.0,4,11111.111111


In [29]:
#total outliers
lower_outliers = lower_values.value_counts().sum(axis=0)
upper_outliers = upper_values.value_counts().sum(axis=0)
total_outliers = lower_outliers + upper_outliers

print("Total Number of Outliers:",total_outliers)

Total Number of Outliers: 389


In [30]:
# index that contains outliers

lower_index = list(df[df['price_per_sqft'] < lower_range ].index)

upper_index = list(df[df['price_per_sqft'] > upper_range ].index)

total_index = list(lower_index + upper_index)

print(total_index)

[674, 11748, 11, 18, 22, 57, 62, 107, 225, 260, 324, 349, 373, 407, 451, 465, 474, 480, 483, 583, 584, 639, 649, 666, 672, 685, 719, 736, 749, 913, 937, 950, 958, 965, 1065, 1124, 1146, 1165, 1176, 1177, 1217, 1255, 1296, 1311, 1341, 1398, 1405, 1442, 1443, 1504, 1569, 1633, 1665, 1693, 1730, 1774, 1776, 1839, 1847, 1931, 1952, 2004, 2010, 2066, 2238, 2282, 2313, 2331, 2341, 2370, 2426, 2428, 2430, 2593, 2627, 2660, 2667, 2682, 2783, 2911, 2952, 2959, 2981, 3030, 3062, 3072, 3097, 3100, 3117, 3118, 3138, 3151, 3154, 3180, 3193, 3227, 3231, 3289, 3302, 3460, 3484, 3525, 3653, 3654, 3662, 3669, 3697, 3806, 3829, 3866, 3918, 4034, 4189, 4243, 4249, 4263, 4313, 4338, 4371, 4373, 4463, 4498, 4602, 4629, 4632, 4642, 4734, 4753, 4780, 4789, 4828, 4855, 4889, 4891, 4928, 4953, 4962, 5022, 5031, 5058, 5096, 5273, 5300, 5313, 5367, 5444, 5473, 5483, 5519, 5582, 5595, 5606, 5618, 5679, 5695, 5717, 5728, 5799, 5925, 5939, 5975, 5984, 5987, 6028, 6051, 6054, 6055, 6097, 6125, 6128, 6150, 6176, 6188

In [31]:
print("Shape Before Dropping Outlier Rows:", df.shape)

df.drop(total_index, inplace = True)

print("Shape After Dropping Outlier Rows:", df.shape)

Shape Before Dropping Outlier Rows: (7481, 5)
Shape After Dropping Outlier Rows: (7092, 5)


In [32]:
df.head()

Unnamed: 0,total_sqft,bath,price,size_bhk,price_per_sqft
0,1056.0,2.0,39.07,2,3699.810606
1,2600.0,5.0,120.0,4,4615.384615
3,1521.0,3.0,95.0,3,6245.890861
5,1170.0,2.0,38.0,2,3247.863248
12,1000.0,2.0,38.0,2,3800.0


In [33]:
#function used to check how the columns are correlated with each other
df.corr()

Unnamed: 0,total_sqft,bath,price,size_bhk,price_per_sqft
total_sqft,1.0,0.676655,0.803788,0.63692,0.314677
bath,0.676655,1.0,0.72011,0.81582,0.367904
price,0.803788,0.72011,1.0,0.631049,0.690374
size_bhk,0.63692,0.81582,0.631049,1.0,0.281013
price_per_sqft,0.314677,0.367904,0.690374,0.281013,1.0


**MODEL TRAINING AND PREDICTION**

In [34]:
X=df.drop(['price'],axis=1)
y=df['price']

In [35]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [36]:
#using the Linear Regression Model
model=LinearRegression()
model.fit(X_train,y_train)

In [37]:
prediction=model.predict(X_test)

In [38]:
#accuracy of the training dataset
train_accuracy=model.score(X_train,y_train)
train_accuracy

0.8687983560428384

In [39]:
#accuracy of test dataset
test_accuracy=model.score(X_test,y_test)
test_accuracy

0.9167986493612298