In [37]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




Remove blank using Panda lib. Used in data science

In [38]:
columns_to_keep = ["State", "City", "Property_Type", "Size_in_SqFt", "Price_in_Lakhs"]
cp = pd.read_csv(r'D:\AI_Practice\india_housing_prices.csv', usecols=columns_to_keep)
cp


Unnamed: 0,State,City,Property_Type,Size_in_SqFt,Price_in_Lakhs
0,Tamil Nadu,Chennai,Apartment,4740.0,489.76
1,Maharashtra,Pune,Independent House,2364.0,195.52
2,Punjab,Ludhiana,Apartment,3642.0,183.79
3,Rajasthan,Jodhpur,Independent House,2741.0,300.29
4,Rajasthan,Jaipur,Villa,4823.0,182.90
...,...,...,...,...,...
249995,Gujarat,Ahmedabad,Independent House,1995.0,274.75
249996,Tamil Nadu,Chennai,Apartment,2293.0,13.40
249997,Telangana,Warangal,Villa,2910.0,236.94
249998,Odisha,Cuttack,Villa,1149.0,141.66


In [39]:
 # Remove last col
x = cp.iloc[:,:-1]
y = cp.iloc[:,-1]
x 

Unnamed: 0,State,City,Property_Type,Size_in_SqFt
0,Tamil Nadu,Chennai,Apartment,4740.0
1,Maharashtra,Pune,Independent House,2364.0
2,Punjab,Ludhiana,Apartment,3642.0
3,Rajasthan,Jodhpur,Independent House,2741.0
4,Rajasthan,Jaipur,Villa,4823.0
...,...,...,...,...
249995,Gujarat,Ahmedabad,Independent House,1995.0
249996,Tamil Nadu,Chennai,Apartment,2293.0
249997,Telangana,Warangal,Villa,2910.0
249998,Odisha,Cuttack,Villa,1149.0


In [40]:
y

0         489.76
1         195.52
2         183.79
3         300.29
4         182.90
           ...  
249995    274.75
249996     13.40
249997    236.94
249998    141.66
249999    199.69
Name: Price_in_Lakhs, Length: 250000, dtype: float64

In [41]:
cp.isnull().sum()

State             0
City              0
Property_Type     0
Size_in_SqFt      1
Price_in_Lakhs    0
dtype: int64

In [42]:
cp['Size_in_SqFt'] = cp['Size_in_SqFt'].fillna(cp['Size_in_SqFt'].mean())

In [43]:
cp.isnull().sum()

State             0
City              0
Property_Type     0
Size_in_SqFt      0
Price_in_Lakhs    0
dtype: int64

In [44]:
columns_to_keep = ["State", "City", "Property_Type", "Size_in_SqFt", "Price_in_Lakhs"]
xp = pd.read_csv('D:\AI_Practice\india_housing_prices.csv', usecols=columns_to_keep)
xp


Unnamed: 0,State,City,Property_Type,Size_in_SqFt,Price_in_Lakhs
0,Tamil Nadu,Chennai,Apartment,4740.0,489.76
1,Maharashtra,Pune,Independent House,2364.0,195.52
2,Punjab,Ludhiana,Apartment,3642.0,183.79
3,Rajasthan,Jodhpur,Independent House,2741.0,300.29
4,Rajasthan,Jaipur,Villa,4823.0,182.90
...,...,...,...,...,...
249995,Gujarat,Ahmedabad,Independent House,1995.0,274.75
249996,Tamil Nadu,Chennai,Apartment,2293.0,13.40
249997,Telangana,Warangal,Villa,2910.0,236.94
249998,Odisha,Cuttack,Villa,1149.0,141.66


In [45]:
# for panda array
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(xp.iloc[:, 3:5])
xp.iloc[:, 3:5] = imputer.transform(xp.iloc[:, 3:5])

In [46]:
xp.isnull().sum()

State             0
City              0
Property_Type     0
Size_in_SqFt      0
Price_in_Lakhs    0
dtype: int64

In [47]:
# Encoding of  categories or text into binary data

In [48]:
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [0, 1])],
    remainder='passthrough'  # Keep numerical columns as they are
)

xxp = np.array(ct.fit_transform(xp))
xxp

array([[0.0, 0.0, 0.0, ..., 'Apartment', 4740.0, 489.76],
       [0.0, 0.0, 0.0, ..., 'Independent House', 2364.0, 195.52],
       [0.0, 0.0, 0.0, ..., 'Apartment', 3642.0, 183.79],
       ...,
       [0.0, 0.0, 0.0, ..., 'Villa', 2910.0, 236.94],
       [0.0, 0.0, 0.0, ..., 'Villa', 1149.0, 141.66],
       [0.0, 0.0, 0.0, ..., 'Independent House', 4313.0, 199.69]],
      shape=(250000, 65), dtype=object)

In [49]:
ct.get_feature_names_out()


array(['encoder__State_Andhra Pradesh', 'encoder__State_Assam',
       'encoder__State_Bihar', 'encoder__State_Chhattisgarh',
       'encoder__State_Delhi', 'encoder__State_Gujarat',
       'encoder__State_Haryana', 'encoder__State_Jharkhand',
       'encoder__State_Karnataka', 'encoder__State_Kerala',
       'encoder__State_Madhya Pradesh', 'encoder__State_Maharashtra',
       'encoder__State_Odisha', 'encoder__State_Punjab',
       'encoder__State_Rajasthan', 'encoder__State_Tamil Nadu',
       'encoder__State_Telangana', 'encoder__State_Uttar Pradesh',
       'encoder__State_Uttarakhand', 'encoder__State_West Bengal',
       'encoder__City_Ahmedabad', 'encoder__City_Amritsar',
       'encoder__City_Bangalore', 'encoder__City_Bhopal',
       'encoder__City_Bhubaneswar', 'encoder__City_Bilaspur',
       'encoder__City_Chennai', 'encoder__City_Coimbatore',
       'encoder__City_Cuttack', 'encoder__City_Dehradun',
       'encoder__City_Durgapur', 'encoder__City_Dwarka',
       'encoder_

In [50]:
# Labeling the data based yes no or any 

In [51]:
le = LabelEncoder()

y = cp.iloc[:,2] = np.array(le.fit_transform(cp.iloc[:,2]))
# cp[:5]
y

array([0, 1, 0, ..., 2, 2, 1], shape=(250000,))

In [52]:
# Splitting the data set into training and test dataset 

In [53]:
X_train, X_test , Y_train, Y_test = train_test_split(xxp,y, test_size = 0.2, random_state = 1)

In [54]:
X_train

array([[0.0, 0.0, 0.0, ..., 'Villa', 2976.0, 194.08],
       [0.0, 0.0, 0.0, ..., 'Apartment', 2990.0, 269.74],
       [0.0, 0.0, 0.0, ..., 'Villa', 1411.0, 297.03],
       ...,
       [0.0, 0.0, 0.0, ..., 'Villa', 1056.0, 193.64],
       [0.0, 0.0, 0.0, ..., 'Independent House', 2612.0, 206.07],
       [0.0, 0.0, 0.0, ..., 'Independent House', 1378.0, 299.18]],
      shape=(200000, 65), dtype=object)

In [55]:
Y_train

array([2, 0, 2, ..., 2, 1, 1], shape=(200000,))

In [56]:
X_test

array([[0.0, 0.0, 0.0, ..., 'Apartment', 3342.0, 108.05],
       [0.0, 0.0, 0.0, ..., 'Villa', 4417.0, 25.74],
       [0.0, 0.0, 0.0, ..., 'Apartment', 1850.0, 41.04],
       ...,
       [0.0, 0.0, 0.0, ..., 'Independent House', 4600.0, 130.18],
       [0.0, 0.0, 0.0, ..., 'Independent House', 1016.0, 372.22],
       [0.0, 0.0, 0.0, ..., 'Independent House', 4460.0, 282.68]],
      shape=(50000, 65), dtype=object)

In [57]:
Y_test

array([0, 2, 0, ..., 1, 1, 1], shape=(50000,))

In [58]:
# # Feature SCaling
# Simple linear regression = b0 + b1x1
# multiple linear regression = b0 + b1x1 + b2x2 + .. +bnxn
# polynomial linear regression = b0 + b1x1 + b2(x2)2 + .. +bn(xn)n

# We also do standarization and normalization


In [61]:
sc = StandardScaler()
X_train[:, -2:]  = sc.fit_transform(X_train[:, -2:])
X_test[:, -2:]  = sc.transform(X_test[:, -2:])

In [62]:
X_train

array([[0.0, 0.0, 0.0, ..., 'Villa', 0.1731626072026661,
        -0.42876040397183773],
       [0.0, 0.0, 0.0, ..., 'Apartment', 0.1839233189131494,
        0.10691201560542155],
       [0.0, 0.0, 0.0, ..., 'Villa', -1.0297312375763574,
        0.3001250783897645],
       ...,
       [0.0, 0.0, 0.0, ..., 'Villa', -1.302592141663612,
        -0.43187560175949313],
       [0.0, 0.0, 0.0, ..., 'Independent House', -0.1066158972698991,
        -0.3438712642582331],
       [0.0, 0.0, 0.0, ..., 'Independent House', -1.0550957723224965,
        0.31534706757944364]], shape=(200000, 65), dtype=object)

In [63]:
X_test

array([[0.0, 0.0, 0.0, ..., 'Apartment', 0.45447835620530036,
        -1.0378523714081531],
       [0.0, 0.0, 0.0, ..., 'Villa', 1.2807472911174091,
        -1.6206067575488379],
       [0.0, 0.0, 0.0, ..., 'Apartment', -0.6923060632262032,
        -1.5122828344780996],
       ...,
       [0.0, 0.0, 0.0, ..., 'Independent House', 1.4214051656187263,
        -0.8811720826790393],
       [0.0, 0.0, 0.0, ..., 'Independent House', -1.3333370322649927,
        0.8324699003302107],
       [0.0, 0.0, 0.0, ..., 'Independent House', 1.3137980485138936,
        0.19852715054237283]], shape=(50000, 65), dtype=object)