In [85]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split




Handle missing (blank) values with the pandas library, which is widely used in data science.

In [86]:
# Read only the five columns used in this notebook from the housing-price CSV.
columns_to_keep = [
    "State",
    "City",
    "Property_Type",
    "Size_in_SqFt",
    "Price_in_Lakhs",
]
cp = pd.read_csv(
    r'D:\AI_Practice\india_housing_prices.csv',
    usecols=columns_to_keep,
)
cp


Unnamed: 0,State,City,Property_Type,Size_in_SqFt,Price_in_Lakhs
0,Tamil Nadu,Chennai,Apartment,4740.0,489.76
1,Maharashtra,Pune,Independent House,2364.0,195.52
2,Punjab,Ludhiana,Apartment,3642.0,183.79
3,Rajasthan,Jodhpur,Independent House,2741.0,300.29
4,Rajasthan,Jaipur,Villa,4823.0,182.90
...,...,...,...,...,...
249995,Gujarat,Ahmedabad,Independent House,1995.0,274.75
249996,Tamil Nadu,Chennai,Apartment,2293.0,13.40
249997,Telangana,Warangal,Villa,2910.0,236.94
249998,Odisha,Cuttack,Villa,1149.0,141.66


In [87]:
# Split into features (every column except the last) and target (the last column)
# x holds every column except the last one; y holds the last column only.
# NOTE(review): this split runs before the missing Size_in_SqFt value is filled
# in cp further down, so x may still contain that NaN - confirm the cell order.
x, y = cp.iloc[:, :-1], cp.iloc[:, -1]
x

Unnamed: 0,State,City,Property_Type,Size_in_SqFt
0,Tamil Nadu,Chennai,Apartment,4740.0
1,Maharashtra,Pune,Independent House,2364.0
2,Punjab,Ludhiana,Apartment,3642.0
3,Rajasthan,Jodhpur,Independent House,2741.0
4,Rajasthan,Jaipur,Villa,4823.0
...,...,...,...,...
249995,Gujarat,Ahmedabad,Independent House,1995.0
249996,Tamil Nadu,Chennai,Apartment,2293.0
249997,Telangana,Warangal,Villa,2910.0
249998,Odisha,Cuttack,Villa,1149.0


In [88]:
# Display the target series (the last column of cp, taken in the split above).
y

0         489.76
1         195.52
2         183.79
3         300.29
4         182.90
           ...  
249995    274.75
249996     13.40
249997    236.94
249998    141.66
249999    199.69
Name: Price_in_Lakhs, Length: 250000, dtype: float64

In [89]:
# Count the missing values in each column of cp.
cp.isna().sum()

State             0
City              0
Property_Type     0
Size_in_SqFt      1
Price_in_Lakhs    0
dtype: int64

In [90]:
# Fill the missing Size_in_SqFt entries with the mean of that column.
size_mean = cp['Size_in_SqFt'].mean()
cp['Size_in_SqFt'] = cp['Size_in_SqFt'].fillna(size_mean)

In [91]:
# Re-count the missing values per column after the fill.
cp.isna().sum()

State             0
City              0
Property_Type     0
Size_in_SqFt      0
Price_in_Lakhs    0
dtype: int64

In [92]:
# Load a second, independent copy of the same columns into xp; cp has already
# been modified by the fillna step above.
# The path must be a raw string: in a normal string literal "\A" and "\i" are
# invalid escape sequences, which Python reports with a warning and plans to
# reject outright.
columns_to_keep = ["State", "City", "Property_Type", "Size_in_SqFt", "Price_in_Lakhs"]
xp = pd.read_csv(r'D:\AI_Practice\india_housing_prices.csv', usecols=columns_to_keep)
xp


Unnamed: 0,State,City,Property_Type,Size_in_SqFt,Price_in_Lakhs
0,Tamil Nadu,Chennai,Apartment,4740.0,489.76
1,Maharashtra,Pune,Independent House,2364.0,195.52
2,Punjab,Ludhiana,Apartment,3642.0,183.79
3,Rajasthan,Jodhpur,Independent House,2741.0,300.29
4,Rajasthan,Jaipur,Villa,4823.0,182.90
...,...,...,...,...,...
249995,Gujarat,Ahmedabad,Independent House,1995.0,274.75
249996,Tamil Nadu,Chennai,Apartment,2293.0,13.40
249997,Telangana,Warangal,Villa,2910.0,236.94
249998,Odisha,Cuttack,Villa,1149.0,141.66


In [93]:
# Mean-impute the columns at positions 3 and 4 of xp with scikit-learn.
# NOTE(review): position 4 is the Price_in_Lakhs target - confirm that it is
# meant to be imputed together with Size_in_SqFt.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
xp.iloc[:, 3:5] = imputer.fit_transform(xp.iloc[:, 3:5])

In [94]:
# Count the missing values in each column of xp after imputation.
xp.isna().sum()

State             0
City              0
Property_Type     0
Size_in_SqFt      0
Price_in_Lakhs    0
dtype: int64

In [95]:
# Encode categorical text columns as binary (one-hot) indicator columns

In [96]:
# One-hot encode the columns at positions 0 and 1 (State and City); every other
# column is passed through unchanged and placed after the encoded columns.
# NOTE(review): Property_Type is passed through as text, which is why the
# resulting array has dtype=object - confirm whether it should be encoded too.
onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ct = ColumnTransformer(
    transformers=[('encoder', onehot, [0, 1])],
    remainder='passthrough',
)

xxp = np.array(ct.fit_transform(xp))
xxp

array([[0.0, 0.0, 0.0, ..., 'Apartment', 4740.0, 489.76],
       [0.0, 0.0, 0.0, ..., 'Independent House', 2364.0, 195.52],
       [0.0, 0.0, 0.0, ..., 'Apartment', 3642.0, 183.79],
       ...,
       [0.0, 0.0, 0.0, ..., 'Villa', 2910.0, 236.94],
       [0.0, 0.0, 0.0, ..., 'Villa', 1149.0, 141.66],
       [0.0, 0.0, 0.0, ..., 'Independent House', 4313.0, 199.69]],
      shape=(250000, 65), dtype=object)

In [97]:
# List the output column names produced by the fitted ColumnTransformer
# (one "encoder__<column>_<category>" entry per one-hot column).
ct.get_feature_names_out()


array(['encoder__State_Andhra Pradesh', 'encoder__State_Assam',
       'encoder__State_Bihar', 'encoder__State_Chhattisgarh',
       'encoder__State_Delhi', 'encoder__State_Gujarat',
       'encoder__State_Haryana', 'encoder__State_Jharkhand',
       'encoder__State_Karnataka', 'encoder__State_Kerala',
       'encoder__State_Madhya Pradesh', 'encoder__State_Maharashtra',
       'encoder__State_Odisha', 'encoder__State_Punjab',
       'encoder__State_Rajasthan', 'encoder__State_Tamil Nadu',
       'encoder__State_Telangana', 'encoder__State_Uttar Pradesh',
       'encoder__State_Uttarakhand', 'encoder__State_West Bengal',
       'encoder__City_Ahmedabad', 'encoder__City_Amritsar',
       'encoder__City_Bangalore', 'encoder__City_Bhopal',
       'encoder__City_Bhubaneswar', 'encoder__City_Bilaspur',
       'encoder__City_Chennai', 'encoder__City_Coimbatore',
       'encoder__City_Cuttack', 'encoder__City_Dehradun',
       'encoder__City_Durgapur', 'encoder__City_Dwarka',
       'encoder_

In [98]:
# Label-encode a categorical column: map each distinct text label (e.g. yes/no, or any set of categories) to an integer code

In [99]:
# Replace the Property_Type text labels in cp with integer codes.
le = LabelEncoder()

# Assign by column name so the whole column is replaced and takes the integer
# dtype of the codes. A positional `cp.iloc[:, 2] = ...` write goes into the
# existing text column instead, so depending on the pandas version the column
# may keep its original dtype or the write may be rejected.
cp['Property_Type'] = le.fit_transform(cp['Property_Type'])
cp.head()

Unnamed: 0,State,City,Property_Type,Size_in_SqFt,Price_in_Lakhs
0,Tamil Nadu,Chennai,0,4740.0,489.76
1,Maharashtra,Pune,1,2364.0,195.52
2,Punjab,Ludhiana,0,3642.0,183.79
3,Rajasthan,Jodhpur,1,2741.0,300.29
4,Rajasthan,Jaipur,2,4823.0,182.9


In [100]:
# Split the dataset into training and test sets