In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import re
import random
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler,StandardScaler,OneHotEncoder,LabelEncoder,PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score,d2_pinball_score
import pycaret

In [2]:
df = pd.read_csv('../data/housing.csv', skipinitialspace=True)
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
null_index_list = df[pd.isna(df['total_bedrooms'])].index

In [5]:
df.duplicated().sum()

0

In [6]:
from pycaret.regression import RegressionExperiment
reg = RegressionExperiment()

In [7]:
reg.setup(df,target='median_house_value')

Unnamed: 0,Description,Value
0,Session id,260
1,Target,median_house_value
2,Target type,Regression
3,Original data shape,"(20640, 10)"
4,Transformed data shape,"(20640, 14)"
5,Transformed train set shape,"(14447, 14)"
6,Transformed test set shape,"(6193, 14)"
7,Numeric features,8
8,Categorical features,1
9,Rows with missing values,1.0%


<pycaret.regression.oop.RegressionExperiment at 0x7f42b11cbee0>

In [8]:
# best = reg.compare_models()

In [9]:
print(f"frequency-->{df['total_bedrooms'].value_counts().values[:10]}")
print(f"value-->{df['total_bedrooms'].value_counts().keys()[:10]}")

frequency-->[55 51 50 49 49 48 48 48 47 47]
value-->Float64Index([280.0, 331.0, 345.0, 343.0, 393.0, 328.0, 348.0, 394.0, 272.0,
              309.0],
             dtype='float64')


# gonna fill null values with the top 10 most frequent unique values,since uniques have very close frequency🛠️

In [10]:
class Do(LabelEncoder):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)
        
    def filling(self):
        for index in range(len(df.index)):
            if pd.isna(df.loc[index,'total_bedrooms']):
                df.loc[index,'total_bedrooms']=random.choice(df['total_bedrooms'].value_counts().keys()[:10])
            else:
                continue

        print('Done')
        for index in null_index_list[:10]:
            print(df.loc[index,'total_bedrooms'])
            
    def check_sign(self):
        for column in df.columns:
            for index,value in enumerate(df[column].unique()):
                if re.search(r'[^\w\s]',str(value)):
                    if '.' not in list(str(value)):
                        print(f'{column}--> {value}--> {index}')
                        
    def give_num_unique(self):
        for column in df.columns:
            print(f"{column}--> {len(df[column].unique())}")
            
    def encode(self):
        LEN = LabelEncoder()
        df['ocean_proximity'] = LEN.fit_transform(df['ocean_proximity'])
        
    
        


In [11]:
Do().filling()

Done
272.0
272.0
343.0
328.0
393.0
272.0
393.0
345.0
345.0
280.0


In [12]:
df.isnull().sum().sum()

0

In [13]:
Do().check_sign()

ocean_proximity--> <1H OCEAN--> 1


# OK, we are now sure there is no irrational value like '?' 

In [14]:
Do().give_num_unique()

longitude--> 844
latitude--> 862
housing_median_age--> 52
total_rooms--> 5926
total_bedrooms--> 1923
population--> 3888
households--> 1815
median_income--> 12928
median_house_value--> 3842
ocean_proximity--> 5


In [15]:
Do().encode()


In [16]:
df['ocean_proximity'].value_counts()

0    9136
1    6551
4    2658
3    2290
2       5
Name: ocean_proximity, dtype: int64

In [17]:
cor = df.corr()
cor['median_house_value'].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
ocean_proximity       0.081750
households            0.065843
total_bedrooms        0.049410
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

In [18]:
df['mean_num_family'] = df['population']/df['households']

In [19]:
cor = df.corr()
cor['median_house_value'].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
ocean_proximity       0.081750
households            0.065843
total_bedrooms        0.049410
mean_num_family      -0.023737
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

In [20]:
# df = df.drop(columns=['mean_num_family','ocean_proximity','households','total_bedrooms','population','longitude','latitude'],axis=1)

In [21]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,mean_num_family
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,3,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,3,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,3,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,3,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,3,2.181467


# for the reason that I can not understand it well, models can work better on the original data than data with deducted low correlated features😕

In [22]:
df =  pd.DataFrame(PowerTransformer().fit_transform(df),columns=df.columns)

In [23]:
df.skew()

longitude             0.000000
latitude              0.153850
housing_median_age   -0.113089
total_rooms           0.121378
total_bedrooms        0.106429
population            0.110641
households            0.109520
median_income        -0.002538
median_house_value   -0.012149
ocean_proximity       0.201653
mean_num_family      -0.106888
dtype: float64

In [24]:
# df.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
# s=df["population"]/100, label="population", figsize=(10,7),
# c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
# plt.legend()


In [25]:
X = df.drop(columns=['median_house_value'],axis=1)
Y = df['median_house_value']

In [26]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=100)
print(x_train.shape)
print(y_test.shape)

(16512, 10)
(4128,)


In [27]:
mms = MinMaxScaler(feature_range=(0,1))
x_train = mms.fit_transform(x_train)
x_test = mms.transform(x_test)

In [28]:
x_train

array([[0.12820513, 0.84182426, 0.29082996, ..., 0.51234495, 0.55910099,
        0.5704882 ],
       [0.28490028, 0.47246882, 0.24784563, ..., 0.49536638, 1.        ,
        0.49804813],
       [0.15954416, 0.92633422, 0.39355362, ..., 0.50814317, 0.55910099,
        0.45961222],
       ...,
       [0.64672365, 0.27185349, 0.05718713, ..., 0.6820536 , 0.55910099,
        0.58327867],
       [0.66096866, 0.04069486, 0.71112269, ..., 0.39742152, 1.        ,
        0.55702191],
       [0.52991453, 0.23427415, 0.93387296, ..., 0.61554143, 0.        ,
        0.5145662 ]])

In [29]:
linear = SVR(kernel='linear',gamma='auto',C=1)
linear.fit(x_train,y_train)
li_pre = linear.predict(x_test)
r2_score(y_test,li_pre)

0.6976350110728666

In [30]:
rbf = SVR(kernel='rbf',gamma='scale',C=0.1)
poly = SVR(kernel='poly',degree=3,C=1)

In [31]:
rbf.fit(x_train,y_train)
poly.fit(x_train,y_train)

In [32]:
rbf.score(x_test,y_test)

0.7811529385974698

In [33]:
rbf_pred = rbf.predict(x_test)
poly_pred = poly.predict(x_test)
poly_pred_train = poly.predict(x_train)
rbf_pred_train = rbf.predict(x_train)
print(r2_score(y_test,rbf_pred))
print(r2_score(y_test,poly_pred))

0.7811529385974698
0.7685406112545086


In [34]:
print(f"poly accuracy score-->{poly.score(x_test,y_test)}")
print(f"rbf accuracy score--> {rbf.score(x_test,y_test)}")
print(f" poly mean-squaree-error--> {mean_squared_error(y_test,poly_pred)}")
print(f" poly mean-squaree-error_train--> {mean_squared_error(y_train,poly_pred_train)}")
print('\n')
print(f" rbf mean-squaree-error--> {mean_squared_error(y_test,rbf_pred)}")
print(f"rbf mean-squaree-error_train--> {mean_squared_error(y_train,rbf_pred_train)}")





poly accuracy score-->0.7685406112545086
rbf accuracy score--> 0.7811529385974698
 poly mean-squaree-error--> 0.23517398464154582
 poly mean-squaree-error_train--> 0.2354764190224435


 rbf mean-squaree-error--> 0.22235924727909132
rbf mean-squaree-error_train--> 0.2290176717430492


In [35]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [36]:
lr.fit(x_train,y_train)

In [37]:
y_pred = lr.predict(x_test)

In [38]:
lr.score(x_test,y_test)

0.6981051627210666

In [39]:
r2_score(y_test,y_pred)

0.6981051627210666

In [40]:
rbf = SVR(kernel='rbf',gamma='auto',C=1.0,)
poly = SVR(kernel='poly',degree=2,C=1.0)
lin = SVR(kernel='linear',gamma='auto',C=1.0,)

In [41]:
rbf.fit(x_train,y_train)
poly.fit(x_train,y_train)
lin.fit(x_train,y_train)

In [42]:
rbf_pred = rbf.predict(x_test)
poly_pred = poly.predict(x_test)
lin_pred = lin.predict(x_test)
poly_pred_train = poly.predict(x_train)
rbf_pred_train = rbf.predict(x_train)
lin_pred_train = lin.predict(x_train)
print(r2_score(y_test,rbf_pred))
print(r2_score(y_test,lin_pred))

0.7365076485594639
0.6976350110728666


In [43]:
print(f"poly accuracy score-->{poly.score(x_test,y_test)}")
print(f"rbf accuracy score--> {rbf.score(x_test,y_test)}")
print(f"lin accuracy score--> {lin.score(x_test,y_test)}")
print(f" rbf mean-squaree-error--> {mean_squared_error(y_test,rbf_pred)}")
print(f"rbf mean-squaree-error_train--> {mean_squared_error(y_train,rbf_pred_train)}")
print(f"linear mean-squaree-error--> {mean_squared_error(y_test,lin_pred)}")
print(f" linear mean-squaree-error_train--> {mean_squared_error(y_train,lin_pred_train)}")
# print(f"rbf root_mean-squaree-error_train--> {mean_squared_error(y_test,rbf_pred_train,squared=False)}")
# print(f"liner root_mean-squaree-error_train--> {mean_squared_error(y_test,lin_pred_train,squared=False)}")


poly accuracy score-->0.7530967451297914
rbf accuracy score--> 0.7365076485594639
lin accuracy score--> 0.6976350110728666
 rbf mean-squaree-error--> 0.26772103109188944
rbf mean-squaree-error_train--> 0.28300723876659434
linear mean-squaree-error--> 0.3072175194426022
 linear mean-squaree-error_train--> 0.32587267440589407


In [44]:
d2_pinball_score(y_test,lin_pred,)

0.48440366413606406

In [45]:
rbf = SVR(kernel='rbf',gamma=6,C=5,epsilon=0.1,shrinking=True)
rbf.fit(x_train,y_train)
pred = rbf.predict(x_test)
pred_train = rbf.predict(x_train)
print(r2_score(y_test,pred))
print(mean_squared_error(y_test,pred))
print(mean_squared_error(y_train,pred_train))

0.8314485434784378
0.17125646905988462
0.14128942741484796
