# Download the data from the data source

In [None]:
import numpy as np
import matplotlib as plt
import pandas as pd
import tarfile
import urllib.request
from pathlib import Path

In [None]:
#download data file
def download_housing_data():
    """Return the housing dataset as a DataFrame, fetching it on first use.

    The CSV is read from ``dataset/housing/housing.csv``. When that file is
    missing, the tarball is extracted into ``dataset``; the tarball itself is
    downloaded first if it is not already on disk.

    Returns:
        pandas.DataFrame: the contents of ``dataset/housing/housing.csv``.

    Raises:
        urllib.error.URLError: if the archive cannot be downloaded.
        tarfile.TarError: if the archive cannot be read.
    """
    # NOTE: ".tag" looks like a typo for ".tgz". The name is kept so an archive
    # downloaded by an earlier run is still found; tarfile.open() detects the
    # compression from the file content, not from the extension.
    tarball_path = Path('dataset/housing.tag')
    csv_path = Path("dataset/housing/housing.csv")

    # Gate on the CSV rather than on the tarball: a tarball left behind by a
    # run that never finished extracting must still be unpacked.
    if not csv_path.is_file():
        if not tarball_path.is_file():
            Path('dataset').mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            # Download under a temporary name so an interrupted transfer never
            # leaves a truncated file at tarball_path.
            partial_path = tarball_path.with_name(tarball_path.name + ".part")
            urllib.request.urlretrieve(url, partial_path)
            partial_path.replace(tarball_path)

        with tarfile.open(tarball_path) as housing_tarball:
            if hasattr(tarfile, "data_filter"):
                # Extraction filters reject members that would escape the
                # target directory; only available on newer Python releases.
                housing_tarball.extractall(path='dataset', filter="data")
            else:
                housing_tarball.extractall(path='dataset')
    return pd.read_csv(csv_path)

In [None]:
# Load the dataset (downloaded and extracted on the first run).
housing = download_housing_data()

In [None]:
# First rows of the frame.
housing.head()

In [None]:
# Column dtypes and non-null counts.
housing.info()

In [None]:
# Number of rows per distinct ocean_proximity value.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics.
housing.describe()

In [None]:
# One histogram per column, 50 bins each.
housing.hist(bins=50,figsize=(8,12))

#CREATE TEST AND TRAIN DATA SET

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Random split with 20% of the rows held out; random_state pins the shuffle
# so the split is reproducible.
train_set, test_set = train_test_split(housing ,test_size=0.2,random_state=42)

In [None]:
# Row counts of the two splits.
len(train_set),len(test_set)

# Creating a categorical variable from median_income

In [None]:
housing['income_cat']=pd.cut(housing['median_income'],bins=[0.,1.5,3,4.5,6,np.inf],labels=[1,2,3,4,5])

#bar graph reprecentation

In [None]:
housing["income_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)

In [None]:
#create train and test data in the behaf of income_cat
from sklearn.model_selection import StratifiedShuffleSplit

splitter = StratifiedShuffleSplit(n_splits=10,test_size=0.2,random_state=42)
start_spliter=[]
for train_index,test_index in splitter.split(housing,housing['income_cat']):
    train_set_n = housing.iloc[train_index]
    test_set_n = housing.iloc[test_index]
    start_spliter.append([train_set_n,test_set_n])

In [None]:
strat_train_set, strat_test_set = start_spliter[0]

In [None]:
#auternative 
strat_train_set, strat_test_set = train_test_split(
housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

In [None]:
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [None]:
housing = strat_train_set.copy()

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True)

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True,alpha=0.2)

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True,
s=housing["population"] / 100, label="population",
c="median_house_value", cmap="jet", colorbar=True,
legend=True, sharex=False, figsize=(10, 7))


In [None]:
# Pairwise correlation between the numeric features.
# housing still carries the string column ocean_proximity, which
# DataFrame.corr() rejects on pandas >= 2.0, so restrict to numeric columns
# first (the same select_dtypes selection the imputer cell uses).
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
#correlation with graph
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms","housing_median_age"]
scatter_matrix(housing[attributes],figsize=(12,8))

In [None]:
housing.plot(kind="scatter", x="median_income", y="median_house_value",
alpha=0.1, grid=True)


In [None]:
# Derive ratio features from the raw count columns
households = housing["households"]
total_rooms = housing["total_rooms"]
housing["rooms_per_house"] = total_rooms / households
housing["bedrooms_ratio"] = housing["total_bedrooms"] / total_rooms
housing["people_per_house"] = housing["population"] / households

In [None]:
# Recompute the correlations on the numeric columns only: housing still holds
# the string column ocean_proximity, which DataFrame.corr() rejects on
# pandas >= 2.0.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
# Rank every feature by its correlation with the target.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Separate the predictors from the label column
label_column = "median_house_value"
housing_labels = strat_train_set[label_column].copy()
housing = strat_train_set.drop(columns=label_column)

In [None]:
#handeling messing values for hole data set with simple imputer
from sklearn.impute import SimpleImputer

#create instances
imputer = SimpleImputer(strategy='median')

#slect only neumerical data
housing_num = housing.select_dtypes(include=[np.number])

#fit it
imputer.fit(housing_num)


In [None]:
imputer.statistics_

In [None]:
housing_num.median().values

In [None]:
#replace the null values using thire corespondig statics values
x= imputer.transform(housing_num)

In [None]:
housing_data = pd.DataFrame(x,columns=housing_num.columns,index=housing_num.index)

In [None]:
housing_data

In [None]:
# Double brackets keep ocean_proximity as a one-column DataFrame, which is
# what gets passed to the encoders below.
housing_cat_ = housing[['ocean_proximity']]

In [None]:
housing_cat_.head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

Ordinal_Encoder =OrdinalEncoder()

# Fit on the category column and encode it in one step.
data_ordinal_enco = Ordinal_Encoder.fit_transform(housing_cat_)

In [None]:
# First eight encoded rows.
data_ordinal_enco[:8]

In [None]:
# Categories learned during fit.
Ordinal_Encoder.categories_

In [None]:
# Input column names seen during fit.
Ordinal_Encoder.feature_names_in_

In [None]:
# Output column names produced by the encoder.
Ordinal_Encoder.get_feature_names_out()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Encoder built with its default settings.
One_Hot_Encoder = OneHotEncoder()

# Fit on the category column and encode it in one step.
One_cat_data = One_Hot_Encoder.fit_transform(housing_cat_)

In [None]:
One_cat_data

In [None]:
# Dense view of the encoded data.
One_cat_data.toarray()

In [None]:
# Categories learned during fit.
One_Hot_Encoder.categories_

In [None]:
# Small hand-made frame with two ocean_proximity values.
df_test = pd.DataFrame({"ocean_proximity": ["INLAND", "NEAR BAY"]})

In [None]:
df_test

In [None]:
# get_dummies builds its columns from the values present in this frame only.
pd.get_dummies(df_test)

In [None]:
# NOTE(review): the name suggests categories the encoder has not seen; confirm
# whether these two values are really absent from the fitted categories.
df_test_unknown = pd.DataFrame({"ocean_proximity": ["<2H OCEAN", "ISLAND"]})
pd.get_dummies(df_test_unknown)

In [None]:
# Switch the already-fitted encoder to ignore unknown categories, then encode
# the second test frame with it.
One_Hot_Encoder.handle_unknown = "ignore"
df=One_Hot_Encoder.transform(df_test_unknown)

In [None]:
# Dense view of the encoded test frame.
df.toarray()

In [None]:
# Input column names seen during fit.
One_Hot_Encoder.feature_names_in_

In [None]:
# Output column names produced by the encoder.
One_Hot_Encoder.get_feature_names_out()

In [None]:
# Wrap the one-hot output in a labelled DataFrame.
# One_Hot_Encoder was created with default settings, so transform() yields a
# SciPy sparse matrix. Densify it with .toarray() (as the display cells for
# the encoder output do) so pd.DataFrame receives a regular 2-D array that
# lines up with the supplied columns and index.
df_output = pd.DataFrame(
    One_Hot_Encoder.transform(df_test_unknown).toarray(),
    columns=One_Hot_Encoder.get_feature_names_out(),
    index=df_test_unknown.index,
)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every numeric column into the range [-1, 1].
min_max_scaler = MinMaxScaler(feature_range=(-1,1))

# NOTE(review): this fits on housing_num, not on the imputed housing_data;
# confirm that is intended.
housing_num_min_max_scal = min_max_scaler.fit_transform(housing_num)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns with the scaler's default settings.
std_scaler = StandardScaler()

housing_num_std_scaler = std_scaler.fit_transform(housing_num)

In [None]:
# First three standardised rows.
housing_num_std_scaler[:3]

In [None]:
# Same scaler with centring turned off (with_mean=False).
std_scaler_without_mean=StandardScaler(with_mean=False)

# NOTE: rebinds df, which an earlier cell used for the one-hot output.
df=std_scaler_without_mean.fit_transform(housing_num)

In [None]:
# First three rows scaled without centring.
df[:3]

In [None]:
from sklearn.metrics.pairwise import rbf_kernel
# RBF similarity between each housing_median_age value and 35, with gamma=0.1.
age_simil_35 = rbf_kernel(housing[["housing_median_age"]], [[35]], gamma=0.1)

In [None]:
age_simil_35

In [None]:
# First five rows of median_income, used below as sample input for predict().
some_new_data = housing[["median_income"]].iloc[:5] 

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.compose import TransformedTargetRegressor

# Linear regression on median_income alone. The wrapper passes the labels
# through StandardScaler for fitting, so predictions come back in label units.
income_feature = housing[["median_income"]]
model = TransformedTargetRegressor(
    LinearRegression(),
    transformer=StandardScaler(),
)
model.fit(income_feature, housing_labels)
predictions = model.predict(some_new_data)


In [None]:
#logrethemuc trasformation
from sklearn.preprocessing import FunctionTransformer

log_transform = FunctionTransformer(np.log,inverse_func=np.exp)

target_log_transform = log_transform.transform(housing[["population"]])

In [None]:
target_log_transform

In [None]:
rbf_transform = FunctionTransformer(rbf_kernel,kw_args=dict(Y=[[35]],gamma=0.1))

rbf_transform_df = rbf_transform.transform(housing[['housing_median_age']])

In [None]:
#geography similarity between each distict and sunsfrasco
sf_coords = 37.7749, -122.41

sf_transform = FunctionTransformer(rbf_kernel,kw_args=dict(Y=[sf_coords],gamma= 0.1))

sf_transform_similarity=sf_transform.transform(housing[["latitude", "longitude"]])