import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('/kaggle/input/usa-housing-price/USA_Housing.csv')

In [None]:
df.head()

In [None]:
from packaging import version
import sklearn

assert version.parse(sklearn.__version__) >= version.parse("1.0.1")

In [None]:
df.info()

# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, 
                                       random_state=42)

In [None]:
print(len(train_set), len(test_set))

# Feature Engineering

## Imputation using sklearn.impute

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [None]:
df_num = df.select_dtypes(include=[np.number])

In [None]:
df_num.head()

In [None]:
df_num.median().values

In [None]:
imputer.fit(df_num)

In [None]:
X = imputer.transform(df_num)

In [None]:
X

In [None]:
df_tr = pd.DataFrame(X, columns=df_num.columns,
                          index=df_num.index)

In [None]:
df_tr.head()

# Outlier Detection using Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

isolation_forest = IsolationForest(random_state=42)
outlier_pred = isolation_forest.fit_predict(X)

In [None]:
outlier_pred

In [None]:
df_tr['outlier_pred'] = outlier_pred

In [None]:
df_tr.head()

In [None]:
df_tr.loc[df_tr['outlier_pred']==-1]

# Handling Text and Categorical Attributes

In [None]:
df.info()

In [None]:
df['Address']

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Male = 0 , female =1
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(df)

In [None]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(df)

## Feature Scaling

In [None]:
df_tr.head()

In [None]:
df_tr = df_tr.drop('outlier_pred',axis = 1)

In [None]:
df_tr.head()

In [None]:
df_scale = df_tr.copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
df_num_min_max_scaled = min_max_scaler.fit_transform(df_scale)

In [None]:
df_num_min_max_scaled

In [None]:
df_scale = df_num.copy()

In [None]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
df_num_std_scaled = std_scaler.fit_transform(df_scale)

In [None]:
len(df_num_std_scaled)

# Apply Linear Regression

In [None]:
df_std_scaled = pd.DataFrame(df_num_std_scaled, columns=df_num.columns,
                          index=df_num.index)

In [None]:
df_std_scaled.head()

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df_std_scaled, test_size=0.2, 
                                       random_state=42)

In [None]:
train_set.info()

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(train_set[["Avg. Area Income","Avg. Area House Age"]],train_set['Price'])



In [None]:
scaled_predictions = model.predict(test_set[["Avg. Area Income","Avg. Area House Age"]])


In [None]:
from sklearn.metrics import r2_score

r2_score(test_set['Price'], scaled_predictions)