In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from datetime import date


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler , MinMaxScaler


from sklearn.linear_model import LinearRegression

from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

import missingno as msno
import warnings
# Silence all warnings. The three category-specific filters are redundant with
# the blanket filter but are registered as before, in the same order.
warnings.filterwarnings("ignore")
for warning_category in (FutureWarning, RuntimeWarning, UserWarning):
    warnings.filterwarnings("ignore", category=warning_category)


# Display settings: no column/row truncation, 500-character console width,
# floats rendered with two decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.width', 500),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

# First Glance

In [None]:
# Load the train and test CSV files from the Kaggle input directory.
df_train = pd.read_csv("/kaggle/input/train.csv")
df_test = pd.read_csv("/kaggle/input/test.csv")

# EDA

**Concat train and test data**

In [None]:
# Stack the train rows on top of the test rows into one working frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# frame keeps the default labels it got from read_csv, so the same label
# appears twice and label-aligned operations on `df` become ambiguous.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

**Finding Numerical and Categorical Variables**

In [None]:
def grab_col_names(dataframe, cat_th=40, car_th=30):
    """Split the columns of `dataframe` into categorical, numerical and
    high-cardinality categorical groups and print a short summary.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 40
        A non-object column with fewer than `cat_th` distinct values is
        treated as categorical.
    car_th : int, default 30
        An object column with more than `car_th` distinct values is treated
        as high-cardinality ("categorical but cardinal").

    Returns
    -------
    tuple of (list, list, list)
        ``cat_cols`` (object columns followed by the numeric-but-categorical
        ones, without the high-cardinality columns), ``num_cols`` (non-object
        columns that are not numeric-but-categorical) and ``cat_but_car``.
    """
    # Split once by dtype, keeping the original column order in both groups.
    object_cols, non_object_cols = [], []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "O":
            object_cols.append(name)
        else:
            non_object_cols.append(name)

    distinct = {name: dataframe[name].nunique() for name in dataframe.columns}

    num_but_cat = [name for name in non_object_cols if distinct[name] < cat_th]
    cat_but_car = [name for name in object_cols if distinct[name] > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    n_rows, n_columns = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_columns}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car


# Classify the columns of the combined frame using the default thresholds.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

**View the numerical variables**

In [None]:
# Distinct-value count of each numerical column, largest first.
df[num_cols].nunique().sort_values(ascending=False)

**View the categorical variables**

In [None]:
# Distinct-value count of each categorical column, largest first.
df[cat_cols].nunique().sort_values(ascending=False)

# Making Analysis of  Categorical Variables

**Graph of Cat_Cols**

In [None]:
# One bar chart of value counts per categorical column.
# The grid is sized from len(cat_cols): a fixed 11 x 6 grid offers 66 panels,
# so indexing axs.flatten() fails with IndexError once cat_cols holds more names.
n_plot_cols = 6
n_plot_rows = max(1, -(-len(cat_cols) // n_plot_cols))  # ceiling division
fig, axs = plt.subplots(n_plot_rows, n_plot_cols,
                        figsize=(20, 2.75 * n_plot_rows), squeeze=False)
axes = axs.flatten()

for ax, col in zip(axes, cat_cols):
    df[col].value_counts().plot(kind='bar', color=['blue', 'orange', 'green', 'red', "black"], ax=ax)
    ax.set_title("Count of "+col)

# Hide the panels left over when the last grid row is not completely filled.
for ax in axes[len(cat_cols):]:
    ax.set_visible(False)

plt.subplots_adjust(hspace=1.4, wspace=0.3)

plt.show()

**Summary of Cat_Cols**

In [None]:
def cat_summary(dataframe, col_name):
    """Print the frequency table of one column.

    The table has one row per distinct value of `dataframe[col_name]`, with
    the absolute count (column named after `col_name`) and the share of all
    rows in percent ("Ratio"). A 50-character dashed rule is printed after it.
    """
    counts = dataframe[col_name].value_counts()
    table = pd.DataFrame({col_name: counts,
                          "Ratio": 100 * counts / len(dataframe)})
    print(table)
    print("-" * 50)

# Print the frequency table of every categorical column.
for col in cat_cols:
    cat_summary(df, col)

# Making Analysis of Numerical Variables

**Graph of Num_Cols**

In [None]:
# Drop the identifier and the target from the numeric feature list.
# Rebuilding the list instead of calling list.remove keeps the cell
# re-runnable: a second remove('Id') raises ValueError because the name is
# already gone.
num_cols = [col for col in num_cols if col not in ('Id', 'SalePrice')]

# Size the grid from the number of columns; a fixed 3 x 6 grid has 18 panels
# and indexing past them raises IndexError.
n_plot_cols = 6
n_plot_rows = max(1, -(-len(num_cols) // n_plot_cols))  # ceiling division
fig, axs = plt.subplots(n_plot_rows, n_plot_cols,
                        figsize=(20, 10 / 3 * n_plot_rows), squeeze=False)
axes = axs.flatten()

for ax, col in zip(axes, num_cols):
    ax.hist(df[col], bins=40, color='orange', edgecolor='red')
    ax.set_title(col + " Distribution")

# Hide the panels left over when the last grid row is not completely filled.
for ax in axes[len(num_cols):]:
    ax.set_visible(False)

plt.subplots_adjust(hspace=0.5, wspace=0.3)
plt.show()

**Summary of Num_Cols**

In [None]:
def num_summary(dataframe, numerical_col):
    """Print descriptive statistics of one numerical column.

    Uses ``describe`` with the percentiles 5, 10, 20, ..., 90, 95 and 99,
    followed by a 50-character dashed rule.
    """
    percentiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    description = dataframe[numerical_col].describe(percentiles)
    print(description, "-" * 50, sep="\n")

# Print the descriptive statistics of every numerical column.
for col in num_cols:
    num_summary(df, col)

# Analysis of Target Variable

The target variable is "SalePrice". To understand how each feature relates to it, the next steps examine the numerical and the categorical columns against the target.


**Graphs of Num_Cols against the Target ("SalePrice")**

In [None]:
# Scatter every numeric feature against the target.
# The grid is sized from len(num_cols); the fixed 6 x 3 grid only had 18
# panels and indexing past them raises IndexError.
n_plot_cols = 3
n_plot_rows = max(1, -(-len(num_cols) // n_plot_cols))  # ceiling division
fig, axs = plt.subplots(n_plot_rows, n_plot_cols,
                        figsize=(18, 4 * n_plot_rows), squeeze=False)
for i, col in enumerate(num_cols):
    ax = axs[i // n_plot_cols, i % n_plot_cols]
    sns.scatterplot(x=col, y="SalePrice", data=df, ax=ax)
    ax.set_title('SalePrice by ' + col, color='green')

# Hide the panels left over when the last grid row is not completely filled.
for ax in axs.flatten()[len(num_cols):]:
    ax.set_visible(False)

# tight_layout() recomputes the subplot spacing itself, so the earlier
# subplots_adjust(hspace=2) call had no lasting effect and is dropped.
plt.tight_layout()
plt.show()

**Summary of Cat_Cols against the Target ("SalePrice")**

In [None]:

def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of `target` for every level of `categorical_col`.

    The means are shown in a one-column table named "TARGET_MEAN", followed
    by blank lines and a 50-character dashed rule.
    """
    group_means = dataframe.groupby(categorical_col)[target].mean()
    summary = pd.DataFrame({"TARGET_MEAN": group_means})
    print(summary, end="\n\n\n")
    print("-" * 50)

# Print the mean SalePrice per level for every categorical column.
for col in cat_cols:
    target_summary_with_cat(df, "SalePrice", col)

# Observation of Outliers 

**First, I inspect the outliers with box plots**

In [None]:
fig, axs = plt.subplots(6, 3, figsize=(20, 30))
for i, col in enumerate(num_cols, start=1):
    sns.boxplot(x=df[col], ax=axs[(i-1)//3, (i-1)%3])
plt.subplots_adjust(hspace=0.3) 
plt.show()

**Next, I check for outliers with a function**

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Return the ``(low_limit, up_limit)`` outlier limits of one column.

    The limits lie 1.5 times the inter-quantile range below the `q1`
    quantile and above the `q3` quantile of ``dataframe[col_name]``.
    """
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name):
    """Return True if any value of `col_name` lies outside its outlier limits.

    The limits come from ``outlier_thresholds`` with its default quantiles.
    NaN values never count as outliers because both comparisons are False
    for them.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    # Test the outlier mask itself. The previous version filtered the frame
    # and called .any(axis=None) on the remaining rows, which asks whether
    # any *cell* in those rows is truthy - an outlier row holding only
    # zeros/NaN was therefore reported as "no outlier".
    is_outlier = (column > up_limit) | (column < low_limit)
    return bool(is_outlier.any())


# Report, for every numerical column, whether it contains outliers
# (the column name is left-aligned and padded with '-' to 20 characters).
for col in num_cols:
    print(f"{col:-<20}: {check_outlier(df, col)}")

# Observation of Missing Values

In [None]:
# missingno bar chart of the combined frame (one bar per column).
msno.bar(df, color="blue",fontsize=10)
# NOTE(review): this rcParams change runs after the chart above has been
# created, so it presumably only affects figures drawn later - confirm intent.
plt.rcParams.update({'font.size': 10})
plt.show()

**Missing values numbers and ratios**

In [None]:
def missing_values_table(dataframe, na_name=False):
    """Print the number and percentage of missing values per column.

    Only columns with at least one missing value are listed, sorted by
    descending count. The percentage is relative to the number of rows and
    rounded to two decimals.

    Returns the list of column names with missing values when `na_name` is
    truthy, otherwise None.
    """
    has_missing = dataframe.isnull().any()
    na_columns = dataframe.columns[has_missing.to_numpy()].tolist()

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None

# Print the missing-value counts and ratios of the combined frame.
missing_values_table(df)

# Feature Engineering

**Adjust Outliers**

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.15, q3=0.85):
    """Return the ``(low_limit, up_limit)`` outlier limits of one column.

    Re-defines the helper of the same name from the outlier-observation
    section with wider default quantiles (0.15 / 0.85 instead of
    0.25 / 0.75). The limits lie 1.5 times the inter-quantile range below
    the `q1` quantile and above the `q3` quantile.
    """
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name):
    """Return True if any value of `col_name` lies outside its outlier limits.

    The limits come from ``outlier_thresholds`` with its default quantiles.
    NaN values never count as outliers because both comparisons are False
    for them.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    # Test the outlier mask itself. The previous version filtered the frame
    # and called .any(axis=None) on the remaining rows, which asks whether
    # any *cell* in those rows is truthy - an outlier row holding only
    # zeros/NaN was therefore reported as "no outlier".
    is_outlier = (column > up_limit) | (column < low_limit)
    return bool(is_outlier.any())


# Report, for every numerical column, whether it has values outside the
# limits computed with the 0.15 / 0.85 default quantiles.
for col in num_cols:
    print(f"{col:-<20} : {check_outlier(df, col)}")

**Replace Outliers With Thresholds**

In [None]:
def replace_with_thresholds(dataframe, variable):
    """Cap the values of `variable` at its outlier limits, in place.

    Values below the lower limit from ``outlier_thresholds`` are set to that
    limit and values above the upper limit are set to the upper limit; the
    column of `dataframe` is overwritten with the capped values. NaN entries
    are left as they are.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # clip() replaces the two masked .loc assignments. Writing a float limit
    # into an integer column through .loc relies on implicit upcasting, which
    # pandas deprecated (FutureWarning since 2.1); clip returns a new column
    # with whatever dtype the capped values need.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)
    
    
# Cap the outliers of every numerical column at its limits.
for col in num_cols:
    replace_with_thresholds(df, col)    
    
    
# Re-run the outlier check after capping.
for col in num_cols:
    print(f"{col:-<20} : {check_outlier(df, col)}")    

**Adjust Missing Values**

In [None]:
# Cast every categorical column that is not already of object dtype to
# object, so that the numeric-but-categorical columns are picked up by the
# dtype == "O" checks used later in the notebook (mode filling, rare encoding).
for col in cat_cols:
    if df[col].dtype != 'object':
        df[col] = df[col].astype('object')
        

def missing_values_table(dataframe, na_name=False):
    """Print the number and percentage of missing values per column.

    Same helper as the one defined in the missing-value observation section,
    repeated here. Only columns with at least one missing value are listed,
    sorted by descending count; the percentage is relative to the number of
    rows and rounded to two decimals.

    Returns the list of column names with missing values when `na_name` is
    truthy, otherwise None.
    """
    has_missing = dataframe.isnull().any()
    na_columns = dataframe.columns[has_missing.to_numpy()].tolist()

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    return na_columns if na_name else None



In [None]:
# Column-wise fill: an object column with at most 40 distinct values (as
# counted by unique()) gets its missing entries replaced by its most frequent
# value, x.mode()[0]; every other column is passed through unchanged.
df=df.apply(lambda x: x.fillna(x.mode()[0]) if (x.dtype == "O" and len(x.unique()) <= 40) else x, axis=0)

# Show which columns still contain missing values.
missing_values_table(df)     

In [None]:
# Fill the remaining gaps of each numerical column with the mean of that
# column within the row's Neighborhood.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[i] is chained assignment on an intermediate object, which pandas warns
# about and which does not update df when Copy-on-Write is enabled.
for i in num_cols:
    df[i] = df[i].fillna(df.groupby("Neighborhood")[i].transform("mean"))

# Show which columns still contain missing values.
missing_values_table(df)

# Rare Encoding

**Rare Category Analysis Against the Target Variable**

In [None]:
def rare_analyser(dataframe, target, cat_cols):
    """Print, for each column in `cat_cols`, a per-level frequency report.

    For every column the number of distinct levels is printed first, then a
    table with the row count ("COUNT"), the share of all rows ("RATIO") and
    the mean of `target` ("TARGET_MEAN") of each level.
    """
    n_rows = len(dataframe)
    for col in cat_cols:
        level_counts = dataframe[col].value_counts()
        print(col, ":", len(level_counts))
        report = pd.DataFrame({"COUNT": level_counts,
                               "RATIO": level_counts / n_rows,
                               "TARGET_MEAN": dataframe.groupby(col)[target].mean()})
        print(report, end="\n\n\n")


# Frequency and mean SalePrice of every level, per categorical column.
rare_analyser(df, "SalePrice", cat_cols)

In [None]:
def rare_encoder(dataframe, rare_perc):
    """Return a copy of `dataframe` with infrequent categories merged.

    In every object-dtype column, each level whose share of the rows is below
    `rare_perc` is replaced by the label 'Rare'. Columns without such a level
    and non-object columns are returned unchanged; the input frame itself is
    not modified.
    """
    encoded = dataframe.copy()
    n_rows = len(encoded)

    for column in encoded.columns:
        if encoded[column].dtypes != 'O':
            continue
        shares = encoded[column].value_counts() / n_rows
        rare_labels = shares[shares < rare_perc].index
        if len(rare_labels) == 0:
            continue
        encoded[column] = np.where(encoded[column].isin(rare_labels), 'Rare', encoded[column])

    return encoded


# Merge every level of the object columns that covers less than 2% of the
# rows into 'Rare'.
df = rare_encoder(df, 0.02)

# Repeat the per-level report on the encoded frame.
rare_analyser(df, "SalePrice", cat_cols)


**For each of the following variables, create a flag that is 1 when the feature is present (value greater than 0) and 0 when it is absent.**

In [None]:
# Source columns for the presence flags: new_<column> is 1 where the value is
# greater than 0 and 0 otherwise.
new = ["WoodDeckSF","GarageArea","BsmtFinSF2","EnclosedPorch","OpenPorchSF", "ScreenPorch","MasVnrArea","Fireplaces","MiscVal","3SsnPorch","LowQualFinSF","PoolArea"]
# NOTE(review): any column of this list with fewer than 40 distinct values
# ends up in cat_cols, is cast to object and passes through rare_encoder
# before this cell. If one of its values was replaced by the string 'Rare',
# `df[item] > 0` compares str with int and raises TypeError - confirm against
# the data and consider building these flags before the rare encoding.
for item in new:
    name = "new_" + item
    df[name] = np.where(df[item] > 0, 1, 0)

**Total Porch Area**

In [None]:
# Sum of the four porch-area columns.
df["new_totalPorchArea"]= df["EnclosedPorch"] +df["OpenPorchSF"]+df["ScreenPorch"]+df["3SsnPorch"]

# new_luxury: first the number of presence flags set among garage, the four
# porch types and pool, then reduced to 1 when at least three are set, else 0.
df['new_luxury'] = (df["new_GarageArea"] + df["new_EnclosedPorch"] + df["new_OpenPorchSF"] + df["new_ScreenPorch"] + df["new_3SsnPorch"] + df["new_PoolArea"])
df['new_luxury'] = (df['new_luxury'] >=3).astype(int)


**Age of Building**

In [None]:
# Reference year used to turn the year columns into ages.
# NOTE(review): this rebinds `date`, which the import cell binds to
# datetime.date (`from datetime import date`).
date = 2011
built=["YearBuilt","YearRemodAdd","GarageYrBlt"]

# Replace each year column by new_age<column> = 2011 - year.
# NOTE(review): the year columns are dropped here, so running this cell a
# second time fails on the missing columns.
for i in built:
    df["new_age"+i]=date-df[i]
    df.drop(columns=i, axis=1 ,inplace=True)
    
    
# Building-age bands; right=False makes the intervals closed on the left:
# [0, 5), [5, 15), [15, 35), [35, 75), [75, inf).
bins_built = [0, 5, 15, 35, 75, np.inf]
labels_built = ['Very_New', 'New', 'Moderate', 'Old', 'Very_Old']
df["new_age_category"] = pd.cut(df["new_ageYearBuilt"], bins=bins_built, labels=labels_built, right=False)


# Remodel-age bands: [1, 6), [6, 16), [16, 36), [36, inf); ages below 1 fall
# outside every band and become NaN.
bins_remodel = [1, 6, 16, 36, np.inf]
labels_remodel = [ 'Recently Renovated', 'Moderately Renovated', 'Old Renovated', 'Very Old Renovated']
df["new_remodel_age_category"] = pd.cut(df["new_ageYearRemodAdd"], bins=bins_remodel, labels=labels_remodel, right=False)



# Encoding

**Label Encoding**

In [None]:
def label_encoder(dataframe, binary_col):
    """Replace `binary_col` of `dataframe` by integer codes, in place.

    A fresh ``LabelEncoder`` is fitted on the column and the column is
    overwritten with the encoded values. The same (modified) frame is
    returned.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe


# Binary columns: dtype other than int64/float64 and exactly two distinct
# values as counted by nunique().
binary_cols = [col for col in df.columns if df[col].dtype not in ["int64", "float64"]
               and df[col].nunique() == 2]


# Encode each binary column in place.
for col in binary_cols:
    label_encoder(df, col)
    
df.sample(5)

**One Hot Encoding**

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """Return `dataframe` with `categorical_cols` expanded into dummy columns.

    Thin wrapper around ``pandas.get_dummies`` that produces integer (0/1)
    indicator columns. With ``drop_first=True`` (the default) the first level
    of every encoded column is omitted.
    """
    return pd.get_dummies(
        dataframe,
        columns=categorical_cols,
        drop_first=drop_first,
        dtype=int,
    )

# Columns to one-hot encode: every column, regardless of dtype, that has
# between 3 and 17 distinct values.
ohe_cols = [col for col in df.columns if  17 >= df[col].nunique() > 2]

df=one_hot_encoder(df, ohe_cols)

df.sample(5)

# Scaling

In [None]:
# Min-max scale every column except the identifier and the target; the
# unscaled frame `df` is left untouched.
# NOTE(review): the scaler is fitted on the train and test rows together -
# confirm that this is intended.
columns_to_scale = [col for col in df.columns if col not in ['Id', 'SalePrice']]
df_scaled = df.copy()
df_scaled[columns_to_scale] = MinMaxScaler().fit_transform(df[columns_to_scale])
df_scaled.head()

# Modelling

**Splitting Train and Test**

In [None]:
# Split the combined frame back: rows without a SalePrice go to df_test
# (presumably the original test rows), rows with one go to df_train.
# .copy() makes both results independent frames, so that assigning columns on
# them later does not operate on a slice of df_scaled flagged by
# SettingWithCopyWarning.
df_test = df_scaled[df_scaled["SalePrice"].isna()].copy()
df_train = df_scaled.dropna(subset = ["SalePrice"]).copy()


**Log Conversion for SalePrice**

In [None]:
# Replace the target by its natural logarithm.
# NOTE(review): SalePrice is overwritten in place, so running this cell a
# second time applies the log twice.
df_train['SalePrice'] = np.log(df_train['SalePrice'])
df_train.head()

In [None]:
# Features: every column except the identifier and the target.
X = df_train.drop(["Id",'SalePrice'], axis=1)
# Target, kept as a one-column DataFrame (double brackets).
y = df_train[["SalePrice"]]

# Linear Regression

In [None]:
# Fit a linear regression on all training rows.
Linear_model = LinearRegression().fit(X, y)


In [None]:
# In-sample fit quality: RMSE of the predictions on the rows the model was
# trained on.
y_pred = Linear_model.predict(X)
train_rmse = np.sqrt(mean_squared_error(y, y_pred))

# The training RMSE is optimistic because the model has already seen these
# rows. Also report a 5-fold cross-validated RMSE as an out-of-sample
# estimate; each fold fits a fresh LinearRegression on the other four folds.
cv_scores = cross_validate(LinearRegression(), X, y, cv=5, scoring="neg_mean_squared_error")
cv_rmse = np.sqrt(-cv_scores["test_score"].mean())
print(f"5-fold CV RMSE: {cv_rmse:.4f}")

train_rmse