In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the housing CSV from the Kaggle input directory into a DataFrame.
HOUSING_CSV = "/kaggle/input/boston-housing-dataset/HousingData.csv"
df = pd.read_csv(HOUSING_CSV)

#### head() - Gives first rows (by default it is 5)

In [None]:
# Preview the first five rows (five is also the default).
df.head(n=5)

#### To print last 5 rows -> tail()

In [None]:
# Preview the last five rows (five is also the default).
df.tail(n=5)

#### Here MEDV is target variable which is numerical feature

#### info() : prints a summary of the DataFrame and of each column
* ##### Class of the object (DataFrame)
* ##### RangeIndex and the number of columns
* ##### For each column: its non-null value count and data type
* ##### Memory usage

In [None]:
# Print the frame's class, index range, per-column non-null counts and dtypes,
# and memory usage; the non-null counts reveal which columns have missing values.
df.info()

#### Printing null values respective to column

In [None]:
# Count the missing values in each column (`isna` is the name `isnull` aliases).
df.isna().sum()

#### Here null values are present in the data
##### To handle the null values we can either drop those rows or replace each null value with the mean, median or mode of that column.
##### For a numerical column --> mean or median is preferred
##### For a categorical column --> mode is preferred

In [None]:
# For non-categorical values: replace missing entries in every numeric column
# with that column's mean.
#
# The filled columns are assigned back to `df` rather than calling
# `df[col].fillna(..., inplace=True)`. That in-place call acts on the
# intermediate Series returned by `df[col]`; pandas flags it as chained
# assignment, and under Copy-on-Write it leaves `df` unchanged.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Missing values remaining per column (the numeric columns should now show 0).
df.isna().sum()

#### Now count of null values become 0

#### Describe -> Gives the statistical analysis of every column

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

#### Check for the duplicated values

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated().sum()

In [None]:
import warnings
warnings.filterwarnings("ignore")

#### To check how the dataset is distributed

In [None]:
# Distribution of every column: a density histogram with a KDE overlay,
# one panel per column, five panels per row.
#
# `sns.histplot(..., kde=True, stat="density")` is the replacement for the
# deprecated `sns.distplot`. The grid is sized from the number of columns,
# so no column is silently skipped when the frame has more than 14.
n_cols = 5
n_rows = max(1, -(-len(df.columns) // n_cols))  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, df.columns):
    sns.histplot(df[column], kde=True, stat="density", ax=ax)
    ax.set_xlabel(column, fontsize=15)

# Hide the unused panels at the end of the grid.
for ax in flat_axes[len(df.columns):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

#### How the target column is related to other features

In [None]:
# Plotting the target `MEDV` against each of the remaining columns.
#
# Each feature goes on the x-axis and the target on the y-axis, so every
# panel reads as "how does MEDV change with this feature". The target itself
# is excluded, since MEDV against MEDV is just a diagonal line.
target_col = "MEDV"
feature_cols = [column for column in df.columns if column != target_col]

n_cols = 5
n_rows = max(1, -(-len(feature_cols) // n_cols))  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, feature_cols):
    # seaborn labels the axes from the Series names (feature / MEDV).
    sns.scatterplot(x=df[column], y=df[target_col], ax=ax)

# Hide the unused panels at the end of the grid.
for ax in flat_axes[len(feature_cols):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

#### looking for outliers using box plot

In [None]:
# One box per column on a shared axis, to spot columns with extreme values.
fig, ax = plt.subplots(figsize=(20, 8))
sns.boxplot(data=df, width=0.8, ax=ax)
plt.show()

#### To find the correlation between multiple variables

In [None]:
# Clustered heatmap of the pairwise correlations between all columns.
#
# `sns.clustermap` builds its own figure, so a preceding `plt.figure(...)`
# only leaves an empty figure behind. The size is passed to clustermap
# directly; (10, 10) is seaborn's default, i.e. the size actually rendered before.
clustermap = sns.clustermap(df.corr(), vmin=-1, vmax=1, annot=True, figsize=(10, 10))

#### TAX and RAD have a correlation of 0.91, so one of the two features is dropped: they carry largely redundant information (multicollinearity), which can contribute to overfitting of the model

In [None]:
# Drop TAX, which is highly correlated with RAD.
# `errors="ignore"` keeps the cell idempotent: re-running it after TAX is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns="TAX", errors="ignore")

#### Dividing the dataset into Features and Target variable

In [None]:
# creating features and label variable
#
# Both are selected by column name: the target is MEDV and the features are
# every other column. Selecting the features by position (`iloc[:, :-1]`)
# would only be correct while MEDV happens to be the last column.
y = df['MEDV']
X = df.drop(columns='MEDV')

In [None]:
# Shapes of the full frame, the feature matrix and the target, one per line.
print(df.shape, X.shape, y.shape, sep="\n")

#### Printing head of the features

In [None]:
# First five rows of the feature matrix.
X.head(n=5)

#### Printing head of the target variable

In [None]:
# First five values of the target.
y.head(n=5)

#### Standardizing the features to zero mean and unit variance (StandardScaler does not bound values to -1 to 1), so that every feature contributes on a comparable scale

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (subtract its mean, divide by its standard
# deviation). The scaler is fitted on the full feature matrix X.
# NOTE(review): X_scaled is not the array handed to train_test_split further
# down - that call receives X - so confirm where the scaled data is meant to be used.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Preview the first five scaled rows instead of dumping the whole array.
X_scaled[:5]

#### Splitting the dataset into train and test sets to train and evaluate the model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features so distance-based models (e.g. k-nearest neighbours)
# are not dominated by the columns with the largest numeric range.
# The scaler is fitted on the training rows only and then applied to both
# splits, so no statistics from the test rows leak into training.
split_scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    split_scaler.transform(X_train), index=X_train.index, columns=X_train.columns
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test), index=X_test.index, columns=X_test.columns
)

#### Defining function to check how different models behave with different datasets

In [None]:
def train_test(model, X_train, X_test, y_train, y_test):
    """Fit `model`, report its train/test scores and plot predicted vs. actual targets.

    Parameters
    ----------
    model : estimator
        Object exposing `fit(X, y)`, `predict(X)` and `score(X, y)`.
        It is fitted in place.
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Targets for the training and test splits.

    Returns
    -------
    dict
        `{"train_score": ..., "test_score": ...}` - the values of
        `model.score` on each split (R^2 for scikit-learn regressors), so
        callers can compare models without parsing the printed output.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print(f"For {model}")
    print("Train score : ", train_score)
    print("Test score : ", test_score)

    # Predicted vs. actual test targets with a fitted regression line.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.regplot(x=y_test, y=y_pred, line_kws=dict(color="r"), ax=ax)
    ax.set_title(f'{model}', fontsize=20)
    ax.set_xlabel("Actual")
    ax.set_ylabel("Predicted")
    # Render now so each figure appears directly under its own scores
    # rather than all figures being emitted together at the end of the cell.
    plt.show()
    print("-"*100)
    return {"train_score": train_score, "test_score": test_score}

#### Importing different models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor  

#### Creating object of model so that we can easily train the model

In [None]:
# Candidate regressors to compare.
m1 = LinearRegression()
# The forest is randomised (bootstrap samples, feature subsets); a fixed seed
# makes its scores reproducible from run to run.
m2 = RandomForestRegressor(random_state=42)
# 5 nearest neighbours under the Euclidean distance (Minkowski with p=2).
m3 = KNeighborsRegressor(n_neighbors=5, metric='minkowski', p=2)

In [None]:
models = [m1, m2, m3]

# Fit and evaluate every candidate on the same train/test split.
for model in models:
    train_test(
        model,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

### Conclusion from the results above
#### RandomForestRegressor is overfitting: its train score (0.97) is considerably higher than its test score (0.86)
#### LinearRegression and KNeighborsRegressor perform quite well