#### Import the libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import warnings
warnings.filterwarnings('ignore')

#### Read the dataset

In [4]:
df = pd.read_csv('bank.csv')
# OR
#df = pd.read_excel('file_name.xlsx')
#df.head()

In [5]:
df.shape

(41188, 1)

### Data preprocessing


#### 1) Create a function to return a dataframe containing columns, count of null values and percentage of null values for each column

In [6]:
def fetch_count_and_percent_of_null(data):  # data is df
    nv = data.isnull().sum()  # nv = null values
    nv = nv[nv>0]
    res = pd.DataFrame({'Feature':nv.index,'Count_Null':nv.values,
                       'Percent_Null':(nv.values/data.shape[0])*100})
    res = res.sort_values('Percent_Null',ascending=False)
    return res

In [7]:
fetch_count_and_percent_of_null(df)

Unnamed: 0,Feature,Count_Null,Percent_Null


#### Handle Duplicates

In [8]:
df.duplicated().sum()

12

#### If duplicates exist

In [9]:
df.drop_duplicates(inplace=True)

#### Check data types

In [10]:
df.dtypes

age;"job";"marital";"education";"default";"housing";"loan";"contact";"month";"day_of_week";"duration";"campaign";"pdays";"previous";"poutcome";"emp.var.rate";"cons.price.idx";"cons.conf.idx";"euribor3m";"nr.employed";"y"    object
dtype: object

#### Feature Preprocessing

#### Feature Engineering

#### Separate numerical and Categorical Featurs

In [7]:
num_cols = df.dtypes[df.dtypes!='object'].index
cat_cols = df.dtypes[df.dtypes=='object'].index
print(num_cols)
print(cat_cols)

### EDA

In [8]:
for i in df.columns:
    print(f'Feature {i} | Unique Count {df[i].nunique()}')

#### Univariate Analysis

#### 1) Countplot for Categorical Features

In [9]:
cols = []

plt.figure(figsize=(12,20))
for i in range(len(cols)): 
    plt.subplot(,,i+1)  
    sns.countplot(x=df[cols[i]])
    plt.title(f'Countplot for {cols[i]}')

plt.tight_layout()
plt.show()

In [10]:
# num_cols

#### 2) Boxplot for numerical Features

In [11]:
cols = []

plt.figure(figsize=(12,20))
for i in range(len(cols)): 
    plt.subplot(,,i+1)
    sns.boxplot(x=df[cols[i]])
    plt.title(f'Boxplot for {cols[i]}')

plt.tight_layout()
plt.show()

#### 3) Histplot for numerical Features

In [13]:
cols = []

plt.figure(figsize=(12,20))
for i in range(len(cols)):  
    plt.subplot(,,i+1)    
    sns.distplot(x=df[cols[i]],kde=True)
    plt.title(f'Histplot for {cols[i]}')

plt.tight_layout()
plt.show()

#### 4) Pie Chart for Categorical Features

#### Correlation

In [None]:
num_cols = df.dtypes[df.dtypes!='object'].index

In [None]:
corr = df[num_cols].corr()

plt.figure(figsize=(12,12))
sns.heatmap(corr,annot=True,cmap='RdBu')  # annot= True => annotation
plt.show()

#### Drop Redundant Features (If Any)

In [None]:
df.drop(col_list,axis=1)

#### Outlier Treatment

In [None]:
a = df.describe(percentiles =[0.01,0.02,0.03,0.97,0.98,0.99]).T
a = a.iloc[:,3:]
a

In [None]:
df1 = df.copy()
df2 = df.copy()

### Method - 1 (Treats extreme outliers)

In [15]:
cols_left_skewed = []

def lower_outlier_treatment(x):
    x = x.clip(lower = x.quantile(0.01))
    return x

In [None]:
df[cols_left_skewed] = df[cols_left_skewed].apply(lower_outlier_treatment)

In [None]:
cols_right_skewed = []

def upper_outlier_treatment(x):
    x = x.clip(upper = x.quantile(0.99))
    return x

In [None]:
df[cols_right_skewed] = df[cols_right_skewed].apply(lower_outlier_treatment)

### Method - 2 (Treats all outliers)

In [None]:
cols_with_outliers = []

In [40]:
for i in df[cols_with_outliers]:
    q1 = df[i].quantile(0.25)
    q3 = df[i].quantile(0.75)
    iqr = q3 - q1
    upper = q3 + 1.5*iqr
    lower = q1 - 1.5*iqr
    df[i] = np.where(df[i]>upper,upper,df[i])
    df[i] = np.where(df[i]<lower,lower,df[i])

#### Encoding

In [43]:
cat_cols = df.dtypes[df.dtypes=='object'].index
print(cat_cols)

In [41]:
for i in cat_cols:
    print(i,df[i].nunique())

In [16]:
cols_to_be_encoded = []

df_dum = pd.get_dummies(data=df,columns = cols_to_be_encoded, drop_first=True)
print(df_dum.shape)
print(df_dum.columns)

In [17]:
df_dum.dtypes

#### Select x and y

In [20]:
x = df_dum.drop(['Target'],axis=1)
y = df_dum['Rent']
print(x.shape)
print(y.shape)

#### Assumption Check: Multicollinearity(VIF)

In [74]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [21]:
vif = pd.DataFrame()
vif['Feature'] = x.columns
vif['VIF'] = [variance_inflation_factor(x.values,i) for i in range(x.shape[1])]
vif = vif.sort_values('VIF',ascending=False)
vif

#### Assmption Check : Target variable is normally dist

In [42]:
from scipy.stats import shapiro, jarque_bera
# Ho : Data is normally dist => p>0.05
# H1 : Data is not normally dist => p<=0.05

In [22]:
test_stat,p = shapiro(df_dum['Target'])
print(test_stat)
print(p)

In [23]:
test_stat,p = shapiro(np.log((df_dum['Target'])))
print(test_stat)
print(p)

### OLS Model - 1

In [98]:
x1 = x.copy()
y1 = y.copy()

In [24]:
x1 = sm.add_constant(x1)
ols_m1 = sm.OLS(y1,x1).fit()
ols_m1.summary()

In [25]:
print('OLS_M1')
print(ols_m1.rsquared)
print(ols_m1.rsquared_adj)

#### Inference
1) col1, cols2, cols3 etc have high p-value

### OLS - M2

In [26]:
x2 = x.drop(['cols_with_high_p_val'],axis=1)
y2 = y1.copy()

In [27]:
x2 = sm.add_constant(x2)
ols_m2 = sm.OLS(y2,x2).fit()
ols_m2.summary()

In [28]:
print('OLS_M1')
print(ols_m1.rsquared)
print(ols_m1.rsquared_adj)

print('OLS_M2')
print(ols_m2.rsquared)
print(ols_m2.rsquared_adj)

#### Assumption Check : Homescedasticity

In [None]:
residuals = ols_m2.resid

In [None]:
plt.scatter(ols_m2.predict(), residuals)   
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted values')
plt.show()

#### Split data into train and test

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
x_train,x_test, y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

#### Function to evaluate Regression Model

In [31]:
from sklearn.metrics import *

#### Approach - 1

In [138]:
def eval_model_basic(ytest,ypred):
    mae = mean_absolute_error(ytest,ypred)
    mse = mean_squared_error(ytest,ypred)
    rmse = np.sqrt(mse)
    r2s = r2_score(ytest,ypred)
    print('MAE',mae)
    print('MSE',mse)
    print('RMSE',rmse)
    print('R2_Score',r2s)

#### Apporach - 2

In [32]:
def eval_model_advanced(model,x_train,y_train,x_test,y_test,modelname):
    model.fit(x_train,y_train)
    ypred = model.predict(x_test)
    train_r2 = model.score(x_train,y_train)
    train_r2 = model.score(x_test,y_test)
    test_mae = mean_absolute_error(ytest,ypred)
    test_mse = mean_squared_error(ytest,ypred)
    test_rmse = np.sqrt(mse)
    res = pd.DataFrame({'Train_R2':train_r2,'Test_R2':test_r2,'Test_MAE':test_mae,
                        'Test_MSE':test_mse,'Test_RMSE':test_rmse},index=[modelname])
    return res

In [37]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

#### Linear Regression

In [33]:
lr1 = LinearRegression()
lr1_res = eval_model_advanced(lr1,x_train,y_train,x_test,y_test,'Lin_Reg')
lr1_res

#### If model Overfits => Apply Regularization

#### Ridge

In [None]:
rid1 = Ridge()
rid1_res = eval_model_advanced(rid1,x_train,y_train,x_test,y_test,'Ridge')
rid1_res

#### Lasso

In [None]:
las1 = Lasso()
las1_res = eval_model_advanced(las1,x_train,y_train,x_test,y_test,'Lasso')
las1_res

### Tabulate All Results

In [None]:
all_res = pd.concat([lr1_res,rid1_res,las1_res])
all_res

#### Overtting and Underfitting

1) Overfitting<br>
a) Model with high train score and comparatively low test score.<br>
b) Model with high variance and low bias.


2) Underfitting<br>
a) Model with high test score and comparatively low train score.<br>
b) Model with low variance and high bias.

Bias varaince TradeOff<br>
1) Bais and Variance are inversely proprotaional.