<a href="https://colab.research.google.com/github/BumaranChe/Python_Regression_Using_class_file_to_call_methods/blob/main/class_regression_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold


In [None]:
class RegressionModel:
    """Fit a regression estimator on a train/test split and report on it.

    On construction the data is split with ``train_test_split``, the
    supplied estimator is fitted on the training part, and predictions
    for the test part are stored in ``self.y_pred``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature matrix; must expose ``shape`` with the feature count
            at index 1.
        y: Target values aligned with ``X``.
        test_size: Passed to ``train_test_split``.
        random_state: Seed for the split and for the randomised helpers
            of this class.
        df: Optional DataFrame used by ``remove_outliers`` for plotting.
    """

    def __init__(self, model, X, y, test_size=0.2, random_state=42, df=None):
        self.model = model
        self.X = X
        self.y = y
        self.test_size = test_size
        self.random_state = random_state
        self.df = df
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        self.model.fit(self.X_train, self.y_train)
        self.y_pred = self.model.predict(self.X_test)

    def _adjusted_r2(self, r2):
        """Return the adjusted R2 for the test split, or NaN if undefined."""
        n_obs = len(self.y_test)
        dof = n_obs - self.X.shape[1] - 1
        if dof <= 0:
            # Too few test rows for the number of features: the formula
            # would divide by zero or flip sign.
            return float("nan")
        return 1 - (1 - r2) * (n_obs - 1) / dof

    def _print_metrics(self, label, y_pred):
        """Print R2, adjusted R2, MSE and RMSE of *y_pred* on the test split."""
        # Adjusted R2 is derived from the unrounded R2; rounding happens
        # only when printing.
        r2 = metrics.r2_score(self.y_test, y_pred)
        print(f"{label} R2 : {round(r2, 2)}")
        print(f"{label} adj_R2 : {round(self._adjusted_r2(r2), 2)}")
        mse = metrics.mean_squared_error(self.y_test, y_pred)
        print(f"{label} MSE : {mse}")
        print(f"{label} RMSE: {np.sqrt(mse)}")

    def evaluate(self):  # Must Have
        """Print R2, adjusted R2, MSE and RMSE of the fitted model."""
        self._print_metrics(self.model, self.y_pred)

    def plot_heat_map(self, df):  # Must Have
        """Print and plot the correlation matrix of *df*.

        Only numeric and boolean columns are correlated, so a frame that
        still contains text columns does not make ``corr`` fail.
        """
        correlation_matrix = df.select_dtypes(include=["number", "bool"]).corr()
        print(correlation_matrix)
        sns.heatmap(correlation_matrix, annot=True)
        plt.show()

    def k_fold(self):
        """Print 5-fold cross-validation scores of the model on all data."""
        # Pass the shuffled splitter itself; an integer ``cv`` would
        # ignore the shuffle and the seed.
        kfold = KFold(n_splits=5, shuffle=True, random_state=self.random_state)
        scores = cross_val_score(self.model, self.X, self.y, cv=kfold)
        print("Cross-validation scores: ", np.round(scores, 2))
        print("Average cross-validation score: ", np.round(scores.mean(), 2))

    def GridSearchCV_DTR(self):
        """Grid-search a ``DecisionTreeRegressor`` and print its test metrics.

        The best estimator is kept in ``self.tuned_model`` and its test
        predictions in ``self.y_pred_tuned``.
        """
        param_grid = {
            "max_depth": [None, 5, 10, 15],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 5],
            # "auto" is omitted: recent scikit-learn releases reject it
            # and, for regressors, it meant the same as None.
            "max_features": [None, "sqrt", "log2"],
        }
        grid_search = GridSearchCV(
            # Seeded so the "sqrt"/"log2" candidates are reproducible.
            estimator=DecisionTreeRegressor(random_state=self.random_state),
            param_grid=param_grid,
            cv=5,
            scoring="neg_mean_squared_error",
        )
        grid_search.fit(self.X_train, self.y_train)
        print(grid_search.best_params_)

        self.tuned_model = grid_search.best_estimator_
        self.y_pred_tuned = self.tuned_model.predict(self.X_test)

        mse_tuned = metrics.mean_squared_error(self.y_test, self.y_pred_tuned)
        print(f"{self.tuned_model} Mean Squared Error: {mse_tuned}")
        print(f"{self.tuned_model} Root Mean Squared Error: {np.sqrt(mse_tuned)}")
        # R2 must describe the tuned predictions, not the base model's.
        r2 = metrics.r2_score(self.y_test, self.y_pred_tuned)
        print(f"{self.tuned_model} R2 : {round(r2, 2)}")
        print(f"{self.tuned_model} adj_R2 : {round(self._adjusted_r2(r2), 2)}")

    def RFR(self, n_estimators=500, max_features=3, max_samples=60):
        """Fit a random forest, print its OOB score and its test metrics.

        The forest is kept in ``self.rfr_model`` and its test predictions
        in ``self.y_pred_rfr``; ``self.model`` is left untouched.

        Args:
            n_estimators: Number of trees.
            max_features: Features considered per split; integer values
                are capped at the number of available features.
            max_samples: Bootstrap sample size per tree; integer values
                are capped at the number of training rows.
        """
        n_train, n_features = self.X_train.shape
        if isinstance(max_features, int):
            max_features = min(max_features, n_features)
        if isinstance(max_samples, int):
            max_samples = min(max_samples, n_train)

        forest = RandomForestRegressor(
            n_estimators=n_estimators,
            max_features=max_features,
            max_samples=max_samples,
            oob_score=True,
            random_state=0,
        )
        # ``oob_score_`` only exists once the forest has been fitted.
        forest.fit(self.X_train, self.y_train)
        print(f"OOB Score:{(forest.oob_score_)}")

        self.rfr_model = forest
        self.y_pred_rfr = forest.predict(self.X_test)
        self._print_metrics(forest, self.y_pred_rfr)

    def predict(self, X_new):  # Must Have
        """Print and return the model's prediction for the first row of *X_new*.

        The value is also stored in ``self.last_prediction``.
        ``self.y_pred`` keeps the test-set predictions so ``evaluate``
        still works after this call.
        """
        prediction = self.model.predict(X_new)[0]
        self.last_prediction = prediction
        print(f"{self.model} Predicted Value : {prediction}")
        return prediction

    def remove_outliers(self, df=None, target="Selling_Price"):
        """Draw box plots of every column to make outliers visible.

        Despite its name this method only plots; use
        ``remove_outliers_iqr`` to actually drop rows. Non-object columns
        get a plain box plot; object columns are plotted against *target*.

        Args:
            df: Frame to plot; defaults to the ``df`` given at construction.
            target: Column used on the y axis for object columns.

        Raises:
            ValueError: If no frame is available, or *target* is needed
                but missing from the frame.
        """
        frame = df if df is not None else getattr(self, "df", None)
        if frame is None:
            raise ValueError(
                "No DataFrame to plot: pass df= here or to the constructor."
            )

        numeric_cols = list(frame.select_dtypes(exclude="object").columns)
        object_cols = list(frame.select_dtypes(include="object").columns)
        if object_cols and target not in frame.columns:
            raise ValueError(f"Target column {target!r} not found in DataFrame.")

        ordered_cols = numeric_cols + object_cols
        if not ordered_cols:
            return

        sns.set_style("darkgrid")
        # Four plots per row; the row count follows the number of columns.
        n_cols = 4
        n_rows = -(-len(ordered_cols) // n_cols)
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(15, 3.5 * n_rows), dpi=100, squeeze=False
        )
        flat_axes = axes.ravel()

        for position, (axis, col) in enumerate(zip(flat_axes, ordered_cols)):
            if position < len(numeric_cols):
                sns.boxplot(data=frame, y=col, ax=axis)
            else:
                sns.boxplot(data=frame, x=col, y=target, ax=axis)
                axis.xaxis.label.set_size(15)
            axis.yaxis.label.set_size(15)

        # Hide grid cells that have no column to show.
        for axis in flat_axes[len(ordered_cols):]:
            axis.set_visible(False)

        plt.tight_layout()
        plt.show()

    @staticmethod
    def remove_outliers_iqr(data, column=None):
        """Return *data* without rows outside the 1.5 * IQR fences.

        Args:
            data: DataFrame to filter; it is not modified.
            column: Column to filter on. When ``None``, every numeric
                column is filtered in turn, each on the rows that
                survived the previous columns.

        Returns:
            The filtered DataFrame.
        """
        if column is None:
            columns = list(data.select_dtypes(include="number").columns)
        else:
            columns = [column]

        filtered = data
        for name in columns:
            q1 = filtered[name].quantile(0.25)
            q3 = filtered[name].quantile(0.75)
            iqr = q3 - q1
            # Keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds included.
            filtered = filtered[
                filtered[name].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
            ]
        return filtered


