In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#@title Data Cleaning
class DataCleaning:
    """Configurable cleaning pipeline for a pandas DataFrame.

    Enabled stages always run in this order: duplicate removal, missing-value
    imputation, outlier replacement, datetime extraction / scaling and
    categorical encoding.  Afterwards, columns that started out as
    ``float64`` / ``int64`` are cast back to that dtype when this is lossless.

    Parameters
    ----------
    duplicates : bool
        Drop exact duplicate rows when truthy.
    missing_num : str or None
        ``'auto'`` or ``'knn'`` imputes numeric columns with ``KNNImputer``;
        any other value leaves numeric gaps untouched.
    missing_categ : str or None
        ``'most_frequent'`` fills non-numeric gaps with the column mode;
        ``'auto'`` and ``'logreg'`` both fall back to ``SimpleImputer``'s
        constant strategy (no logistic-regression imputer is implemented).
    outliers_method : object
        Only its truthiness is used: any truthy value enables IQR-based
        replacement of outliers with the column median.
    scaler : str or None
        ``'MinMax'``, ``'Standard'`` or ``'Robust'`` (exact spelling).
    extract_datetime : bool, str or iterable of column names
        Columns to split into ``<col>_year`` / ``_month`` / ``_day``.
    encode_categ : str, list or None
        ``'auto'`` encodes every text column, a list restricts the columns.
    """

    def __init__(self, duplicates=True, missing_num=None, missing_categ=None, outliers_method=None, scaler=None, extract_datetime=False, encode_categ=None):
        self.duplicates = duplicates
        self.missing_num = missing_num
        self.missing_categ = missing_categ
        self.outliers_method = outliers_method
        self.scaler = scaler
        self.extract_datetime = extract_datetime
        self.encode_categ = encode_categ

    def fit_transform(self, df):
        """Run every enabled stage on ``df`` and return the cleaned frame.

        The caller's frame is not modified: each stage works on its own copy.
        """
        original_dtypes = df.dtypes

        if self.duplicates:
            df = Duplicates().handle(df)
        if self.missing_num or self.missing_categ:
            df = MissingValues(self.missing_num, self.missing_categ).handle(df)
        if self.outliers_method:
            df = Outliers().handle(df)
        if self.scaler or self.extract_datetime:
            df = Adjust(self.scaler, self.extract_datetime).handle(df)
        if self.encode_categ:
            df = EncodeCateg(self.encode_categ).handle(df)

        return self._restore_dtypes(df, original_dtypes)

    @staticmethod
    def _is_integral(series):
        """Tell whether every value of ``series`` is a finite whole number."""
        if series.isna().any():
            return False
        as_float = series.astype('float64').to_numpy()
        return bool(np.isfinite(as_float).all() and np.equal(as_float, np.floor(as_float)).all())

    @staticmethod
    def _restore_dtypes(df, original_dtypes):
        """Cast surviving float64/int64 columns back to their original dtype.

        Columns created by the pipeline (datetime parts, dummy columns) are
        not present in ``original_dtypes`` and are skipped.  An ``int64``
        column is only restored when that does not truncate anything, so
        scaled values or fractional medians are kept as floats.
        """
        if not df.columns.is_unique or not original_dtypes.index.is_unique:
            # Label-based lookups are ambiguous with duplicated column names.
            return df

        casts = {}
        for col in df.columns:
            if col not in original_dtypes.index:
                continue
            target = original_dtypes[col]
            current = df[col]
            if current.dtype == target or not pd.api.types.is_numeric_dtype(current.dtype):
                continue
            if target == np.float64:
                casts[col] = target
            elif target == np.int64 and DataCleaning._is_integral(current):
                casts[col] = target
        return df.astype(casts) if casts else df


class Duplicates:
    """Pipeline stage that removes exact duplicate rows."""

    def handle(self, df):
        """Return a new frame without duplicate rows, re-indexed from 0.

        The input frame is left untouched.
        """
        return df.drop_duplicates().reset_index(drop=True)


class MissingValues:
    """Pipeline stage that imputes missing numeric and non-numeric values.

    ``missing_num`` and ``missing_categ`` select the strategy for numeric and
    non-numeric columns respectively; a falsy value disables that half.
    """

    def __init__(self, missing_num=None, missing_categ=None):
        self.missing_num = missing_num
        self.missing_categ = missing_categ

    def handle(self, df, _n_neighbors=5):
        """Return a copy of ``df`` with the configured imputations applied.

        ``_n_neighbors`` is forwarded to ``KNNImputer``.  The frame is
        returned as-is when nothing is configured or nothing is missing.
        """
        if not (self.missing_num or self.missing_categ):
            return df
        if not df.isna().any().any():
            return df

        df = df.copy()
        if self.missing_num:
            df = self._handle_missing_num(df, _n_neighbors)
        if self.missing_categ:
            df = self._handle_missing_categ(df, _n_neighbors)
        return df

    def _handle_missing_num(self, df, _n_neighbors):
        """Fill gaps in numeric columns with a KNN imputer (modifies ``df``).

        The imputer is fitted on all numeric columns together so neighbours
        are found from the other features; fitted on a single column it has
        nothing to measure distance with.  Only columns that actually contain
        gaps are written back, so complete columns keep values and dtype.
        """
        if self.missing_num not in ('auto', 'knn'):
            return df

        num_cols = df.select_dtypes(include=np.number).columns
        # Entirely empty columns are left out: there is nothing to learn from
        # them and KNNImputer would drop them from its output.
        usable = [col for col in num_cols if df[col].notna().any()]
        targets = [col for col in usable if df[col].isna().any()]
        if not targets:
            return df

        features = df[usable].astype('float64')
        imputed = KNNImputer(n_neighbors=_n_neighbors).fit_transform(features)

        for position, col in enumerate(usable):
            if col not in targets:
                continue
            filled = imputed[:, position]
            observed = df[col].dropna()
            if (observed % 1 == 0).all():
                # Integer-valued column: keep it integer-valued after filling.
                df[col] = pd.array(np.rint(filled).astype('int64'), dtype='Int64')
            else:
                df[col] = filled
        return df

    def _handle_missing_categ(self, df, _n_neighbors):
        """Fill gaps in non-numeric columns with ``SimpleImputer`` (modifies ``df``).

        ``_n_neighbors`` is accepted for signature symmetry and is unused.
        """
        if self.missing_categ not in ('auto', 'logreg', 'most_frequent'):
            return df

        # 'auto' and 'logreg' both use the constant strategy with
        # SimpleImputer's default fill value.
        strategy = 'most_frequent' if self.missing_categ == 'most_frequent' else 'constant'

        for col in df.select_dtypes(exclude=np.number).columns:
            column = df[col]
            if not column.isna().any():
                continue
            if strategy == 'most_frequent' and not column.notna().any():
                # No observed value to take a mode from.
                continue
            imputer = SimpleImputer(strategy=strategy)
            df[col] = imputer.fit_transform(df[[col]]).ravel()
        return df


class Outliers:
    """Pipeline stage that replaces IQR outliers in numeric columns.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of its column.
    """

    def handle(self, df):
        """Return a copy of ``df`` with outliers replaced by column medians."""
        return self.replace_outliers(df)

    @staticmethod
    def _iqr_bounds(values):
        """Return the ``(lower, upper)`` 1.5 * IQR fences of ``values``."""
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr

    def detect_outliers(self, df):
        """Return a boolean Series marking rows with at least one outlier.

        Only numeric columns are inspected.
        """
        numeric = df.select_dtypes(include=np.number)
        if numeric.shape[1] == 0:
            return pd.Series(False, index=df.index)
        lower, upper = self._iqr_bounds(numeric)
        return ((numeric < lower) | (numeric > upper)).any(axis=1)

    def replace_outliers(self, df, replacement_value=None):
        """Return a copy of ``df`` with outliers in numeric columns replaced.

        ``replacement_value`` may be ``None`` (use each column's median), a
        scalar applied to every column, or a mapping / Series indexed by
        column name.  Columns without outliers are left untouched.
        """
        df = df.copy()
        for col in df.select_dtypes(include=np.number).columns:
            values = df[col]
            lower, upper = self._iqr_bounds(values)
            flagged = ((values < lower) | (values > upper)).fillna(False).to_numpy(dtype=bool)
            if not flagged.any():
                continue

            if replacement_value is None:
                fill = values.median()
            elif np.isscalar(replacement_value):
                fill = replacement_value
            else:
                fill = replacement_value[col]
            # Go through float64 so a fractional fill value fits the column.
            df[col] = values.astype('float64').where(~flagged, fill)
        return df


class Adjust:
    """Pipeline stage for datetime feature extraction and numeric scaling.

    ``scaler`` is ``'MinMax'``, ``'Standard'`` or ``'Robust'``.
    ``extract_datetime`` is a column name, an iterable of column names, or
    ``True`` to use the columns that already have a datetime dtype.
    """

    SCALERS = ('MinMax', 'Standard', 'Robust')

    def __init__(self, scaler=None, extract_datetime=False):
        self.scaler = scaler
        self.extract_datetime = extract_datetime

    def handle(self, df):
        """Return a copy of ``df`` with datetime parts extracted and numbers scaled."""
        if not (self.scaler or self.extract_datetime):
            return df

        df = self._convert_datetime(df.copy())
        if not self.scaler:
            return df

        if self.scaler not in self.SCALERS:
            import warnings
            warnings.warn(
                f"Unknown scaler {self.scaler!r}; expected one of {self.SCALERS}. "
                "No scaling was applied."
            )
            return df

        # Scalers only accept numbers, so text columns must be left out.
        num_cols = df.select_dtypes(include=np.number).columns
        if len(num_cols) and len(df):
            scaler = getattr(preprocessing, self.scaler + 'Scaler')()
            df[num_cols] = scaler.fit_transform(df[num_cols].astype('float64'))
        return df

    def _datetime_columns(self, df):
        """Resolve ``extract_datetime`` into a list of columns present in ``df``."""
        spec = self.extract_datetime
        if spec is True:
            return list(df.select_dtypes(include=['datetime', 'datetimetz']).columns)
        if not spec:
            return []
        if isinstance(spec, str):
            # A bare string is one column name, not a sequence of characters.
            spec = [spec]
        try:
            wanted = set(spec)
        except TypeError:
            wanted = {spec}
        return [col for col in df.columns if col in wanted]

    def _convert_datetime(self, df):
        """Replace each requested column by ``_year`` / ``_month`` / ``_day`` columns.

        Unparseable entries become missing values.  A column that cannot be
        interpreted as datetimes at all is left unchanged.
        """
        for col in self._datetime_columns(df):
            try:
                parsed = pd.to_datetime(df[col], errors='coerce')
            except (ValueError, TypeError, OverflowError):
                continue
            if not pd.api.types.is_datetime64_any_dtype(parsed):
                continue
            df[f'{col}_year'] = parsed.dt.year
            df[f'{col}_month'] = parsed.dt.month
            df[f'{col}_day'] = parsed.dt.day
            df = df.drop(columns=[col])
        return df


class EncodeCateg:
    """Pipeline stage that encodes text columns as numbers.

    Columns with at most ``MAX_ONE_HOT_LEVELS`` distinct values are one-hot
    encoded (first level dropped); columns with more are label encoded.
    ``encode_categ`` is ``'auto'`` for every text column or a list of names.
    """

    MAX_ONE_HOT_LEVELS = 10

    def __init__(self, encode_categ=None):
        self.encode_categ = encode_categ

    def handle(self, df):
        """Return an encoded copy of ``df``; the input frame is not modified."""
        if not self.encode_categ:
            return df
        if isinstance(self.encode_categ, str):
            if self.encode_categ == 'auto':
                return self._auto_encode(df)
            return df
        if isinstance(self.encode_categ, (list, tuple)):
            for col in self.encode_categ:
                if col in df.columns:
                    # The encoded frame is a new object, so it must be kept.
                    df = self._auto_encode(df, col)
        return df

    @staticmethod
    def _is_text_column(series):
        """Tell whether ``series`` holds object or pandas string values."""
        return series.dtype == object or isinstance(series.dtype, pd.StringDtype)

    def _auto_encode(self, df, col=None):
        """Encode ``col`` (or every text column when ``col`` is None).

        Returns a new frame.  Columns that are not text are left unchanged.
        """
        if col is not None:
            targets = [col] if self._is_text_column(df[col]) else []
        else:
            targets = [name for name in df.columns if self._is_text_column(df[name])]
        if not targets:
            return df

        df = df.copy()
        for name in targets:
            if df[name].nunique(dropna=False) <= self.MAX_ONE_HOT_LEVELS:
                df = pd.get_dummies(df, columns=[name], prefix=[name], drop_first=True)
            else:
                df[name] = LabelEncoder().fit_transform(df[name])
        return df

In [None]:
# Load the raw loan dataset and show its schema.
data = pd.read_csv("loan_data.csv")
print('original data info :')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
data.info()
def detect_outliers_iqr(data):
    """Collect the IQR outliers of every numeric column of ``data``.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of its column.

    Returns a DataFrame with one column per numeric input column, indexed by
    the rows that hold at least one outlier.  Cells that are not outliers are
    NaN, so ``result.any()`` tells per column whether outliers were found.
    An empty DataFrame is returned when ``data`` has no numeric column.
    """
    flagged = []
    # select_dtypes covers every numeric width (int32, float32, ...), not
    # only the int64 / float64 columns a dtype-name comparison would catch.
    for column in data.select_dtypes(include=np.number).columns:
        values = data[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        is_outlier = (values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)
        flagged.append(values[is_outlier])

    if not flagged:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(flagged, axis=1)

# Per numeric column: does the raw dataset contain at least one IQR outlier?
outliers = detect_outliers_iqr(data)
print('', 'Outliers in original dataset :', sep='\n')
print(outliers.any())

original data info :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4587 entries, 0 to 4586
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ApplicantIncome    4157 non-null   float64
 1   CoapplicantIncome  4587 non-null   float64
 2   LoanAmount         4145 non-null   float64
 3   Loan_Amount_Term   4587 non-null   int64  
 4   Credit_History     4587 non-null   int64  
 5   Education          4587 non-null   object 
 6   Property_Area      4587 non-null   object 
 7   Loan_Status        4587 non-null   object 
dtypes: float64(3), int64(2), object(3)
memory usage: 286.8+ KB
None

Outliers in original dataset :
ApplicantIncome       True
CoapplicantIncome     True
LoanAmount            True
Loan_Amount_Term     False
Credit_History       False
dtype: bool


In [None]:
# Pipeline configuration, one option per line.
# NOTE(review): 'minMax' is not one of the scaler names Adjust accepts
# ('MinMax', 'Standard', 'Robust'), so no scaling takes place with this value.
# NOTE(review): 'year' does not name a column of the loan dataset, so no
# datetime extraction takes place either.
# NOTE(review): only the truthiness of outliers_method is consulted.
clean_data = DataCleaning(
    duplicates=True,
    missing_num='knn',
    missing_categ='most_frequent',
    outliers_method=True,
    scaler='minMax',
    extract_datetime='year',
    encode_categ='auto',
)

In [None]:
# Clean a copy so the raw `data` frame loaded earlier is guaranteed to stay
# intact and this cell gives the same result every time it is re-run.
cleaned_data = clean_data.fit_transform(data.copy())
print('cleaned data info :')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
cleaned_data.info()

def detect_outliers_iqr(data):
    """Collect the IQR outliers of every numeric column of ``data``.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of its column.

    Returns a DataFrame with one column per numeric input column, indexed by
    the rows that hold at least one outlier.  Cells that are not outliers are
    NaN, so ``result.any()`` tells per column whether outliers were found.
    An empty DataFrame is returned when ``data`` has no numeric column.

    NOTE(review): this notebook defines a function of the same name in the
    data-loading cell as well; consider keeping a single definition.
    """
    flagged = []
    # select_dtypes covers every numeric width (int32, float32, ...), not
    # only the int64 / float64 columns a dtype-name comparison would catch.
    for column in data.select_dtypes(include=np.number).columns:
        values = data[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        is_outlier = (values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)
        flagged.append(values[is_outlier])

    if not flagged:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(flagged, axis=1)

# Per numeric column: does the cleaned dataset still contain an IQR outlier?
outliers = detect_outliers_iqr(cleaned_data)
print('', 'Outliers in cleaned dataset :', sep='\n')
print(outliers.any())

cleaned data info :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4587 entries, 0 to 4586
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ApplicantIncome    4587 non-null   float64
 1   CoapplicantIncome  4587 non-null   float64
 2   LoanAmount         4587 non-null   float64
 3   Loan_Amount_Term   4587 non-null   int64  
 4   Credit_History     4587 non-null   int64  
 5   Education          4587 non-null   object 
 6   Property_Area      4587 non-null   object 
 7   Loan_Status        4587 non-null   object 
dtypes: float64(3), int64(2), object(3)
memory usage: 286.8+ KB
None

Outliers in cleaned dataset :
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount           False
Loan_Amount_Term     False
Credit_History       False
dtype: bool


In [None]:
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.svm import SVR, SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
import pandas as pd

# Regression dataset: every column except 'energy_output' is a feature.
reg_data = pd.read_csv('powerplant_energy_data.csv')
y_reg = reg_data['energy_output']
X_reg = reg_data.drop(columns='energy_output')

# Classification dataset: every column except 'diagnosis' is a feature.
class_data = pd.read_csv('thyroid_cancer_data.csv')
y_class = class_data['diagnosis']
X_class = class_data.drop(columns='diagnosis')

In [None]:
# ---- Decision trees: search spaces ------------------------------------
# Regressor and classifier are tuned over the same three settings.
dt_regressor_params = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
dt_classifier_params = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# ---- Regression: 80/20 hold-out, 3-fold grid search on the training part
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
dt_regressor = DecisionTreeRegressor(random_state=0)
grid_regressor = GridSearchCV(
    estimator=dt_regressor,
    param_grid=dt_regressor_params,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid_regressor.fit(X_reg_train, y_reg_train)
# With the default refit=True, predict() delegates to best_estimator_.
y_reg_pred = grid_regressor.predict(X_reg_test)
r2 = r2_score(y_reg_test, y_reg_pred)
print("R2 score for Regression:", r2)
print("Best Parameters for Regression:", grid_regressor.best_params_)

# ---- Classification: same protocol, selected by accuracy
X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)
dt_classifier = DecisionTreeClassifier(random_state=0)
grid_classifier = GridSearchCV(
    estimator=dt_classifier,
    param_grid=dt_classifier_params,
    scoring='accuracy',
    cv=3,
)
grid_classifier.fit(X_class_train, y_class_train)
y_class_pred = grid_classifier.predict(X_class_test)
accuracy = accuracy_score(y_class_test, y_class_pred)
print("\nAccuracy for Classification:", accuracy)
print("Best Parameters for Classification:", grid_classifier.best_params_)

R2 score for Regression: 0.9451894316423902
Best Parameters for Regression: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}

Accuracy for Classification: 0.9210526315789473
Best Parameters for Classification: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5}


In [None]:
# ---- Random forests: search spaces ------------------------------------
# Regressor and classifier are tuned over the same four settings.
rf_regressor_params = {
    'n_estimators': [10, 20, 30],
    'max_depth': [15, 20, 30, 50],
    'min_samples_leaf': [1, 2, 4, 8],
    'bootstrap': [True, False],
}
rf_classifier_params = {
    'n_estimators': [10, 20, 30],
    'max_depth': [15, 20, 30, 50],
    'min_samples_leaf': [1, 2, 4, 8],
    'bootstrap': [True, False],
}

# ---- Regression: 80/20 hold-out, 3-fold grid search on the training part
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
rf_regressor = RandomForestRegressor(random_state=0)
grid_regressor = GridSearchCV(
    estimator=rf_regressor,
    param_grid=rf_regressor_params,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid_regressor.fit(X_reg_train, y_reg_train)
# With the default refit=True, predict() delegates to best_estimator_.
y_reg_pred = grid_regressor.predict(X_reg_test)
r2 = r2_score(y_reg_test, y_reg_pred)
print("R2 score for Regression:", r2)
print("Best Parameters for Regression:", grid_regressor.best_params_)

# ---- Classification: same protocol, selected by accuracy
X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)
rf_classifier = RandomForestClassifier(random_state=0)
grid_classifier = GridSearchCV(
    estimator=rf_classifier,
    param_grid=rf_classifier_params,
    scoring='accuracy',
    cv=3,
)
grid_classifier.fit(X_class_train, y_class_train)
y_class_pred = grid_classifier.predict(X_class_test)
accuracy = accuracy_score(y_class_test, y_class_pred)
print("\nAccuracy for Classification:", accuracy)
print("Best Parameters for Classification:", grid_classifier.best_params_)

R2 score for Regression: 0.9632158655441234
Best Parameters for Regression: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'n_estimators': 30}

Accuracy for Classification: 0.9736842105263158
Best Parameters for Classification: {'bootstrap': True, 'max_depth': 15, 'min_samples_leaf': 2, 'n_estimators': 10}


In [None]:
# ---- Gradient boosting: search spaces ---------------------------------
gbm_regressor_params = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3, 4],
}
# The classifier grid additionally varies the row subsampling fraction.
gbm_classifier_params = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'subsample': [0.8, 0.9, 1.0],
}

# ---- Regression: 80/20 hold-out, 3-fold grid search on the training part
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
gbm_regressor = GradientBoostingRegressor(random_state=0)
grid_regressor = GridSearchCV(
    estimator=gbm_regressor,
    param_grid=gbm_regressor_params,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid_regressor.fit(X_reg_train, y_reg_train)
# With the default refit=True, predict() delegates to best_estimator_.
y_reg_pred = grid_regressor.predict(X_reg_test)
r2 = r2_score(y_reg_test, y_reg_pred)
print("R2 score for Regression:", r2)
print("Best Parameters for Regression:", grid_regressor.best_params_)

# ---- Classification: same protocol, selected by accuracy
X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)
gbm_classifier = GradientBoostingClassifier(random_state=0)
grid_classifier = GridSearchCV(
    estimator=gbm_classifier,
    param_grid=gbm_classifier_params,
    scoring='accuracy',
    cv=3,
)
grid_classifier.fit(X_class_train, y_class_train)
y_class_pred = grid_classifier.predict(X_class_test)
accuracy = accuracy_score(y_class_test, y_class_pred)
print("\nAccuracy for Classification:", accuracy)
print("Best Parameters for Classification:", grid_classifier.best_params_)

R2 score for Regression: 0.9675191975526548
Best Parameters for Regression: {'learning_rate': 0.2, 'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 150}

Accuracy for Classification: 0.9385964912280702
Best Parameters for Classification: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.9}


In [None]:
# ---- Support vector machines: search spaces ---------------------------
svm_regressor_params = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.1, 1, 10],
    'epsilon': [0.1, 0.2, 0.5],
}
svm_classifier_params = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None],
}

# ---- Regression: 80/20 hold-out, 3-fold grid search on the training part
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
svm_regressor = SVR()
grid_regressor = GridSearchCV(
    estimator=svm_regressor,
    param_grid=svm_regressor_params,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid_regressor.fit(X_reg_train, y_reg_train)
# With the default refit=True, predict() delegates to best_estimator_.
y_reg_pred = grid_regressor.predict(X_reg_test)
r2 = r2_score(y_reg_test, y_reg_pred)
print("R2 score for Regression:", r2)
print("Best Parameters for Regression:", grid_regressor.best_params_)

# ---- Classification: same protocol, selected by accuracy
X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)
svm_classifier = SVC()
grid_classifier = GridSearchCV(
    estimator=svm_classifier,
    param_grid=svm_classifier_params,
    scoring='accuracy',
    cv=3,
)
grid_classifier.fit(X_class_train, y_class_train)
y_class_pred = grid_classifier.predict(X_class_test)
accuracy = accuracy_score(y_class_test, y_class_pred)
print("\nAccuracy for Classification:", accuracy)
print("Best Parameters for Classification:", grid_classifier.best_params_)

R2 score for Regression: 0.9291343762670312
Best Parameters for Regression: {'C': 1, 'epsilon': 0.2, 'kernel': 'linear'}
