In [None]:
!pip install dagshub mlflow

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# To see all columns

In [None]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.width', None)        
pd.set_option('display.expand_frame_repr', False)

# Read and Split data

In [None]:
df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save Id columns

In [None]:
# pop() removes 'Id' from the frame in place, so guard the call: re-running
# this cell would otherwise fail with a KeyError once the column is gone.
if 'Id' in X_train.columns:
    train_ids = X_train.pop('Id')
if 'Id' in X_test.columns:
    test_ids = X_test.pop('Id')

# Build processor to handle nulls in data

In [None]:
def custom_preprocess(df: pd.DataFrame, threshold: float = 0.8) -> pd.DataFrame:
    """Drop mostly-empty columns and fill the remaining missing values.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified; a new frame is returned.
    threshold : float, default 0.8
        Columns whose fraction of missing values is greater than or equal to
        this value are dropped.

    Returns
    -------
    pd.DataFrame
        ``df`` without the dropped columns, with NaNs replaced by 0 in numeric
        columns and by the string "NO" in object columns.

    Notes
    -----
    The drop decision is computed from the frame that is passed in, so calling
    this function separately on two frames can produce different column sets.
    """
    # 1. Drop columns with too many missing values
    null_fraction = df.isnull().mean()
    cols_to_drop = null_fraction[null_fraction >= threshold].index
    df = df.drop(columns=cols_to_drop)  # new object: the caller's frame is untouched

    # 2. Separate numerical and categorical columns.
    #    'number' matches every integer/float width, not only the 64-bit ones.
    numeric_cols = df.select_dtypes(include='number').columns
    categoric_cols = df.select_dtypes(include=['object']).columns

    # 3. Fill numeric NaNs with 0
    if len(numeric_cols):
        df[numeric_cols] = df[numeric_cols].fillna(0)

    # 4. Fill categoric NaNs with "NO"
    if len(categoric_cols):
        df[categoric_cols] = df[categoric_cols].fillna("NO")

    return df

In [None]:
X_train_cleaned = custom_preprocess(X_train.copy())
X_test_cleaned = custom_preprocess(X_test.copy())

# Split categorical columns between WOE encoding and one-hot encoding (OHE)

In [None]:
cat_cols = [col for col in X_train_cleaned.columns if X_train_cleaned[col].dtype == 'object']
num_cols = [col for col in X_train_cleaned.columns if X_train_cleaned[col].dtype != 'object']

In [None]:
s = X_train_cleaned[cat_cols].nunique()

In [None]:
# Categorical columns with more than `threshold` distinct values are WOE
# encoded; the remaining ones are one-hot encoded.
threshold = 3

woe_columns = list(s[s > threshold].index)
one_hot_columns = list(s[s <= threshold].index)

In [None]:
X_train_cleaned[woe_columns].mode().T[0].to_dict()

# Custom Preprocessor class

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Encode categorical columns with Weight of Evidence (WOE) or one-hot dummies.

    Parameters
    ----------
    woe_columns : list of str
        Columns that are replaced by a single ``<col>_woe`` column.
    one_hot_columns : list of str
        Columns expanded with ``pd.get_dummies`` (first level dropped, an
        extra ``<col>_nan`` indicator added).

    Attributes set by ``fit``
    -------------------------
    woe_columns_fill_na : dict
        Most frequent training category of every WOE column.
    woe_mappings : dict
        ``{column: {category: woe value}}``.
    iv_values : dict
        ``{column: summed information value}``.
    one_hot_categories : dict
        ``{column: categories seen during fit}``; used so that ``transform``
        always emits the same dummy columns.
    """

    def __init__(self, woe_columns, one_hot_columns):
        self.woe_columns = woe_columns # Columns Which Should Be Preprocessed Using WOE
        self.one_hot_columns = one_hot_columns # Columns Which Should Be Preprocessed Using One Hot Encoder

    def fit(self, X, y):
        """Learn WOE mappings, fallback categories and one-hot levels.

        Parameters
        ----------
        X : pd.DataFrame
            Training features containing every configured column.
        y : array-like
            Target values; stored next to ``X`` in a temporary ``'target'``
            column for the group-wise aggregation.

        Returns
        -------
        self
        """
        woe_cols = list(self.woe_columns)

        # Fallback category per WOE column, taken from the columns configured
        # on this instance (not from a notebook-level variable).
        if woe_cols:
            self.woe_columns_fill_na = X[woe_cols].mode().iloc[0].to_dict()
        else:
            self.woe_columns_fill_na = {}

        # Freeze the one-hot levels seen in training. These are the same
        # levels get_dummies would derive from X itself.
        self.one_hot_categories = {
            col: pd.Categorical(X[col]).categories for col in self.one_hot_columns
        }

        df_woe = X.copy()
        target_col = 'target'
        df_woe[target_col] = y

        woe_mappings = {}
        iv_values = {}

        for col in woe_cols:
            print(f"Processing {col}...")

            # n_pos is the per-category sum of the target, n_neg the row count
            # minus that sum.
            groups = df_woe.groupby(col)[target_col].agg(['count', 'sum'])
            groups.columns = ['n_obs', 'n_pos']
            groups['n_neg'] = groups['n_obs'] - groups['n_pos']

            groups['prop_pos'] = groups['n_pos'] / groups['n_pos'].sum()
            groups['prop_neg'] = groups['n_neg'] / groups['n_neg'].sum()

            groups['woe'] = np.log(groups['prop_pos'] / groups['prop_neg'])
            groups['iv'] = (groups['prop_pos'] - groups['prop_neg']) * groups['woe']

            # Undefined ratios (division by zero, log of zero) count as 0
            groups = groups.replace([np.inf, -np.inf], 0).fillna(0)

            woe_mappings[col] = groups['woe'].to_dict()
            iv_values[col] = groups['iv'].sum()

        self.woe_mappings = woe_mappings
        self.iv_values = iv_values

        return self


    def transform(self, X):
        """Apply the fitted encodings to ``X``.

        WOE columns are replaced by ``<col>_woe``; categories without a fitted
        WOE value receive the WOE of the most frequent training category.
        One-hot columns are expanded against the levels stored by ``fit``;
        values not seen during fit are counted in the ``<col>_nan`` indicator.

        Returns
        -------
        pd.DataFrame
            Transformed copy of ``X``.
        """
        X_transformed = X.copy()

        # Preprocess WOE Columns
        print("***")
        print("Preprocessing WOE Columns")
        for col in self.woe_columns:
            X_transformed[f'{col}_woe'] = X_transformed[col].map(self.woe_mappings[col])
            X_transformed = X_transformed.drop(columns=col)

        print("Preprocessing One Hot Columns")
        # Fixing the categories makes the dummy columns independent of which
        # levels happen to occur in this particular frame.
        for col in self.one_hot_columns:
            X_transformed[col] = pd.Categorical(
                X_transformed[col], categories=self.one_hot_categories[col]
            )
        X_transformed = pd.get_dummies(X_transformed, columns=self.one_hot_columns, drop_first=True, dummy_na=True, dtype=int)

        print("Check Nans")
        na_cols = X_transformed.columns[X_transformed.isna().any()].tolist()

        print(na_cols)

        suffix = "_woe"
        for col in na_cols:
            # Strip only the trailing suffix so that source column names that
            # themselves contain underscores are handled.
            name = col[:-len(suffix)] if col.endswith(suffix) else None
            if name not in self.woe_mappings:
                print("Error Related to Nans")
                continue

            fill_category = self.woe_columns_fill_na[name]
            fill_value = self.woe_mappings[name][fill_category]
            X_transformed[col] = X_transformed[col].fillna(fill_value)

            print(col, name, "woe", fill_category, fill_value)

        return X_transformed

# Build preprocessor

In [None]:
preprocessor = CustomPreprocessor(woe_columns=woe_columns, one_hot_columns=one_hot_columns)

In [None]:
X_train_t = preprocessor.fit_transform(X_train_cleaned, y_train)

In [None]:
X_test_t = preprocessor.transform(X_test_cleaned)

# Feature Selection 
# By Correlation

In [None]:
# Feature matrix with the training target appended as 'SalePrice'
X_corr = X_train_t.assign(SalePrice=y_train)

In [None]:
corr_matrix = X_corr.corr().abs()

In [None]:
# corr_matrix was built from X_corr and therefore contains the target.
# Keep the target out of the pair search: a feature paired with 'SalePrice'
# is not redundant, and 'SalePrice' is not a column of X_train_t.
target_corr = corr_matrix['SalePrice'].drop(labels='SalePrice')
feature_corr = corr_matrix.drop(index='SalePrice', columns='SalePrice')

# Strict upper triangle: visit every unordered feature pair exactly once
mask = np.triu(np.ones(feature_corr.shape, dtype=bool), k=1)

# Find feature pairs with correlation greater than a threshold
threshold = 0.8
rows, cols = np.where(mask & (feature_corr.to_numpy() > threshold))
high_corr_pairs = [
    (feature_corr.columns[i], feature_corr.columns[j], feature_corr.iat[i, j])
    for i, j in zip(rows, cols)
]

# Display highly correlated pairs
if high_corr_pairs:
    print("Highly correlated feature pairs:")
    for feat1, feat2, corr in high_corr_pairs:
        print(f"{feat1} and {feat2}: {corr:.4f}")
else:
    print(f"No feature pairs with correlation above {threshold} found.")

# From each highly correlated pair, drop the feature that is less correlated
# with the training target (corr_matrix already holds absolute values).
features_to_drop = []
for feat1, feat2, _ in high_corr_pairs:
    if target_corr[feat1] < target_corr[feat2]:
        features_to_drop.append(feat1)
    else:
        features_to_drop.append(feat2)

# Remove duplicates while keeping a reproducible order
features_to_drop = list(dict.fromkeys(features_to_drop))
print(f"Features to drop due to high correlation: {features_to_drop}")

In [None]:
X_train_t = X_train_t.drop(columns=features_to_drop)
X_test_t = X_test_t.drop(columns=features_to_drop)

In [None]:
X_train_t.shape,X_test_t.shape

# RFE

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Scale features
# The scaled values are wrapped in a DataFrame again so that column names and
# the row index of X_train_t are kept for the plots below.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_t),
    columns=X_train_t.columns,
    index=X_train_t.index
)

# Create and fit the model for feature selection only
# step=1 removes one feature per iteration until 15 remain.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=15, step=1)
rfe.fit(X_train_scaled, y_train)

# Get selected features
# rfe.support_ is applied as a mask on the columns, so the resulting list
# follows the column order of X_train_t.
rfe_selected_features = X_train_t.columns[rfe.support_].tolist()
print("Features selected by RFE:")
for i, feature in enumerate(rfe_selected_features, 1):
    print(f"{i}. {feature}")

# Create plot for feature ranking
# Shows the 20 features with the smallest ranking values.
feature_ranking = pd.Series(rfe.ranking_, index=X_train_t.columns)
plt.figure(figsize=(12, 8))
feature_ranking.sort_values().head(20).plot(kind='bar')
plt.title('Top 20 Features by RFE Ranking (lower is better)')
plt.ylabel('Ranking')
plt.tight_layout()
plt.savefig("rfe_feature_ranking.png")

# Create before/after scaling visualization for a few selected features
# Uses the first three entries of rfe_selected_features; each row of the
# 3x2 grid shows one feature (left: raw values, right: scaled values).
plt.figure(figsize=(15, 10))
for i, feature in enumerate(rfe_selected_features[:3], 1):
    plt.subplot(3, 2, 2*i-1)
    plt.hist(X_train_t[feature], bins=30)
    plt.title(f'{feature} - Before Scaling')

    plt.subplot(3, 2, 2*i)
    plt.hist(X_train_scaled[feature], bins=30)
    plt.title(f'{feature} - After Scaling')

plt.tight_layout()
plt.savefig("scaling_visualization.png")

# Create scatter plots of features vs. target for top 5 features
# "Top 5" here means the first five entries of rfe_selected_features, i.e.
# the first five selected features in column order.
plt.figure(figsize=(15, 10))
for i, feature in enumerate(rfe_selected_features[:5], 1):  # Plot top 5 features
    plt.subplot(2, 3, i)
    plt.scatter(X_train_scaled[feature], y_train, alpha=0.5)
    plt.title(f'{feature} vs SalePrice')
    plt.xlabel(feature)
    plt.ylabel('SalePrice')

plt.tight_layout()
plt.savefig("feature_distributions.png")

# Show plots if in interactive mode
plt.show()

# Build Final Preprocessor

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class FinalCustomPreprocessor(BaseEstimator, TransformerMixin):
    """WOE / one-hot encoder that returns only a fixed set of output columns.

    Parameters
    ----------
    woe_columns : list of str
        Columns that are replaced by a single ``<col>_woe`` column.
    one_hot_columns : list of str
        Columns expanded with ``pd.get_dummies`` (first level dropped, an
        extra ``<col>_nan`` indicator added).
    final_columns : list of str
        Names of the transformed columns that ``transform`` returns, in this
        order.

    Attributes set by ``fit``
    -------------------------
    woe_columns_fill_na : dict
        Most frequent training category of every WOE column.
    woe_mappings : dict
        ``{column: {category: woe value}}``.
    iv_values : dict
        ``{column: summed information value}``.
    one_hot_categories : dict
        ``{column: categories seen during fit}``; used so that ``transform``
        always emits the same dummy columns.
    """

    def __init__(self, woe_columns, one_hot_columns, final_columns):
        self.woe_columns = woe_columns # Columns Which Should Be Preprocessed Using WOE
        self.one_hot_columns = one_hot_columns # Columns Which Should Be Preprocessed Using One Hot Encoder

        self.final_columns = final_columns # Columns Selected By RFE

    def fit(self, X, y):
        """Learn WOE mappings, fallback categories and one-hot levels.

        Parameters
        ----------
        X : pd.DataFrame
            Training features containing every configured column.
        y : array-like
            Target values; stored next to ``X`` in a temporary ``'target'``
            column for the group-wise aggregation.

        Returns
        -------
        self
        """
        woe_cols = list(self.woe_columns)

        # Fallback category per WOE column, taken from the columns configured
        # on this instance (not from a notebook-level variable).
        if woe_cols:
            self.woe_columns_fill_na = X[woe_cols].mode().iloc[0].to_dict()
        else:
            self.woe_columns_fill_na = {}

        # Freeze the one-hot levels seen in training. These are the same
        # levels get_dummies would derive from X itself.
        self.one_hot_categories = {
            col: pd.Categorical(X[col]).categories for col in self.one_hot_columns
        }

        df_woe = X.copy()
        target_col = 'target'
        df_woe[target_col] = y

        woe_mappings = {}
        iv_values = {}

        for col in woe_cols:
            print(f"Processing {col}...")

            # n_pos is the per-category sum of the target, n_neg the row count
            # minus that sum.
            groups = df_woe.groupby(col)[target_col].agg(['count', 'sum'])
            groups.columns = ['n_obs', 'n_pos']
            groups['n_neg'] = groups['n_obs'] - groups['n_pos']

            groups['prop_pos'] = groups['n_pos'] / groups['n_pos'].sum()
            groups['prop_neg'] = groups['n_neg'] / groups['n_neg'].sum()

            groups['woe'] = np.log(groups['prop_pos'] / groups['prop_neg'])
            groups['iv'] = (groups['prop_pos'] - groups['prop_neg']) * groups['woe']

            # Undefined ratios (division by zero, log of zero) count as 0
            groups = groups.replace([np.inf, -np.inf], 0).fillna(0)

            woe_mappings[col] = groups['woe'].to_dict()
            iv_values[col] = groups['iv'].sum()

        self.woe_mappings = woe_mappings
        self.iv_values = iv_values

        return self


    def transform(self, X):
        """Apply the fitted encodings and keep only ``final_columns``.

        WOE columns are replaced by ``<col>_woe``; categories without a fitted
        WOE value receive the WOE of the most frequent training category.
        One-hot columns are expanded against the levels stored by ``fit``;
        values not seen during fit are counted in the ``<col>_nan`` indicator.

        Returns
        -------
        pd.DataFrame
            The transformed frame restricted to ``self.final_columns``.
        """
        X_transformed = X.copy()

        # Preprocess WOE Columns
        print("***")
        print("Preprocessing WOE Columns")
        for col in self.woe_columns:
            X_transformed[f'{col}_woe'] = X_transformed[col].map(self.woe_mappings[col])
            X_transformed = X_transformed.drop(columns=col)

        print("Preprocessing One Hot Columns")
        # Fixing the categories makes the dummy columns independent of which
        # levels happen to occur in this particular frame, so every name in
        # final_columns that existed after fit also exists here.
        for col in self.one_hot_columns:
            X_transformed[col] = pd.Categorical(
                X_transformed[col], categories=self.one_hot_categories[col]
            )
        X_transformed = pd.get_dummies(X_transformed, columns=self.one_hot_columns, drop_first=True, dummy_na=True, dtype=int)

        print("Check Nans")
        na_cols = X_transformed.columns[X_transformed.isna().any()].tolist()

        print(na_cols)

        suffix = "_woe"
        for col in na_cols:
            # Strip only the trailing suffix so that source column names that
            # themselves contain underscores are handled.
            name = col[:-len(suffix)] if col.endswith(suffix) else None
            if name not in self.woe_mappings:
                print("Error Related to Nans")
                continue

            fill_category = self.woe_columns_fill_na[name]
            fill_value = self.woe_mappings[name][fill_category]
            X_transformed[col] = X_transformed[col].fillna(fill_value)

            print(col, name, "woe", fill_category, fill_value)

        return X_transformed[self.final_columns]

In [None]:
final_preprocessor = FinalCustomPreprocessor(woe_columns=woe_columns, 
                                             one_hot_columns=one_hot_columns, 
                                             final_columns=rfe_selected_features)

In [None]:
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# Clean the train split, the hold-out split and the competition test file.
# custom_preprocess decides which sparse columns to drop from the frame it is
# given, so each of these three calls makes that decision independently.
# NOTE(review): nothing here forces the three results to share the same
# columns -- confirm that every column the preprocessor needs survives in
# all three frames.
X_train_clean = custom_preprocess(X_train)
X_test_clean = custom_preprocess(X_test)
test_clean = custom_preprocess(test)

In [None]:
test_clean.shape

# LinearRegression

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Define pipeline
pipeline_lr = Pipeline([
    ('preprocess', FinalCustomPreprocessor(woe_columns=woe_columns, 
                                           one_hot_columns=one_hot_columns, 
                                           final_columns=rfe_selected_features)),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

# Start MLflow experiment/run
# NOTE(review): dagshub.init(...) is only called in a later cell, so on a
# fresh top-to-bottom run this run is recorded under whatever MLflow tracking
# URI is active at this point -- confirm that this is intended.
with mlflow.start_run(run_name="LinearRegressionPipeline"):

    # Fit the model
    pipeline_lr.fit(X_train_clean, y_train)

    # Predict
    preds_for_evaluation = pipeline_lr.predict(X_test_clean)

    # A linear model can predict zero or negative prices, for which the
    # logarithm is undefined and the metrics below would become NaN.
    # Floor the predictions at 1 before taking the log and report how many
    # values were affected.
    n_floored = int(np.sum(preds_for_evaluation < 1.0))
    if n_floored:
        print(f"Warning: {n_floored} prediction(s) below 1 were floored to 1 before the log transform")

    # Log-transform
    log_preds = np.log(np.clip(preds_for_evaluation, 1.0, None))
    log_y_test = np.log(y_test)

    # Metrics
    rmse = np.sqrt(mean_squared_error(log_y_test, log_preds))
    bias = np.mean(log_preds - log_y_test)
    variance = np.var(log_preds)

    # Log parameters (no hyperparams for basic LinearRegression, but we can still log config)
    mlflow.log_param("model", "LinearRegression")
    mlflow.log_param("fit_intercept", pipeline_lr.named_steps['model'].fit_intercept)

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("bias", bias)
    mlflow.log_metric("variance", variance)

    # Log the entire pipeline
    mlflow.sklearn.log_model(pipeline_lr, "linear_regression_pipeline")

    print("\n✅ Model and metrics logged to MLflow")
    print(f"→ RMSE: {rmse:.4f}")
    print(f"→ Bias: {bias:.4f}")
    print(f"→ Variance: {variance:.4f}")


# RandomForestRegressor

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error

# # Define the pipeline with RandomForestRegressor
# pipeline_rf = Pipeline([
#     ('preprocess', FinalCustomPreprocessor(woe_columns=woe_columns, 
#                                            one_hot_columns=one_hot_columns, 
#                                            final_columns=rfe_selected_features)),
#     ('scaler', StandardScaler()),
#     ('model', RandomForestRegressor(random_state=42))  # Use RandomForestRegressor
# ])

# # Fit the model (preprocessing, scaling, and training are done inside the pipeline)
# pipeline_rf.fit(X_train_clean, y_train)

# # Make predictions on the test set
# preds_for_evaluation = pipeline_rf.predict(X_test_clean)

# # Log-transform the predictions and actual values
# log_preds = np.log(preds_for_evaluation)
# log_y_test = np.log(y_test)  # Assuming y_test contains the actual SalePrice values for the test set

# # Calculate RMSE
# rmse = np.sqrt(mean_squared_error(log_y_test, log_preds))
# print(f"RMSE: {rmse}")

# # Calculate Bias
# bias = np.mean(log_preds - log_y_test)
# print(f"Bias: {bias}")

# # Calculate Variance
# variance = np.var(log_preds)
# print(f"Variance: {variance}") 

# XGBRegressor

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from xgboost import XGBRegressor
# from sklearn.metrics import mean_squared_error
# import numpy as np

# # Define the pipeline with XGBRegressor
# pipeline_xgb = Pipeline([
#     ('preprocess', FinalCustomPreprocessor(woe_columns=woe_columns, 
#                                            one_hot_columns=one_hot_columns, 
#                                            final_columns=rfe_selected_features)),
#     ('scaler', StandardScaler()),
#     ('model', XGBRegressor(random_state=42))  # Use XGBRegressor
# ])

# # Fit the model (preprocessing, scaling, and training are done inside the pipeline)
# pipeline_xgb.fit(X_train_clean, y_train)

# # Make predictions on the test set
# preds_for_evaluation = pipeline_xgb.predict(X_test_clean)

# # Log-transform the predictions and actual values
# log_preds = np.log(preds_for_evaluation)
# log_y_test = np.log(y_test)  # Assuming y_test contains the actual SalePrice values for the test set

# # Calculate RMSE
# rmse = np.sqrt(mean_squared_error(log_y_test, log_preds))
# print(f"RMSE: {rmse}")

# # Calculate Bias
# bias = np.mean(log_preds - log_y_test)
# print(f"Bias: {bias}")

# # Calculate Variance
# variance = np.var(log_preds)
# print(f"Variance: {variance}") 

# GradientBoostingRegressor

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.metrics import mean_squared_error
# import numpy as np

# # Define the pipeline with GradientBoostingRegressor
# pipeline_gb = Pipeline([
#     ('preprocess', FinalCustomPreprocessor(woe_columns=woe_columns, 
#                                            one_hot_columns=one_hot_columns, 
#                                            final_columns=rfe_selected_features)),
#     ('scaler', StandardScaler()),
#     ('model', GradientBoostingRegressor(
#     n_estimators=100,
#     learning_rate=0.1,
#     max_depth=3,
#     subsample=1.0,
#     random_state=None
# ))  # Use GradientBoostingRegressor
# ])

# # Fit the model (preprocessing, scaling, and training are done inside the pipeline)
# pipeline_gb.fit(X_train_clean, y_train)

# # Make predictions on the test set
# preds_for_evaluation = pipeline_gb.predict(X_test_clean)

# # Log-transform the predictions and actual values
# log_preds = np.log(preds_for_evaluation)
# log_y_test = np.log(y_test)  # Assuming y_test contains the actual SalePrice values for the test set

# # Calculate RMSE
# rmse = np.sqrt(mean_squared_error(log_y_test, log_preds))
# print(f"RMSE: {rmse}")

# # Calculate Bias
# bias = np.mean(log_preds - log_y_test)
# print(f"Bias: {bias}")

# # Calculate Variance
# variance = np.var(log_preds)
# print(f"Variance: {variance}") 

In [None]:
import dagshub
dagshub.init(repo_owner='AleksandreBakhtadze', repo_name='ML-abakh22-assignment-1', mlflow=True)


In [None]:
import mlflow
# Opens an unnamed MLflow run and logs one parameter and one metric with
# placeholder names and values.
with mlflow.start_run():
  mlflow.log_param('parameter name', 'value')
  mlflow.log_metric('metric name', 1)