# Init


In [None]:
# Install necessary requirements
# The path is relative to the working directory at the time this cell runs,
# i.e. before the `%cd ..` below has taken effect.
%pip install -r ../requirements.txt

# Change working directory to root
# Only moves up when the current directory path ends with "notebooks"; the
# relative paths used later (e.g. "datasets/...") are resolved from there.
import os
if os.getcwd().endswith("notebooks"):
    %cd ..
    print(os.getcwd())

# Automatically reload changes in code
%load_ext autoreload
%autoreload 2

In [None]:
import logging
from typing import List

import pandas as pd

from association_finder.concept_drifts_finder import ConceptDriftsFinder
from association_finder.models import Transaction, ConceptDriftResult
# Configure the root logger with the library defaults so log records emitted
# through the `logging` module get a handler.
logging.basicConfig()

# Read and parse file

In [None]:
from sklearn.model_selection import train_test_split
from typing import Dict, Optional
from dataclasses import dataclass
import numpy as np

# Seed NumPy's global generator so the train/validation split below is
# reproducible every time this cell is run.
np.random.seed(0)

# Path is relative to the repository root (see the init cell).
dataset_path = "datasets/houseprices/train.csv"
# Fixed: the original read from `train_dataset_path`, a name that is never
# defined, so the cell failed with a NameError.
df = pd.read_csv(dataset_path, index_col='Id')
# 70% of the rows go to training, 30% to validation.
df_train, df_val = train_test_split(df, test_size=0.3)

@dataclass
class TrainParams:
    """Values learned while preprocessing the training split.

    ``preprocess`` returns an instance when it is called without parameters
    and accepts one back to apply the same imputation and binning to another
    frame.
    """

    # Column name -> mean used to fill missing values of that column.
    # (Keys are column labels, hence ``str`` rather than ``int``.)
    na_columns_mean: Dict[str, float]
    # Column name -> bin edges passed to ``pd.cut`` for that column.
    numerical_columns_cut: Dict[str, List[float]]
    
        
def preprocess(df, train_params: Optional[TrainParams] = None):
    """Impute missing values, clean categorical columns and bin numeric ones.

    The function runs in one of two modes:

    * **Fit** (``train_params is None``): column means and bin edges are
      computed from ``df`` and returned so they can be reused.
    * **Apply** (``train_params`` given): the stored means and bin edges are
      applied to ``df``; nothing is re-estimated from ``df``.

    Steps, in order:

    1. Numeric columns (``float64`` / ``int64``) with more than 20 distinct
       values are treated as continuous. In apply mode the set of continuous
       columns is taken from ``train_params`` so both frames are binned on
       exactly the same columns.
    2. Missing values in continuous columns are filled with the column mean.
    3. Non-numeric columns with more than 70% missing values are dropped; the
       remaining ones get missing values replaced by the category ``'na'``.
    4. Continuous columns are replaced by five ordered bins labelled 1..5
       (quantile bins, falling back to equal-width bins when the quantile
       edges are not unique). The outer edges are widened to +/- infinity
       before being stored so unseen values outside the training range still
       fall into the first or last bin.

    Args:
        df: Input frame. It is not modified; a copy is processed.
        train_params: Parameters returned by a previous fit call, or ``None``
            to fit on ``df``.

    Returns:
        A ``(processed_frame, params)`` tuple. In fit mode ``params`` holds
        the newly learned values; in apply mode it is the ``train_params``
        object that was passed in.

    Raises:
        KeyError: In apply mode, if a continuous column contains missing
            values but ``train_params`` has no mean stored for it.
    """
    fitting = train_params is None
    bin_labels = [1, 2, 3, 4, 5]

    # Work on a private copy: the caller's frame (typically a slice produced
    # by a train/test split) must not be modified, and in-place fills through
    # a chained column selection are not reliable on such slices.
    df = df.copy()

    # Defining numeric and categorical columns
    numeric_columns = df.dtypes[(df.dtypes == "float64") | (df.dtypes == "int64")].index.tolist()
    categorical_columns = [c for c in df.columns if c not in numeric_columns]
    if fitting:
        very_numerical = [nc for nc in numeric_columns if df[nc].nunique() > 20]
    else:
        # Reuse the columns chosen at fit time; deciding again from this frame
        # could pick columns that have no stored bins (or miss some that do).
        very_numerical = [c for c in train_params.numerical_columns_cut if c in df.columns]

    # Filling Null Values with the column's mean
    na_columns_mean = {}
    for nc in very_numerical:
        if fitting:
            # Store the mean of every continuous column, not only of those
            # with gaps in this frame, so a later frame with gaps in another
            # column can still be filled.
            column_mean = df[nc].mean()
            na_columns_mean[nc] = column_mean
        elif df[nc].isna().any():
            column_mean = train_params.na_columns_mean[nc]
        else:
            continue

        df[nc] = df[nc].fillna(column_mean)

    # Dropping and filling NA values for categorical columns:
    # drop if at least 70% are NA:
    # NOTE(review): the ratio is evaluated on the frame passed in, so in
    # apply mode the dropped set could differ from the one at fit time;
    # TrainParams has no field to carry it over.
    nul_cols = df[categorical_columns].isna().sum() / len(df)
    drop_us = nul_cols[nul_cols > 0.7]
    df = df.drop(drop_us.index, axis=1)

    # Fill with a new 'na' category (column order is kept stable):
    dropped = set(drop_us.index)
    categorical_columns = [c for c in categorical_columns if c not in dropped]
    if categorical_columns:
        df[categorical_columns] = df[categorical_columns].fillna('na')

    # Bin numerical data
    numerical_columns_cut = {}
    for c in very_numerical:
        if fitting:
            try:
                binned, bins = pd.qcut(df[c], 5, labels=bin_labels, retbins=True)
            except ValueError:
                # sometimes for highly skewed data, we cannot perform qcut as most quantiles are equal
                binned, bins = pd.cut(df[c], 5, labels=bin_labels, retbins=True)

            # Make bin edges larger (infinity and -infinity)
            bins = np.concatenate(([-np.inf], bins[1:-1], [np.inf]))

            # Save bin
            numerical_columns_cut[c] = bins
        else:
            # Use existing train bins
            binned = pd.cut(df[c], labels=bin_labels, bins=train_params.numerical_columns_cut[c])

        df[c] = binned

    if fitting:
        return df, TrainParams(na_columns_mean, numerical_columns_cut)

    # Nothing was learned in apply mode; hand back the parameters that were used.
    return df, train_params

# Fit the preprocessing on the training split; `train_params` keeps the
# learned values so the validation split can be transformed the same way.
df_train_prep, train_params = preprocess(df_train)
            

# Baseline

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from typing import Tuple

def split_X_y(df_prep: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a preprocessed frame into model features and target.

    Reads the module-level ``good_columns`` and ``target_column``. The
    features are every column of ``good_columns`` except the target, with
    ``'BldgType'`` replaced by one indicator column per category
    (``BldgType_<category>``), appended after the other features.

    Args:
        df_prep: Frame containing at least the columns in ``good_columns``.

    Returns:
        ``(X, y)`` where ``X`` is the feature frame and ``y`` is the
        ``target_column`` series.
    """
    # Follow the order of good_columns (duplicates removed). Going through a
    # set made the feature order depend on string hashing, which changes
    # between interpreter sessions.
    X_columns = [c for c in dict.fromkeys(good_columns) if c != target_column]
    X = df_prep[X_columns]

    # One-hot encode the only categorical feature; get_dummies drops the
    # source column and appends the indicator columns at the end.
    one_hot_feature = 'BldgType'
    X = pd.get_dummies(X, columns=[one_hot_feature], prefix=one_hot_feature)

    y = df_prep[target_column]

    return X, y

X_train, y_train = split_X_y(df_train_prep)
# The validation split is transformed with the parameters learned on train.
X_val, y_val = split_X_y(preprocess(df_val, train_params)[0])

# The two frames are one-hot encoded independently, so a category missing
# from one split would give them different columns. Align validation to the
# training layout (absent indicators become 0) so the model always sees the
# features it was fitted on.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

# Baseline: plain logistic regression on the unmodified features.
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [None]:
# Mean accuracy of the baseline classifier on each split.
train_accuracy = clf.score(X_train, y_train)
validation_accuracy = clf.score(X_val, y_val)
print(f"Train accuracy: {train_accuracy}")
print(f"Validation accuracy: {validation_accuracy}")

# Find rules

In [None]:
# Focusing on prominent columns:
good_columns = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'OverallCond', 'BldgType', 'LotArea',
                'GrLivArea', 'FullBath', 'BedroomAbvGr', 'LotFrontage', 'TotalBsmtSF', 'SalePrice']
# TODO: Check where Amit took the good columns list from

target_column = "SalePrice"

# We need to convert our dataframe to a list of transactions
# NOTE(review): this uses the raw `df` (all rows, before preprocessing), not
# the training split - confirm that is intended.
records = df[good_columns].to_dict(orient='records')
transactions = [Transaction(dict(r)) for r in records]

# Every selected column except the target is a candidate concept column.
potential_concept_columns = good_columns.copy()
potential_concept_columns.remove(target_column)

# Display setting is the same for every iteration, so set it once.
pd.set_option("display.max_columns", 20)

all_concepts = []
# Only the first four candidates are examined.
for concept_column in potential_concept_columns[:4]:
    try:
        print()
        print(f"Starting concept column '{concept_column}'")

        # Run the ConceptDriftsFinder
        concepts: List[ConceptDriftResult] = ConceptDriftsFinder().find_concept_drifts(
            transactions, concept_column, target_column,
            min_confidence=0.4, min_support=0.4, diff_threshold=0.1,
        )

        # Convert to dataframe
        all_concepts.extend(concepts)
        concepts_df = pd.DataFrame([x.to_dict() for x in concepts])

        # Print dataframe
        print(concepts_df.head())
    except Exception:
        # Best effort: a failing column must not stop the remaining ones.
        # `Exception` (instead of a bare except) keeps Ctrl-C / kernel
        # interrupts working, and the traceback is logged so the failure can
        # be diagnosed.
        print(f"Failed concept column '{concept_column}'")
        logging.exception("Failed concept column %s", concept_column)


In [None]:
# Show every result accumulated in `all_concepts` as one table
# (one row per result, built from its `to_dict()` output).
pd.DataFrame([x.to_dict() for x in all_concepts])

# Build model using rules

In [None]:
# (rows, columns) of the training rows whose OverallQual is at least 8.2.
# NOTE(review): `X_train_rules` is assigned in a later cell, so that cell has
# to be executed before this one.
X_train_rules[X_train_rules['OverallQual'] >= 8.2].shape

In [None]:
def modify_X(X):
    """Rewrite two features according to the OverallQual range of each row.

    ``X`` is changed in place and also returned.

    * ``4.6 <= OverallQual < 8.2``: ``BldgType_1Fam`` is set to 0, and
      ``FullBath`` is set to 0 where it equals 1.
    * ``OverallQual >= 8.2``: ``BldgType_1Fam`` is doubled, and ``FullBath``
      is doubled where it equals 2.

    Rows with ``OverallQual < 4.6`` are left as they are.
    """
    quality = X['OverallQual']
    mid_quality = (quality >= 4.6) & (quality < 8.2)
    high_quality = quality >= 8.2

    # Mid-range quality: switch the features off.
    X.loc[mid_quality, 'BldgType_1Fam'] = 0
    single_bath_mid = mid_quality & (X['FullBath'] == 1)
    X.loc[single_bath_mid, 'FullBath'] = 0

    # High quality: give the features twice their weight.
    X.loc[high_quality, 'BldgType_1Fam'] = X.loc[high_quality, 'BldgType_1Fam'] * 2
    double_bath_high = high_quality & (X['FullBath'] == 2)
    X.loc[double_bath_high, 'FullBath'] = X.loc[double_bath_high, 'FullBath'] * 2

    return X

# modify_X edits its argument in place, so hand it copies and keep the
# baseline feature frames untouched.
X_train_rules = modify_X(X_train.copy())
X_val_rules = modify_X(X_val.copy())

In [None]:
# Same estimator settings as the baseline, fitted on the rule-adjusted
# training features.
clf_rules = LogisticRegression(random_state=0).fit(X_train_rules, y_train)

In [None]:
# Mean accuracy of the rule-adjusted classifier on each split.
rules_train_accuracy = clf_rules.score(X_train_rules, y_train)
rules_validation_accuracy = clf_rules.score(X_val_rules, y_val)
print(f"Train accuracy: {rules_train_accuracy}")
print(f"Validation accuracy: {rules_validation_accuracy}")