# Init


In [None]:
# Install necessary requirements
%pip install -r ../requirements.txt

# Change working directory to root
import os
if os.getcwd().endswith("notebooks"):
    %cd ..
    print(os.getcwd())

# Automatically reload changes in code
%load_ext autoreload
%autoreload 2

In [None]:
import logging
from typing import List

import pandas as pd

from association_finder.concept_drifts_finder import ConceptDriftsFinder
from association_finder.models import Transaction, ConceptDriftResult
# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

# Read and parse file

In [None]:

# def preprocess(df, train_params: Optional[TrainParams] = None):
    
#     # Defining numeric and categorical columns
#     numeric_columns = df.dtypes[(df.dtypes == "float64") | (df.dtypes == "int64")].index.tolist()
#     very_numerical = [nc for nc in numeric_columns if df[nc].nunique() > 20]
#     categorical_columns = [c for c in df.columns if c not in numeric_columns]
#     ordinals = list(set(numeric_columns) - set(very_numerical))

#     # Filling Null Values with the column's mean
#     na_columns = df[very_numerical].isna().sum()
#     na_columns = na_columns[na_columns > 0]

#     na_columns_mean = {}
        
#     for nc in na_columns.index:
#         if train_params is None:
#             column_mean = df[nc].mean()

#             # Save mean
#             na_columns_mean[nc] = column_mean
#         else:
#             column_mean = train_params.na_columns_mean[nc]

#         df[nc].fillna(column_mean, inplace=True)

#     # print(na_columns_mean,'\n')
    
#     # Dropping and filling NA values for categorical columns:
#     # drop if at least 70% are NA:
#     nul_cols = df[categorical_columns].isna().sum() / len(df)
#     drop_us = nul_cols[nul_cols > 0.7]
    
#     # print('nul_cols: ',nul_cols,'\n')
#     # print('drop_us: ',drop_us,'\n')
    
#     df = df.drop(drop_us.index, axis=1)

#     # Fill with a new 'na' category:
#     categorical_columns = list(set(categorical_columns) - set(drop_us.index))
#     df[categorical_columns] = df[categorical_columns].fillna('na')

#     # Fill Null values in ordinals with a new '-1' ordinal:
#     df[ordinals] = df[ordinals].fillna(-1)

#     # Turn Yes/No columns into 1/0 columns, respectively.
#     df['RainToday']= df.RainToday.map(dict(Yes=1, No=0))
#     df['RainTomorrow'] = df.RainTomorrow.map(dict(Yes=1, No=0))
    
#     df = df.copy()

#     # Bin numerical data
#     numerical_columns_cut = {}
#     for c in very_numerical:
#         if train_params is None:
#             try:
#                 # df[c] = pd.qcut(df[c], 5, labels=["very low", "low", "medium", "high", "very high"])
#                 df[c], bins = pd.qcut(df[c], 5, labels=[1, 2, 3, 4, 5], retbins=True)
#             except:
#                 # sometimes for highly skewed data, we cannot perform qcut as most quantiles are equal
#                 # df[c] = pd.cut(df[c], 5, labels=["very low", "low", "medium", "high", "very high"])
#                 df[c], bins = pd.cut(df[c], 5, labels=[1, 2, 3, 4, 5], retbins=True)

#             # Make bin edges larger (infinity and -infinity)
#             bins = np.concatenate(([-np.inf], bins[1:-1], [np.inf]))

#             # Save bin
#             numerical_columns_cut[c] = bins
            
#         else:
#             # Use existing train bins
#             bins = train_params.numerical_columns_cut[c]
#             df[c] = pd.cut(df[c], labels=[1, 2, 3, 4, 5], bins=bins)
        
#     return df, TrainParams(na_columns_mean, numerical_columns_cut),[d for d in drop_us.keys()]

# def split_X_y(df_prep: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    
#     X_columns = list(set(good_columns) - set(train_params.dropped_columns) - {target_column})
#     X = df_prep[X_columns]

#     for one_hot_column in one_hot_columns:
#         X = pd.concat([X, pd.get_dummies(X[one_hot_column], prefix=one_hot_column)], axis=1)
#         X = X.drop(columns=[one_hot_column])

#     y = df_prep[target_column]
    
#     return X, y

In [None]:
from sklearn.model_selection import train_test_split
from association_finder.preprocessing import preprocess_dataset, split_X_y
from typing import Dict, Tuple, Optional
from dataclasses import dataclass
import numpy as np

# Seed NumPy's global RNG; train_test_split below is called without an explicit
# random_state, so the split is drawn from this global generator.
np.random.seed(0)

train_dataset_path = "datasets/rain_in_australia/weatherAUS.csv"
# df = pd.read_csv(train_dataset_path)
df = pd.read_csv(train_dataset_path, index_col='Date')

# Drop rows with NaN values in the following columns.
# The filter is applied row-wise with dropna instead of df.drop(<index labels>):
# the frame is indexed by 'Date', and dropping by label removes *every* row that
# shares a label with a NaN row. If dates repeat (presumably one row per
# location per date -- verify against the CSV) that also discards valid rows.
df = df.dropna(subset=['RainToday', 'RainTomorrow'])

# Turn Yes/No columns into 1/0 columns, respectively.
yes_no_to_int = {"Yes": 1, "No": 0}
df['RainToday'] = df['RainToday'].map(yes_no_to_int)
df['RainTomorrow'] = df['RainTomorrow'].map(yes_no_to_int)

# 70% of the rows go to training, 30% to validation.
df_train, df_val = train_test_split(df, test_size=0.3)

# Preprocess the training frame; the returned parameters are kept so that the
# same preprocessing can later be applied to the validation frame.
df_train_prep, train_params = preprocess_dataset(df_train)

# Focusing on prominent columns.
# Currently disabled candidates: 'Rainfall', 'Evaporation', 'Pressure9am',
# 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'WindDir9am',
# 'WindDir3pm'.
candidate_columns = [
    'Location', 'MinTemp', 'MaxTemp',
    'Sunshine', 'WindGustDir', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'WindGustSpeed',
    'RainToday', 'RainTomorrow',
]
# Keep only the candidates that preprocessing did not drop.
good_columns = [
    column for column in candidate_columns
    if column not in train_params.dropped_columns
]

target_column = "RainTomorrow"

# Columns to one-hot encode (again skipping anything preprocessing dropped).
# Currently disabled candidates: 'Date', 'RainToday', 'RainTomorrow'.
one_hot_columns = [
    column for column in ['Location', 'WindGustDir']
    if column not in train_params.dropped_columns
]



# Baseline

In [None]:
from sklearn.linear_model import LogisticRegression
from typing import Tuple

# Build the feature matrix / target pair for each split. The validation frame
# is preprocessed with the parameters obtained from the training frame.
split_args = (good_columns, train_params, one_hot_columns, target_column)
X_train, y_train = split_X_y(df_train_prep, *split_args)
df_val_prep = preprocess_dataset(df_val, train_params)[0]
X_val, y_val = split_X_y(df_val_prep, *split_args)

# Baseline model: logistic regression fitted on the training features.
clf = LogisticRegression(random_state=0, max_iter=10000)
clf.fit(X_train, y_train)

In [None]:
# Report the baseline classifier's score on the training and validation splits.
train_accuracy = clf.score(X_train, y_train)
print(f"Train accuracy: {train_accuracy}")
val_accuracy = clf.score(X_val, y_val)
print(f"Validation accuracy: {val_accuracy}")

# Results recorded from earlier runs (the settings behind each pair are not
# noted here):
# Train accuracy: 0.8448883666274971
# Validation accuracy: 0.85072792588391

# Train accuracy: 0.8194416305360833
# Validation accuracy: 0.8177349215352618

# Train accuracy: 0.8211434823128976
# Validation accuracy: 0.8209491397239553

# Build model using rules

In [None]:
from association_finder.concept_engineering import ConceptEngineering

# Fit ConceptEngineering on the training data (features plus the selected
# preprocessed columns), then apply the fitted transform to the validation
# features.
concept_engineering = ConceptEngineering(verbose=True)
train_selected_columns = df_train_prep[good_columns]
X_train_rules = concept_engineering.fit_transform(
    X_train,
    train_selected_columns,
    target_column,
    one_hot_columns,
)
X_val_rules = concept_engineering.transform(X_val)

# Same classifier settings as the baseline, trained on the transformed features.
clf_rules = LogisticRegression(random_state=0, max_iter=10000)
clf_rules.fit(X_train_rules, y_train)

train_rules_accuracy = clf_rules.score(X_train_rules, y_train)
print(f"Train accuracy: {train_rules_accuracy}")
val_rules_accuracy = clf_rules.score(X_val_rules, y_val)
print(f"Validation accuracy: {val_rules_accuracy}")

# TODO: Accuracy is the same before and after adding the rules.
# TODO: If all features are in good_columns, this cell runs for a very long time (couldn't get any accuracies).

# Analyze concepts

In [None]:
# Display settings for inspecting the concepts table: no limit on displayed
# columns, and a minimum of 200 rows when the repr is truncated.
# NOTE(review): display.max_rows is not changed here; confirm the intended
# number of rows is actually shown.
pd.options.display.max_columns = None
pd.options.display.min_rows = 200

# Last expression of the cell, so the notebook renders the frame.
concept_engineering.concepts_df