In [18]:
import datetime
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from typing import List
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
from rich import print
from rich.console import Console
from rich.panel import Panel
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def analyze_impact_features(X, target, exclude_cols=None):

    discrete_cols = X.select_dtypes(include='object').columns
    print(f"Converting discrete columns to numeric: {discrete_cols}")
    for column in discrete_cols:
        for i, value in enumerate(X[column].unique()):
            X[column] = X[column].replace(value, i)

    X_encoded = pd.get_dummies(X[discrete_cols], drop_first=True)
    X = pd.concat([X.drop(columns=discrete_cols), X_encoded], axis=1)

    # Drop rows with missing data
    X.dropna(inplace=True)

    # Make Y
    y = X[target]
    X = X.drop(columns=[target])

    # Impute missing values in predictors
    imputer = SimpleImputer(strategy='median')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=42)

    # Initialize and train the Random Forest model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train.dropna())

    # Predict on the test set
    y_pred = rf.predict(X_test)

    # Calculate and print the RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE on the test set: {rmse:.2f}")

    # Feature importance
    feature_importances = pd.DataFrame(rf.feature_importances_,
                                        index=X_train.columns,
                                        columns=['importance']).sort_values('importance', ascending=False)

    print(feature_importances)

def impute_feature(df, target_feature, features_to_use):
    """
    Imputes missing values in the 'LotFrontage' column using multivariate imputation.

    Parameters:
    - df: DataFrame containing the dataset.
    - features_to_use: List of column names to be used for imputation.
    
    Returns:
    - DataFrame with 'LotFrontage' imputed.
    """
    # Ensure 'LotFrontage' is included in the features to use
    if target_feature not in features_to_use:
        features_to_use.append(target_feature)
    
    # Filter the DataFrame to include only the specified features
    df_filtered = df[features_to_use]
    
    # Identify categorical features for encoding
    categorical_features = df_filtered.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # Define a column transformer for OneHotEncoding categorical variables
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)], remainder='passthrough')
    
    # Define the imputer model using RandomForestRegressor
    imputer = IterativeImputer(estimator=RandomForestRegressor(n_estimators=10, random_state=42),
                               max_iter=10, random_state=42)
    
    # Create a pipeline with preprocessing and imputation
    impute_pipeline = make_pipeline(preprocessor, imputer)
    
    # Perform imputation
    df_imputed = impute_pipeline.fit_transform(df_filtered)
    print(df_imputed.columns)
    #df_imputed = pd.DataFrame(df_imputed, columns=preprocessor.transformers_[0][1].get_feature_names_out().tolist() + features_to_use)
    
    # The imputed 'LotFrontage' will be among the last columns after OneHotEncoding. Locate and return it.
    # Note: Adjust the column name extraction based on actual DataFrame structure post-imputation
    #target_imputed = df_imputed.filter(like=target_feature).iloc[:, 0]  # Adjust based on actual column positioning
    
    # Insert the imputed 'LotFrontage' values back into the original DataFrame
    #df[target_feature + '_Imputed'] = target_imputed.values
    
    return df

df = pd.read_csv('data/train.csv')
#print(df.head())
df_imputed = impute_feature(df, 'LotFrontage', ['LotArea', 'Condition1', 'RoofMatl', 'LotConfig', 'GarageArea'])
print(df_imputed[['LotFrontage', 'LotFrontage_Imputed']])


# Example usage:
# df_imputed = impute_lot_frontage(df, ['LotArea', 'Condition1', 'RoofMatl', 'LotConfig', 'GarageArea'])
# print(df_imputed[['LotFrontage', 'LotFrontage_Imputed']])







TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.