In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [None]:
dataset = pd.read_csv('audiA1_price_data.csv')

# Show every column next to its position so the user can pick columns by index.
print("Column indexes:")
for i, column in enumerate(dataset.columns):
    print(f"{i} : {column}")

target_index = int(input("Enter the index of the target variable: "))
feature_indexes_str = input("Enter the indexes of the features (comma-separated): ")
# Skip blank tokens so a trailing comma ("3,5,") does not crash on int('').
feature_indexes = [int(idx) for idx in feature_indexes_str.split(',') if idx.strip()]

target_variable = dataset.columns[target_index]
features = [dataset.columns[idx] for idx in feature_indexes]
print()

# Features first, target last. The explicit copy makes `data` independent of
# `dataset`, so later column assignments do not raise SettingWithCopyWarning.
data = dataset[features + [target_variable]].copy()
data

Column indexes:
0 : index
1 : Year
2 : Type
3 : Mileage(miles)
4 : Engine
5 : PS
6 : Transmission
7 : Fuel
8 : Number_of_Owners
9 : Price(£)
10 : href
11 : PPY
12 : MileageRank
13 : PriceRank
14 : PPYRank
15 : Score
Enter the index of the target variable: 9
Enter the indexes of the features (comma-separated): 3,5,6,7,8,11


Unnamed: 0,Mileage(miles),PS,Transmission,Fuel,Number_of_Owners,PPY,Price(£)
0,44000.0,114.398422,Manual,Diesel,1,2499.166667,14995.0
1,42596.0,93.688363,Manual,Petrol,3,2688.750000,10755.0
2,42700.0,123.274162,Manual,Petrol,2,3599.666667,10799.0
3,86000.0,103.550296,Manual,Diesel,3,3745.000000,7490.0
4,104310.0,103.550296,Manual,Diesel,3,3700.000000,7400.0
...,...,...,...,...,...,...,...
466,40195.0,138.067061,Automatic,Petrol,3,5637.500000,11275.0
467,26218.0,114.398422,Manual,Petrol,1,2311.875000,18495.0
468,48571.0,123.274162,Manual,Petrol,2,2399.000000,11995.0
469,9584.0,93.688363,Manual,Petrol,1,2496.250000,19970.0


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns.
y = data[target_variable]
x = data.drop(columns=[target_variable])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# #ML models
# regression_models = [LinearRegression, DecisionTreeRegressor, RandomForestRegressor]
# classification_models = [LogisticRegression, DecisionTreeClassifier, RandomForestClassifier]

# #problem selection
# target_dtype = data[target_variable].dtype

# if target_dtype in [np.float64, np.int64]:
#     model = regression_models
# elif target_dtype == np.object:
#     model = classification_models
# else:
#     raise ValueError("Unsupported target variable type. Please ensure the target variable is numeric or categorical.")

In [None]:
# from sklearn.feature_selection import RFE
# def feature_selection(data, target_variable, num_features):
#     X = data.drop(columns=[target_variable])
#     y = data[target_variable]
#     model = LinearRegression()  # You can choose any model for feature selection
#     rfe = RFE(model, num_features)
#     selected_features = rfe.fit_transform(X, y)
#     selected_feature_indices = rfe.get_support(indices=True)
#     selected_features_df = X.iloc[:, selected_feature_indices]
#     return selected_features_df

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from scipy.stats import boxcox
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA

# Handle missing values
def mean_imputation(data):
    """Fill missing values with per-column means.

    Only numeric columns have a mean; missing values in non-numeric columns
    are left untouched.

    Args:
        data: pandas DataFrame or Series.

    Returns:
        ``data`` itself when nothing is missing, otherwise a filled copy.
    """
    if not data.isnull().any().any():
        return data
    if isinstance(data, pd.DataFrame):
        # numeric_only avoids the TypeError pandas >= 2.0 raises when the
        # frame also contains string/object columns.
        return data.fillna(data.mean(numeric_only=True))
    return data.fillna(data.mean())

def median_imputation(data):
    """Fill missing values with per-column medians.

    Only numeric columns have a median; missing values in non-numeric columns
    are left untouched.

    Args:
        data: pandas DataFrame or Series.

    Returns:
        ``data`` itself when nothing is missing, otherwise a filled copy.
    """
    if not data.isnull().any().any():
        return data
    if isinstance(data, pd.DataFrame):
        # numeric_only avoids the TypeError pandas >= 2.0 raises when the
        # frame also contains string/object columns.
        return data.fillna(data.median(numeric_only=True))
    return data.fillna(data.median())

def forward_fill(data):
    """Propagate the last valid value forward over missing entries.

    Returns ``data`` itself (not a copy) when it has no missing values.
    """
    has_missing = data.isnull().any().any()
    return data.ffill() if has_missing else data

def backward_fill(data):
    """Propagate the next valid value backward over missing entries.

    Returns ``data`` itself (not a copy) when it has no missing values.
    """
    has_missing = data.isnull().any().any()
    return data.bfill() if has_missing else data

def delete_missing_values(data):
    """Drop every row that contains at least one missing value.

    Returns ``data`` itself (not a copy) when it has no missing values.
    """
    has_missing = data.isnull().any().any()
    return data.dropna() if has_missing else data

# Handle outliers and/or skewness
def winsorize(data, lower_percentile=5, upper_percentile=95):
    """Clip numeric values to their own column's percentile bounds.

    Each numeric column is clipped to its own ``lower_percentile`` and
    ``upper_percentile`` (the previous version used one pair of bounds
    computed over the whole flattened frame and did nothing for a Series).
    Non-numeric and boolean columns are returned unchanged. The input is
    never modified.

    Args:
        data: pandas DataFrame or Series.
        lower_percentile: lower bound, in percent (0-100).
        upper_percentile: upper bound, in percent (0-100).

    Returns:
        A clipped copy with the same type, index and columns as ``data``.
    """
    def _clip(series):
        # Percentiles are undefined for strings and meaningless for booleans.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            return series
        lower_bound = series.quantile(lower_percentile / 100)
        upper_bound = series.quantile(upper_percentile / 100)
        return series.clip(lower=lower_bound, upper=upper_bound)

    if isinstance(data, pd.DataFrame):
        result = data.copy()
        for column in result.columns:
            result[column] = _clip(result[column])
        return result
    return _clip(data)

def log_transform(data):
    """Apply ``log(1 + x)`` element-wise to strictly positive data.

    Args:
        data: Series, DataFrame or array-like of numbers.

    Returns:
        ``np.log1p(data)``, same container type as the input.

    Raises:
        ValueError: if any value is zero or negative.
    """
    # np.any over the values: the built-in any() on a DataFrame iterates the
    # column labels, which made every DataFrame look "non-positive".
    if np.any(np.asarray(data) <= 0):
        raise ValueError("Log transform cannot be applied to non-positive values.")
    return np.log1p(data)

def box_cox(data):
    """Apply a Box-Cox power transform to strictly positive data.

    Args:
        data: 1-D Series/array-like, or a DataFrame of numeric columns.

    Returns:
        For 1-D input, a numpy array of transformed values. For a DataFrame,
        a DataFrame with the same index and columns where every column was
        transformed with its own fitted lambda.

    Raises:
        ValueError: if any value is zero or negative.
    """
    # np.any over the values: the built-in any() on a DataFrame iterates the
    # column labels, which made every DataFrame look "non-positive".
    if np.any(np.asarray(data) <= 0):
        raise ValueError("Box-Cox transform cannot be applied to non-positive values.")
    if isinstance(data, pd.DataFrame):
        # scipy's boxcox only accepts 1-D input, so transform column by column.
        return data.apply(
            lambda column: pd.Series(boxcox(column.to_numpy(dtype=float))[0], index=column.index)
        )
    transformed_data, _ = boxcox(np.asarray(data, dtype=float))
    return transformed_data

# Data transformation
def standardize_data(data):
    """Z-score every numeric column: ``(x - mean) / std``.

    Non-numeric, boolean and zero-variance columns are copied unchanged.

    Args:
        data: pandas DataFrame.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    standardized_data = data.copy()
    for column in data.columns:
        series = data[column]
        # is_numeric_dtype also covers int32/float32, which the old
        # ``dtype in [int, float]`` comparison silently skipped.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue
        spread = series.std()
        # std is NaN for a single row and 0 for a constant column.
        if pd.notna(spread) and spread != 0:
            standardized_data[column] = (series - series.mean()) / spread
    return standardized_data

def normalize_data(data):
    """Rescale every numeric column to the [0, 1] range.

    Non-numeric, boolean and constant columns are copied unchanged, matching
    the other scaling helpers (previously a string column raised TypeError).

    Args:
        data: pandas DataFrame.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    normalized_data = data.copy()
    for column in data.columns:
        series = data[column]
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue
        min_val = series.min()
        max_val = series.max()
        # A constant column would divide by zero; leave it as is.
        if min_val != max_val:
            normalized_data[column] = (series - min_val) / (max_val - min_val)
    return normalized_data

def min_max_scaling(data):
    """Scale every numeric, non-constant column to [0, 1] with MinMaxScaler.

    Non-numeric, boolean and zero-variance columns are copied unchanged.

    Args:
        data: pandas DataFrame.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    scaled_data = data.copy()
    for column in data.columns:
        series = data[column]
        # is_numeric_dtype also covers int32/float32, which the old
        # ``dtype in [int, float]`` comparison silently skipped.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue
        if series.std() != 0:
            scaler = MinMaxScaler()
            # fit_transform returns shape (n, 1); flatten it for column assignment.
            scaled_data[column] = scaler.fit_transform(data[[column]]).ravel()
    return scaled_data

def robust_scaling(data):
    """Scale all columns with scikit-learn's RobustScaler.

    Args:
        data: all-numeric pandas DataFrame.

    Returns:
        A new DataFrame with the same columns and the same index as ``data``.
    """
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(data)
    # Keep the original index so the result stays row-aligned with the target
    # and with other frames after a shuffled train/test split.
    scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)
    return scaled_data

# Encode categorical features
def one_hot_encoding(data, categorical_columns):
    """Replace the given categorical columns with one-hot indicator columns.

    Args:
        data: pandas DataFrame.
        categorical_columns: names of the columns to encode.

    Returns:
        A new DataFrame: the remaining columns of ``data`` followed by the
        indicator columns, on the same index as ``data``.
    """
    categorical_columns = list(categorical_columns)
    try:
        # scikit-learn >= 1.2 spells the dense-output flag ``sparse_output``.
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoded_values = encoder.fit_transform(data[categorical_columns])
    # get_feature_names was removed in scikit-learn 1.2 in favour of *_out.
    if hasattr(encoder, 'get_feature_names_out'):
        encoded_names = encoder.get_feature_names_out(categorical_columns)
    else:
        encoded_names = encoder.get_feature_names(categorical_columns)
    # Reuse data's index: concat aligns on index, and a fresh RangeIndex would
    # produce NaN-padded rows for any non-default index.
    encoded_cols = pd.DataFrame(encoded_values, columns=encoded_names, index=data.index)
    return pd.concat([data.drop(columns=categorical_columns), encoded_cols], axis=1)

def label_encoding(data, categorical_columns):
    """Replace each listed column with integer codes from LabelEncoder.

    Args:
        data: pandas DataFrame.
        categorical_columns: names of the columns to encode.

    Returns:
        An encoded copy; the caller's ``data`` is left unmodified.
    """
    # Work on a copy so the caller's frame is not overwritten in place.
    encoded = data.copy()
    for column in categorical_columns:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

# PCA
def pca_reduction(data, n_components=2):
    """Project ``data`` onto its principal components.

    Args:
        data: numeric DataFrame or 2-D array.
        n_components: forwarded unchanged to ``sklearn.decomposition.PCA``.

    Returns:
        DataFrame with columns ``PC1 .. PCk``. It keeps ``data``'s index when
        ``data`` has one.
    """
    pca = PCA(n_components=n_components)
    reduced_data = pca.fit_transform(data)
    # Name columns from the actual output width, so n_components values that
    # are not integers (a variance ratio, None, 'mle') also work.
    component_names = [f'PC{i}' for i in range(1, reduced_data.shape[1] + 1)]
    return pd.DataFrame(reduced_data, columns=component_names, index=getattr(data, 'index', None))

In [None]:
#automatic data preprocessing
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from scipy.stats import boxcox
from sklearn.impute import SimpleImputer

# Load dataset
dataset = pd.read_csv('dp_data1.csv')

# Show every column next to its position so the user can pick columns by index.
print("Column indexes:")
for i, column in enumerate(dataset.columns):
    print(f"{i} : {column}")

target_index = int(input("Enter the index of the target variable: "))
feature_indexes_str = input("Enter the indexes of the features (comma-separated): ")
# Skip blank tokens so a trailing comma ("0,1,") does not crash on int('').
feature_indexes = [int(idx) for idx in feature_indexes_str.split(',') if idx.strip()]

target_variable = dataset.columns[target_index]
features = [dataset.columns[idx] for idx in feature_indexes]
print()

# Features first, target last. The explicit copy makes `data` independent of
# `dataset`, so later column assignments do not raise SettingWithCopyWarning.
data = dataset[features + [target_variable]].copy()
data

# Data split
def split_data(dataset, target_variable):
    """Split ``dataset`` into train/test predictors and target.

    Uses a 70/30 split with a fixed seed (42).

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as given by ``train_test_split``.
    """
    target = dataset[target_variable]
    predictors = dataset.drop(columns=[target_variable])
    return train_test_split(predictors, target, test_size=0.3, random_state=42)

# Handle missing values
def mean_imputation(data):
    """Fill missing values with per-column means.

    Only numeric columns have a mean; missing values in non-numeric columns
    are left untouched.

    Args:
        data: pandas DataFrame or Series.

    Returns:
        ``data`` itself when nothing is missing, otherwise a filled copy.
    """
    if not data.isnull().any().any():
        return data
    if isinstance(data, pd.DataFrame):
        # numeric_only avoids the TypeError pandas >= 2.0 raises when the
        # frame also contains string/object columns.
        return data.fillna(data.mean(numeric_only=True))
    return data.fillna(data.mean())

def median_imputation(data):
    """Fill missing values with per-column medians.

    Only numeric columns have a median; missing values in non-numeric columns
    are left untouched.

    Args:
        data: pandas DataFrame or Series.

    Returns:
        ``data`` itself when nothing is missing, otherwise a filled copy.
    """
    if not data.isnull().any().any():
        return data
    if isinstance(data, pd.DataFrame):
        # numeric_only avoids the TypeError pandas >= 2.0 raises when the
        # frame also contains string/object columns.
        return data.fillna(data.median(numeric_only=True))
    return data.fillna(data.median())

def forward_fill(data):
    """Propagate the last valid value forward over missing entries.

    Returns ``data`` itself (not a copy) when it has no missing values.
    """
    has_missing = data.isnull().any().any()
    return data.ffill() if has_missing else data

def backward_fill(data):
    """Propagate the next valid value backward over missing entries.

    Returns ``data`` itself (not a copy) when it has no missing values.
    """
    has_missing = data.isnull().any().any()
    return data.bfill() if has_missing else data

def delete_missing_values(data):
    """Drop every row that contains at least one missing value.

    Returns ``data`` itself (not a copy) when it has no missing values.
    """
    has_missing = data.isnull().any().any()
    return data.dropna() if has_missing else data

# Handle outliers and/or skewness
def winsorize(data, lower_percentile=5, upper_percentile=95):
    """Clip numeric values to their own column's percentile bounds.

    Each numeric column is clipped to its own ``lower_percentile`` and
    ``upper_percentile`` (the previous version used one pair of bounds
    computed over the whole flattened frame and did nothing for a Series).
    Non-numeric and boolean columns are returned unchanged. The input is
    never modified.

    Args:
        data: pandas DataFrame or Series.
        lower_percentile: lower bound, in percent (0-100).
        upper_percentile: upper bound, in percent (0-100).

    Returns:
        A clipped copy with the same type, index and columns as ``data``.
    """
    def _clip(series):
        # Percentiles are undefined for strings and meaningless for booleans.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            return series
        lower_bound = series.quantile(lower_percentile / 100)
        upper_bound = series.quantile(upper_percentile / 100)
        return series.clip(lower=lower_bound, upper=upper_bound)

    if isinstance(data, pd.DataFrame):
        result = data.copy()
        for column in result.columns:
            result[column] = _clip(result[column])
        return result
    return _clip(data)

def log_transform(data):
    """Apply ``log(1 + x)`` element-wise to strictly positive data.

    Args:
        data: Series, DataFrame or array-like of numbers.

    Returns:
        ``np.log1p(data)``, same container type as the input.

    Raises:
        ValueError: if any value is zero or negative.
    """
    # np.any over the values: the built-in any() on a DataFrame iterates the
    # column labels, which made every DataFrame look "non-positive".
    if np.any(np.asarray(data) <= 0):
        raise ValueError("Log transform cannot be applied to non-positive values.")
    return np.log1p(data)

def box_cox(data):
    """Apply a Box-Cox power transform to strictly positive data.

    Args:
        data: 1-D Series/array-like, or a DataFrame of numeric columns.

    Returns:
        For 1-D input, a numpy array of transformed values. For a DataFrame,
        a DataFrame with the same index and columns where every column was
        transformed with its own fitted lambda.

    Raises:
        ValueError: if any value is zero or negative.
    """
    # np.any over the values: the built-in any() on a DataFrame iterates the
    # column labels, which made every DataFrame look "non-positive".
    if np.any(np.asarray(data) <= 0):
        raise ValueError("Box-Cox transform cannot be applied to non-positive values.")
    if isinstance(data, pd.DataFrame):
        # scipy's boxcox only accepts 1-D input, so transform column by column.
        return data.apply(
            lambda column: pd.Series(boxcox(column.to_numpy(dtype=float))[0], index=column.index)
        )
    transformed_data, _ = boxcox(np.asarray(data, dtype=float))
    return transformed_data

# Data transformation
def standardize_data(data):
    """Z-score every numeric column: ``(x - mean) / std``.

    Non-numeric, boolean and zero-variance columns are copied unchanged.

    Args:
        data: pandas DataFrame.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    standardized_data = data.copy()
    for column in data.columns:
        series = data[column]
        # is_numeric_dtype also covers int32/float32, which the old
        # ``dtype in [int, float]`` comparison silently skipped.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue
        spread = series.std()
        # std is NaN for a single row and 0 for a constant column.
        if pd.notna(spread) and spread != 0:
            standardized_data[column] = (series - series.mean()) / spread
    return standardized_data

def normalize_data(data):
    """Rescale every numeric column to the [0, 1] range.

    Non-numeric, boolean and constant columns are copied unchanged, matching
    the other scaling helpers (previously a string column raised TypeError).

    Args:
        data: pandas DataFrame.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    normalized_data = data.copy()
    for column in data.columns:
        series = data[column]
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue
        min_val = series.min()
        max_val = series.max()
        # A constant column would divide by zero; leave it as is.
        if min_val != max_val:
            normalized_data[column] = (series - min_val) / (max_val - min_val)
    return normalized_data

def min_max_scaling(data):
    """Scale every numeric, non-constant column to [0, 1] with MinMaxScaler.

    Non-numeric, boolean and zero-variance columns are copied unchanged.

    Args:
        data: pandas DataFrame.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    scaled_data = data.copy()
    for column in data.columns:
        series = data[column]
        # is_numeric_dtype also covers int32/float32, which the old
        # ``dtype in [int, float]`` comparison silently skipped.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue
        if series.std() != 0:
            scaler = MinMaxScaler()
            # fit_transform returns shape (n, 1); flatten it for column assignment.
            scaled_data[column] = scaler.fit_transform(data[[column]]).ravel()
    return scaled_data

def robust_scaling(data):
    """Scale all columns with scikit-learn's RobustScaler.

    Args:
        data: all-numeric pandas DataFrame.

    Returns:
        A new DataFrame with the same columns and the same index as ``data``.
    """
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(data)
    # Keep the original index so the result stays row-aligned with the target
    # and with other frames after a shuffled train/test split.
    scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)
    return scaled_data

# Encode categorical features
def one_hot_encoding(data, categorical_columns):
    """Replace the given categorical columns with one-hot indicator columns.

    Args:
        data: pandas DataFrame.
        categorical_columns: names of the columns to encode.

    Returns:
        A new DataFrame: the remaining columns of ``data`` followed by the
        indicator columns, on the same index as ``data``.
    """
    categorical_columns = list(categorical_columns)
    try:
        # scikit-learn >= 1.2 spells the dense-output flag ``sparse_output``.
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoded_values = encoder.fit_transform(data[categorical_columns])
    # get_feature_names was removed in scikit-learn 1.2 in favour of *_out.
    if hasattr(encoder, 'get_feature_names_out'):
        encoded_names = encoder.get_feature_names_out(categorical_columns)
    else:
        encoded_names = encoder.get_feature_names(categorical_columns)
    # Reuse data's index: concat aligns on index, and a fresh RangeIndex would
    # produce NaN-padded rows for any non-default index.
    encoded_cols = pd.DataFrame(encoded_values, columns=encoded_names, index=data.index)
    return pd.concat([data.drop(columns=categorical_columns), encoded_cols], axis=1)

def label_encoding(data, categorical_columns):
    """Replace each listed column with integer codes from LabelEncoder.

    Args:
        data: pandas DataFrame.
        categorical_columns: names of the columns to encode.

    Returns:
        An encoded copy; the caller's ``data`` is left unmodified.
    """
    # Work on a copy so the caller's frame is not overwritten in place.
    encoded = data.copy()
    for column in categorical_columns:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

# PCA
def pca_reduction(data, n_components=2):
    """Project ``data`` onto its principal components.

    Args:
        data: numeric DataFrame or 2-D array.
        n_components: forwarded unchanged to ``sklearn.decomposition.PCA``.

    Returns:
        DataFrame with columns ``PC1 .. PCk``. It keeps ``data``'s index when
        ``data`` has one.
    """
    pca = PCA(n_components=n_components)
    reduced_data = pca.fit_transform(data)
    # Name columns from the actual output width, so n_components values that
    # are not integers (a variance ratio, None, 'mle') also work.
    component_names = [f'PC{i}' for i in range(1, reduced_data.shape[1] + 1)]
    return pd.DataFrame(reduced_data, columns=component_names, index=getattr(data, 'index', None))

#types of features
def categorize_features(data):
    """Split column names into numerical and categorical lists.

    Object, string and ``category`` dtype columns count as categorical, in
    line with the encoding check in ``determine_preprocessing_steps``.
    Columns of any other dtype (for example datetimes) appear in neither list.

    Args:
        data: pandas DataFrame.

    Returns:
        ``(numerical_features, categorical_features)``, each in column order.
    """
    numerical_features = []
    categorical_features = []

    for column in data.columns:
        dtype = data[column].dtype
        if pd.api.types.is_numeric_dtype(dtype):
            numerical_features.append(column)
        elif (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            categorical_features.append(column)

    return numerical_features, categorical_features

#techniques selection
def determine_preprocessing_steps(data):
    """Decide which preprocessing stages apply and list candidate techniques.

    Rules:
      * imputation       - any missing value is present
      * outlier_handling - largest absolute skew of a numeric column is > 2
      * transformation   - largest absolute skew of a numeric column is > 0.5
      * encoding         - at least one object/category column exists

    Returns:
        ``(preprocessing_steps, techniques)``: a dict of stage -> bool and a
        list of technique function names in stage order.
    """
    techniques = []
    preprocessing_steps = dict.fromkeys(
        ['imputation', 'outlier_handling', 'transformation', 'encoding'], False
    )

    # Missing values -> offer every imputation strategy.
    if data.isnull().any().any():
        preprocessing_steps['imputation'] = True
        techniques.extend([
            'mean_imputation',
            'median_imputation',
            'forward_fill',
            'backward_fill',
            'delete_missing_values',
        ])

    # Both skew-based checks use the same statistic, so compute it once.
    numeric_part = data.select_dtypes(include=['number'])
    max_abs_skew = numeric_part.apply(lambda x: x.skew()).abs().max()

    # Heavy skew is used as a proxy for outliers.
    if max_abs_skew > 2:
        preprocessing_steps['outlier_handling'] = True
        techniques.append('winsorize')

    # Moderate skew -> offer variance-stabilising transforms.
    if max_abs_skew > 0.5:
        preprocessing_steps['transformation'] = True
        techniques.extend(['log_transform', 'box_cox'])

    # Categorical columns -> offer encoders.
    categorical_part = data.select_dtypes(include=['object', 'category'])
    if len(categorical_part.columns) > 0:
        preprocessing_steps['encoding'] = True
        techniques.extend(['one_hot_encoding', 'label_encoding'])

    return preprocessing_steps, techniques

# Modify preprocess_data function to take target_variable as input
def preprocess_data(data, target_variable):
    """Compare candidate preprocessing techniques by cross-validated accuracy.

    Every technique suggested by ``determine_preprocessing_steps`` is applied
    on its own to the training features, and a RandomForestClassifier is
    scored with 5-fold cross-validation. A final "None" candidate scores the
    untouched features as a baseline. A candidate that cannot be evaluated
    (for example string columns reaching the classifier) is reported as
    skipped instead of aborting the whole comparison.

    Args:
        data: DataFrame holding the feature columns and the target column.
        target_variable: name of the target column in ``data``.

    Returns:
        dict mapping each candidate label to its mean accuracy, or to None
        when the candidate was skipped.
    """
    # Local imports: this cell's import block does not provide these names.
    from functools import partial
    from sklearn.preprocessing import FunctionTransformer

    # Analyse the features only; the target must not drive technique selection
    # and is not present in the frames the transformers receive.
    feature_frame = data.drop(columns=[target_variable])
    _, categorical_features = categorize_features(feature_frame)
    preprocessing_steps, techniques = determine_preprocessing_steps(feature_frame)

    # (stage key, label prefix, technique function names) for whole-frame stages.
    technique_groups = [
        ('imputation', 'Imputation: ',
         ['mean_imputation', 'median_imputation', 'forward_fill', 'backward_fill', 'delete_missing_values']),
        ('outlier_handling', 'Outlier Handling: ', ['winsorize']),
        ('transformation', 'Transformation: ', ['log_transform', 'box_cox']),
    ]

    preprocessing_pipelines = []
    for step_key, label, names in technique_groups:
        if not preprocessing_steps[step_key]:
            continue
        for technique in names:
            if technique in techniques:
                # Pipeline steps must be estimators, so wrap the plain function.
                transformer = FunctionTransformer(globals()[technique])
                preprocessing_pipelines.append((label + technique, Pipeline([(step_key, transformer)])))

    # Encoders also need the list of columns to encode.
    if preprocessing_steps['encoding'] and categorical_features:
        for technique in ['one_hot_encoding', 'label_encoding']:
            if technique in techniques:
                encode = partial(globals()[technique], categorical_columns=categorical_features)
                preprocessing_pipelines.append(
                    ('Encoding: ' + technique, Pipeline([('encoding', FunctionTransformer(encode))]))
                )

    # Baseline: no preprocessing at all.
    preprocessing_pipelines.append(('None', None))

    # One split shared by all candidates, taken from the `data` argument
    # rather than from the notebook-global `dataset`.
    X_train, X_test, y_train, y_test = split_data(data, target_variable)

    results = {}
    for name, pipeline in preprocessing_pipelines:
        print(f"\nEvaluation results for preprocessing pipeline: {name}")
        try:
            if pipeline is not None:
                X_train_transformed = pipeline.fit_transform(X_train.copy(), y_train)
                y_eval = y_train
                # Row-dropping techniques shorten X; realign y by index.
                if hasattr(X_train_transformed, 'index') and len(X_train_transformed) != len(y_train):
                    y_eval = y_train.loc[X_train_transformed.index]
                scores = cross_val_score(RandomForestClassifier(random_state=42), X_train_transformed, y_eval, cv=5)
                print("Mean Cross-Validation Accuracy:", np.mean(scores))
            else:
                scores = cross_val_score(RandomForestClassifier(random_state=42), X_train, y_train, cv=5)
                print("Mean Cross-Validation Accuracy (No Preprocessing):", np.mean(scores))
            results[name] = float(np.mean(scores))
        except (ValueError, TypeError, KeyError) as error:
            print("Skipped:", error)
            results[name] = None
    return results

# Example usage
# `data` was assembled above as features + [target], so the last column is the
# target chosen at the prompt.
target_variable = data.columns[-1]  # Assuming the target variable is the last column
preprocess_data(data, target_variable)


Column indexes:
0 : Feature_1
1 : Feature_2
2 : Feature_3
3 : Feature_4
4 : Feature_5
5 : Feature_6
6 : Feature_7
7 : Feature_8
8 : Feature_9
9 : Feature_10
10 : Feature_11
11 : Feature_12
12 : Feature_13
13 : Feature_14
14 : Feature_15
15 : Feature_16
16 : Feature_17
17 : Feature_18
18 : Feature_19
19 : Feature_20
20 : Feature_21
21 : Feature_22
22 : Feature_23
23 : Feature_24
24 : Feature_25
Enter the index of the target variable: 10
Enter the indexes of the features (comma-separated): 0,1,2,3,4,5,6,7,8,9



NameError: name 'categorical_columns' is not defined

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from scipy.stats import boxcox
from sklearn.impute import SimpleImputer

# Load dataset
# The relative path is resolved against the notebook's working directory.
dataset = pd.read_csv('dp_data1.csv')

def categorize_columns(dataset):
    """Split column names into numerical and categorical lists.

    Any numeric dtype (int32, float32, unsigned and nullable integers, ...)
    counts as numerical. Everything else, booleans included, is treated as
    categorical.

    Args:
        dataset: pandas DataFrame.

    Returns:
        ``(numerical_columns, categorical_columns)``, each in column order.
    """
    numerical_columns = []
    categorical_columns = []

    for column in dataset.columns:
        dtype = dataset[column].dtype
        # The old ``dtype in ['int64', 'float64']`` test sent every other
        # numeric width to the categorical list.
        is_number = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
        if is_number:
            numerical_columns.append(column)
        else:
            categorical_columns.append(column)

    return numerical_columns, categorical_columns

def split_data(dataset, target_variable):
    """Split ``dataset`` into train/test predictors and target.

    Uses a 70/30 split with a fixed seed (42).

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as given by ``train_test_split``.
    """
    target = dataset[target_variable]
    predictors = dataset.drop(columns=[target_variable])
    return train_test_split(predictors, target, test_size=0.3, random_state=42)

def mean_imputation(data):
    """Fill missing values with per-column means.

    Only numeric columns have a mean; missing values in non-numeric columns
    are left untouched.

    Args:
        data: pandas DataFrame or Series.

    Returns:
        ``data`` itself when nothing is missing, otherwise a filled copy.
    """
    if not data.isnull().any().any():
        return data
    if isinstance(data, pd.DataFrame):
        # numeric_only avoids the TypeError pandas >= 2.0 raises when the
        # frame also contains string/object columns.
        return data.fillna(data.mean(numeric_only=True))
    return data.fillna(data.mean())

def median_imputation(data):
    """Fill missing values with per-column medians.

    Only numeric columns have a median; missing values in non-numeric columns
    are left untouched.

    Args:
        data: pandas DataFrame or Series.

    Returns:
        ``data`` itself when nothing is missing, otherwise a filled copy.
    """
    if not data.isnull().any().any():
        return data
    if isinstance(data, pd.DataFrame):
        # numeric_only avoids the TypeError pandas >= 2.0 raises when the
        # frame also contains string/object columns.
        return data.fillna(data.median(numeric_only=True))
    return data.fillna(data.median())

def forward_fill(data):
    """Propagate the last valid value forward over missing entries.

    Returns ``data`` itself (not a copy) when it has no missing values.
    """
    has_missing = data.isnull().any().any()
    return data.ffill() if has_missing else data

def backward_fill(data):
    """Propagate the next valid value backward over missing entries.

    Returns ``data`` itself (not a copy) when it has no missing values.
    """
    has_missing = data.isnull().any().any()
    return data.bfill() if has_missing else data

def delete_missing_values(data):
    """Drop every row that contains at least one missing value.

    Returns ``data`` itself (not a copy) when it has no missing values.
    """
    has_missing = data.isnull().any().any()
    return data.dropna() if has_missing else data

def winsorize(data, lower_percentile=5, upper_percentile=95):
    """Clip numeric values to their own column's percentile bounds.

    Each numeric column is clipped to its own ``lower_percentile`` and
    ``upper_percentile`` (the previous version used one pair of bounds
    computed over the whole flattened frame and did nothing for a Series).
    Non-numeric and boolean columns are returned unchanged. The input is
    never modified.

    Args:
        data: pandas DataFrame or Series.
        lower_percentile: lower bound, in percent (0-100).
        upper_percentile: upper bound, in percent (0-100).

    Returns:
        A clipped copy with the same type, index and columns as ``data``.
    """
    def _clip(series):
        # Percentiles are undefined for strings and meaningless for booleans.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            return series
        lower_bound = series.quantile(lower_percentile / 100)
        upper_bound = series.quantile(upper_percentile / 100)
        return series.clip(lower=lower_bound, upper=upper_bound)

    if isinstance(data, pd.DataFrame):
        result = data.copy()
        for column in result.columns:
            result[column] = _clip(result[column])
        return result
    return _clip(data)

def log_transform(data):
    """Apply ``log(1 + x)`` element-wise to strictly positive data.

    Args:
        data: Series, DataFrame or array-like of numbers.

    Returns:
        ``np.log1p(data)``, same container type as the input.

    Raises:
        ValueError: if any value is zero or negative.
    """
    # np.any over the values: the built-in any() on a DataFrame iterates the
    # column labels, which made every DataFrame look "non-positive".
    if np.any(np.asarray(data) <= 0):
        raise ValueError("Log transform cannot be applied to non-positive values.")
    return np.log1p(data)

def box_cox(data):
    """Apply a Box-Cox power transform to strictly positive data.

    Args:
        data: 1-D Series/array-like, or a DataFrame of numeric columns.

    Returns:
        For 1-D input, a numpy array of transformed values. For a DataFrame,
        a DataFrame with the same index and columns where every column was
        transformed with its own fitted lambda.

    Raises:
        ValueError: if any value is zero or negative.
    """
    # np.any over the values: the built-in any() on a DataFrame iterates the
    # column labels, which made every DataFrame look "non-positive".
    if np.any(np.asarray(data) <= 0):
        raise ValueError("Box-Cox transform cannot be applied to non-positive values.")
    if isinstance(data, pd.DataFrame):
        # scipy's boxcox only accepts 1-D input, so transform column by column.
        return data.apply(
            lambda column: pd.Series(boxcox(column.to_numpy(dtype=float))[0], index=column.index)
        )
    transformed_data, _ = boxcox(np.asarray(data, dtype=float))
    return transformed_data

def standardize_data(data):
    """Z-score every numeric column: ``(x - mean) / std``.

    Non-numeric, boolean and zero-variance columns are copied unchanged.

    Args:
        data: pandas DataFrame.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    standardized_data = data.copy()
    for column in data.columns:
        series = data[column]
        # is_numeric_dtype also covers int32/float32, which the old
        # ``dtype in [int, float]`` comparison silently skipped.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue
        spread = series.std()
        # std is NaN for a single row and 0 for a constant column.
        if pd.notna(spread) and spread != 0:
            standardized_data[column] = (series - series.mean()) / spread
    return standardized_data

def normalize_data(data):
    """Rescale every numeric column to the [0, 1] range.

    Non-numeric, boolean and constant columns are copied unchanged, matching
    the other scaling helpers (previously a string column raised TypeError).

    Args:
        data: pandas DataFrame.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    normalized_data = data.copy()
    for column in data.columns:
        series = data[column]
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue
        min_val = series.min()
        max_val = series.max()
        # A constant column would divide by zero; leave it as is.
        if min_val != max_val:
            normalized_data[column] = (series - min_val) / (max_val - min_val)
    return normalized_data

def min_max_scaling(data):
    """Scale every numeric, non-constant column to [0, 1] with MinMaxScaler.

    Non-numeric, boolean and zero-variance columns are copied unchanged.

    Args:
        data: pandas DataFrame.

    Returns:
        A new DataFrame; ``data`` is not modified.
    """
    scaled_data = data.copy()
    for column in data.columns:
        series = data[column]
        # is_numeric_dtype also covers int32/float32, which the old
        # ``dtype in [int, float]`` comparison silently skipped.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue
        if series.std() != 0:
            scaler = MinMaxScaler()
            # fit_transform returns shape (n, 1); flatten it for column assignment.
            scaled_data[column] = scaler.fit_transform(data[[column]]).ravel()
    return scaled_data

def robust_scaling(data):
    """Scale all columns with scikit-learn's RobustScaler.

    Args:
        data: all-numeric pandas DataFrame.

    Returns:
        A new DataFrame with the same columns and the same index as ``data``.
    """
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(data)
    # Keep the original index so the result stays row-aligned with the target
    # and with other frames after a shuffled train/test split.
    scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)
    return scaled_data

def one_hot_encoding(data, categorical_columns):
    """Replace the given categorical columns with one-hot indicator columns.

    Args:
        data: pandas DataFrame.
        categorical_columns: names of the columns to encode.

    Returns:
        A new DataFrame: the remaining columns of ``data`` followed by the
        indicator columns, on the same index as ``data``.
    """
    categorical_columns = list(categorical_columns)
    try:
        # scikit-learn >= 1.2 spells the dense-output flag ``sparse_output``.
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoded_values = encoder.fit_transform(data[categorical_columns])
    # get_feature_names was removed in scikit-learn 1.2 in favour of *_out.
    if hasattr(encoder, 'get_feature_names_out'):
        encoded_names = encoder.get_feature_names_out(categorical_columns)
    else:
        encoded_names = encoder.get_feature_names(categorical_columns)
    # Reuse data's index: concat aligns on index, and a fresh RangeIndex would
    # produce NaN-padded rows for any non-default index.
    encoded_cols = pd.DataFrame(encoded_values, columns=encoded_names, index=data.index)
    return pd.concat([data.drop(columns=categorical_columns), encoded_cols], axis=1)

def label_encoding(data, categorical_columns):
    """Replace each listed column with integer codes from LabelEncoder.

    Args:
        data: pandas DataFrame.
        categorical_columns: names of the columns to encode.

    Returns:
        An encoded copy; the caller's ``data`` is left unmodified.
    """
    # Work on a copy so the caller's frame is not overwritten in place.
    encoded = data.copy()
    for column in categorical_columns:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

def pca_reduction(data, n_components=2):
    """Project ``data`` onto its principal components.

    Args:
        data: numeric DataFrame or 2-D array.
        n_components: forwarded unchanged to ``sklearn.decomposition.PCA``.

    Returns:
        DataFrame with columns ``PC1 .. PCk``. It keeps ``data``'s index when
        ``data`` has one.
    """
    pca = PCA(n_components=n_components)
    reduced_data = pca.fit_transform(data)
    # Name columns from the actual output width, so n_components values that
    # are not integers (a variance ratio, None, 'mle') also work.
    component_names = [f'PC{i}' for i in range(1, reduced_data.shape[1] + 1)]
    return pd.DataFrame(reduced_data, columns=component_names, index=getattr(data, 'index', None))

# Function to suggest preprocessing techniques
def required_preprocessing(data):
    """List the preprocessing steps a dataset appears to need.

    Checks, in order: missing values, outliers (Tukey fences, 1.5 * IQR
    beyond the quartiles), skewness (absolute skew > 1) and the presence of
    non-numeric columns. Each suggestion appears at most once.

    Args:
        data: pandas DataFrame.

    Returns:
        List of human-readable suggestion strings (possibly empty).
    """
    preprocessing_techniques = []

    # Check for missing values
    if data.isnull().values.any():
        preprocessing_techniques.append("Handle missing values (e.g., mean imputation, median imputation)")

    numeric = data.select_dtypes(include='number')

    # Check for outliers. The old rule (max > 3*Q3 or min < 3*Q1) flagged
    # almost every positive column and added one message per column.
    if numeric.shape[1] > 0:
        first_quartile = numeric.quantile(0.25)
        third_quartile = numeric.quantile(0.75)
        iqr = third_quartile - first_quartile
        outside = numeric.lt(first_quartile - 1.5 * iqr) | numeric.gt(third_quartile + 1.5 * iqr)
        if outside.to_numpy().any():
            preprocessing_techniques.append("Handle outliers (e.g., Winsorization)")

    # Check for skewness
    skew_threshold = 1
    skewness = numeric.skew()
    if (skewness.abs() > skew_threshold).any():
        preprocessing_techniques.append("Handle skewness (e.g., log transformation, Box-Cox transformation)")

    # Check for categorical variables
    if data.select_dtypes(exclude='number').shape[1] > 0:
        preprocessing_techniques.append("Encode categorical variables (e.g., one-hot encoding, label encoding)")

    return preprocessing_techniques

# Get preprocessing suggestions
# Runs on the full frame as loaded, i.e. before any target column is split off.
required_techniques = required_preprocessing(dataset)


class AutoDataPreprocessing:
    """Inspect a DataFrame and suggest preprocessing steps.

    The class uses scikit-learn's transformer method names, but ``transform``
    is an identity and the ``handle_*`` / ``encode_*`` hooks are unimplemented
    placeholders that return None.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X).transform(X)``; ``y`` is ignored."""
        return self.fit(X).transform(X)

    def suggest_preprocessing(self, X, y=None):
        """List the preprocessing steps ``X`` appears to need.

        Checks, in order: missing values, outliers (z-score rule, see
        ``detect_outliers``), skewness (absolute skew > 1) and non-numeric
        columns. Each suggestion appears at most once.

        Args:
            X: pandas DataFrame of features.
            y: unused.

        Returns:
            List of human-readable suggestion strings (possibly empty).
        """
        preprocessing_techniques = []

        # Check for missing values
        if X.isnull().values.any():
            preprocessing_techniques.append("Handle missing values (e.g., mean imputation, median imputation)")

        # Check for outliers; one suggestion is enough however many columns
        # trigger it (previously the message was repeated per column).
        numerical_columns = X.select_dtypes(include=[np.number]).columns
        if any(self.detect_outliers(X[column]) for column in numerical_columns):
            preprocessing_techniques.append("Handle outliers (e.g., Winsorization)")

        # Check for skewness
        skew_threshold = 1
        skewness = X[numerical_columns].skew()
        skewed_columns = skewness[abs(skewness) > skew_threshold].index.tolist()
        if skewed_columns:
            preprocessing_techniques.append("Handle skewness (e.g., log transformation, Box-Cox transformation)")

        # Check for categorical variables
        categorical_columns = X.select_dtypes(exclude=[np.number]).columns
        if len(categorical_columns) > 0:
            preprocessing_techniques.append("Encode categorical variables (e.g., one-hot encoding)")

        return preprocessing_techniques

    def detect_outliers(self, series, threshold=3):
        """Return True when any value lies more than ``threshold`` standard deviations from the mean."""
        z_scores = (series - series.mean()) / series.std()
        return (z_scores.abs() > threshold).any()

    def handle_missing_values(self, X):
        """Placeholder: not implemented, returns None."""
        # Implement missing value handling techniques (e.g., mean imputation, median imputation)
        # Example:
        # imputer = SimpleImputer(strategy='mean')
        # X_filled = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
        # return X_filled
        pass

    def handle_outliers(self, X):
        """Placeholder: not implemented, returns None."""
        # Implement outlier handling techniques (e.g., Winsorization)
        pass

    def handle_skewness(self, X):
        """Placeholder: not implemented, returns None."""
        # Implement skewness handling techniques (e.g., log transformation, Box-Cox transformation)
        pass

    def encode_categorical_variables(self, X):
        """Placeholder: not implemented, returns None."""
        # Implement categorical variable encoding techniques (e.g., one-hot encoding)
        # Example (scikit-learn >= 1.2 names):
        # encoder = OneHotEncoder(sparse_output=False)
        # X_encoded = pd.DataFrame(encoder.fit_transform(X), columns=encoder.get_feature_names_out(X.columns))
        # return X_encoded
        pass

# Load dataset
dataset = pd.read_csv('dp_data1.csv')

# Column to predict. 'target_column' is a placeholder name: set it to a real
# column of dp_data1.csv before running this cell.
TARGET_COLUMN = 'target_column'
if TARGET_COLUMN not in dataset.columns:
    # Fail early with the list of valid names instead of an opaque KeyError
    # raised from inside drop().
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in dataset; "
        f"available columns: {list(dataset.columns)}"
    )

# Separate features and target variable
X = dataset.drop(columns=[TARGET_COLUMN])
y = dataset[TARGET_COLUMN]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize AutoDataPreprocessing object
preprocessor = AutoDataPreprocessing()

# Suggest preprocessing techniques (training rows only)
preprocessing_suggestions = preprocessor.suggest_preprocessing(X_train)

print("Suggested preprocessing techniques:")
for technique in preprocessing_suggestions:
    print("-", technique)


Preprocessing suggestions:
- Handle missing values (e.g., mean imputation, median imputation)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle outliers (e.g., Winsorization)
- Handle skewness (e.g., log transformation, Box-Cox transformation)
- Encode categorical variables (e.g., one-hot encoding, label encoding)


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import boxcox
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
# IterativeImputer is experimental: this enabling import must run before it is imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder

# Load dataset
# The relative path is resolved against the notebook's working directory.
dataset = pd.read_csv('dp_data1.csv')

def categorize_columns(dataset):
    """Split column names into numerical and categorical lists.

    Any numeric dtype (int32, float32, unsigned and nullable integers, ...)
    counts as numerical. Everything else, booleans included, is treated as
    categorical.

    Args:
        dataset: pandas DataFrame.

    Returns:
        ``(numerical_columns, categorical_columns)``, each in column order.
    """
    numerical_columns = []
    categorical_columns = []

    for column in dataset.columns:
        dtype = dataset[column].dtype
        # The old ``dtype in ['int64', 'float64']`` test sent every other
        # numeric width to the categorical list.
        is_number = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)
        if is_number:
            numerical_columns.append(column)
        else:
            categorical_columns.append(column)

    return numerical_columns, categorical_columns


def split_data(dataset, target_variable):
    """Split ``dataset`` into train/test predictors and target.

    Uses a 70/30 split with a fixed seed (42).

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as given by ``train_test_split``.
    """
    target = dataset[target_variable]
    predictors = dataset.drop(columns=[target_variable])
    return train_test_split(predictors, target, test_size=0.3, random_state=42)



# Missing values
def handle_missing_values(dataset):
    """Impute missing values, choosing an imputer per column group.

    * Numeric columns with a single distinct value: SimpleImputer (mean).
    * Numeric columns with several distinct values: IterativeImputer.
    * Non-numeric columns: filled with their most frequent value.

    Columns that are entirely missing are left as they are. Each group is
    imputed with its own imputer (previously the second imputer replaced the
    first and was applied to every column), and the input is not modified.

    Args:
        dataset: pandas DataFrame.

    Returns:
        ``dataset`` itself when nothing can or needs to be imputed, otherwise
        an imputed copy.
    """
    missing_mask = dataset.isnull()
    if not missing_mask.values.any():
        print("No missing values found")
        return dataset

    columns_with_missing = [col for col in dataset.columns if missing_mask[col].any()]
    numeric_missing = [col for col in columns_with_missing if pd.api.types.is_numeric_dtype(dataset[col])]
    univariate_columns = [col for col in numeric_missing if dataset[col].nunique() == 1]
    multivariate_columns = [col for col in numeric_missing if dataset[col].nunique() > 1]
    # nunique() > 0 skips all-missing columns, which have no most frequent value.
    other_columns = [
        col for col in columns_with_missing
        if col not in numeric_missing and dataset[col].nunique() > 0
    ]

    if not (univariate_columns or multivariate_columns or other_columns):
        print("No suitable imputer found.")
        return dataset

    result = dataset.copy()
    if univariate_columns:
        print("Using SimpleImputer for univariate columns:", univariate_columns)
        result[univariate_columns] = SimpleImputer(strategy='mean').fit_transform(result[univariate_columns])
    if multivariate_columns:
        print("Using IterativeImputer for multivariate columns:", multivariate_columns)
        result[multivariate_columns] = IterativeImputer().fit_transform(result[multivariate_columns])
    if other_columns:
        print("Filling non-numeric columns with their most frequent value:", other_columns)
        for col in other_columns:
            result[col] = result[col].fillna(result[col].mode().iloc[0])
    return result

# Outliers
def handle_outliers(dataset):
    """Winsorize, then log-transform, every numeric column.

    Non-numeric columns are passed through untouched: in the pipeline this
    step runs before categorical encoding, so string columns are still
    present and would otherwise crash the numeric transforms.

    Args:
        dataset: pandas DataFrame.

    Returns:
        A transformed copy of ``dataset``.

    Raises:
        ValueError: propagated from ``log_transform`` when a numeric column
            contains zero or negative values.
    """
    result = dataset.copy()
    numeric_columns = result.select_dtypes(include='number').columns
    if len(numeric_columns) == 0:
        return result
    result[numeric_columns] = result[numeric_columns].apply(winsorize, axis=0)
    result[numeric_columns] = result[numeric_columns].apply(log_transform, axis=0)
    return result

def winsorize(data, lower_percentile=5, upper_percentile=95):
    """Clip numeric values to their own column's percentile bounds.

    Each numeric column is clipped to its own ``lower_percentile`` and
    ``upper_percentile`` (the previous version used one pair of bounds
    computed over the whole flattened frame and did nothing for a Series).
    Non-numeric and boolean columns are returned unchanged. The input is
    never modified.

    Args:
        data: pandas DataFrame or Series.
        lower_percentile: lower bound, in percent (0-100).
        upper_percentile: upper bound, in percent (0-100).

    Returns:
        A clipped copy with the same type, index and columns as ``data``.
    """
    def _clip(series):
        # Percentiles are undefined for strings and meaningless for booleans.
        if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            return series
        lower_bound = series.quantile(lower_percentile / 100)
        upper_bound = series.quantile(upper_percentile / 100)
        return series.clip(lower=lower_bound, upper=upper_bound)

    if isinstance(data, pd.DataFrame):
        result = data.copy()
        for column in result.columns:
            result[column] = _clip(result[column])
        return result
    return _clip(data)

def log_transform(data):
    """Apply ``log(1 + x)`` element-wise to strictly positive data.

    Args:
        data: Series, DataFrame or array-like of numbers.

    Returns:
        ``np.log1p(data)``, same container type as the input.

    Raises:
        ValueError: if any value is zero or negative.
    """
    # np.any over the values: the built-in any() on a DataFrame iterates the
    # column labels, which made every DataFrame look "non-positive".
    if np.any(np.asarray(data) <= 0):
        raise ValueError("Log transform cannot be applied to non-positive values.")
    return np.log1p(data)

# Encoding categorical features
def encode_categorical_features(dataset):
    """One-hot encode every object-dtype column.

    Args:
        dataset: pandas DataFrame.

    Returns:
        ``dataset`` itself when it has no object columns; otherwise a new
        DataFrame with the object columns replaced by indicator columns, on
        the same index. The input frame is not modified.
    """
    categorical_columns = list(dataset.select_dtypes(include=['object']).columns)
    if not categorical_columns:
        print("No categorical features found.")
        return dataset

    try:
        # scikit-learn >= 1.2 spells the dense-output flag ``sparse_output``.
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    encoded_data = encoder.fit_transform(dataset[categorical_columns])
    # get_feature_names was removed in scikit-learn 1.2 in favour of *_out.
    if hasattr(encoder, 'get_feature_names_out'):
        encoded_names = encoder.get_feature_names_out(categorical_columns)
    else:
        encoded_names = encoder.get_feature_names(categorical_columns)
    # Reuse the input index so concat lines the rows up correctly.
    encoded_df = pd.DataFrame(encoded_data, columns=encoded_names, index=dataset.index)
    # drop() without inplace leaves the caller's frame intact.
    return pd.concat([dataset.drop(columns=categorical_columns), encoded_df], axis=1)

# Polynomial features
def add_polynomial_features(dataset, degree=2):
    """Expand ``dataset`` with polynomial feature terms up to ``degree``.

    Returns:
        Whatever ``PolynomialFeatures(degree=degree).fit_transform`` returns
        for ``dataset``.
    """
    expander = PolynomialFeatures(degree=degree)
    return expander.fit_transform(dataset)

# Normalization/Standardization/MinMaxScaler
def scale_features(dataset, scaler_type='standard'):
    """Scale ``dataset`` with StandardScaler or MinMaxScaler.

    Args:
        dataset: numeric 2-D data accepted by the scikit-learn scalers.
        scaler_type: ``'standard'`` or ``'minmax'``.

    Returns:
        The scaler's ``fit_transform`` output.

    Raises:
        ValueError: for any other ``scaler_type``.
    """
    # Reject unknown types first; tuple membership compares with ==.
    if scaler_type not in ('standard', 'minmax'):
        raise ValueError("Invalid scaler_type. Use 'standard' or 'minmax'.")
    scaler = StandardScaler() if scaler_type == 'standard' else MinMaxScaler()
    return scaler.fit_transform(dataset)

#PCA
def pca_reduction(data, n_components=2):
    """Project ``data`` onto its principal components.

    Args:
        data: numeric DataFrame or 2-D array.
        n_components: forwarded unchanged to ``sklearn.decomposition.PCA``.

    Returns:
        DataFrame with columns ``PC1 .. PCk``. It keeps ``data``'s index when
        ``data`` has one.
    """
    pca = PCA(n_components=n_components)
    reduced_data = pca.fit_transform(data)
    # Name columns from the actual output width, so n_components values that
    # are not integers (a variance ratio, None, 'mle') also work.
    component_names = [f'PC{i}' for i in range(1, reduced_data.shape[1] + 1)]
    return pd.DataFrame(reduced_data, columns=component_names, index=getattr(data, 'index', None))



# Function to suggest preprocessing techniques
def required_preprocessing(data):
    """List the preprocessing steps a dataset appears to need.

    Checks, in order: missing values, outliers (Tukey fences, 1.5 * IQR
    beyond the quartiles), skewness (absolute skew > 1) and the presence of
    non-numeric columns. Each suggestion appears at most once.

    Args:
        data: pandas DataFrame.

    Returns:
        List of human-readable suggestion strings (possibly empty).
    """
    preprocessing_techniques = []

    # Check for missing values
    if data.isnull().values.any():
        preprocessing_techniques.append("Handle missing values (e.g., mean imputation, median imputation)")

    numeric = data.select_dtypes(include='number')

    # Check for outliers. The old rule (max > 3*Q3 or min < 3*Q1) flagged
    # almost every positive column and added one message per column.
    if numeric.shape[1] > 0:
        first_quartile = numeric.quantile(0.25)
        third_quartile = numeric.quantile(0.75)
        iqr = third_quartile - first_quartile
        outside = numeric.lt(first_quartile - 1.5 * iqr) | numeric.gt(third_quartile + 1.5 * iqr)
        if outside.to_numpy().any():
            preprocessing_techniques.append("Handle outliers (e.g., Winsorization)")

    # Check for skewness
    skew_threshold = 1
    skewness = numeric.skew()
    if (skewness.abs() > skew_threshold).any():
        preprocessing_techniques.append("Handle skewness (e.g., log transformation, Box-Cox transformation)")

    # Check for categorical variables
    if data.select_dtypes(exclude='number').shape[1] > 0:
        preprocessing_techniques.append("Encode categorical variables (e.g., one-hot encoding, label encoding)")

    return preprocessing_techniques

# Get preprocessing suggestions
# The suggestions are stored but not consulted below: the pipeline always runs
# every step.
required_techniques = required_preprocessing(dataset)

# Define the preprocessing steps
# Order matters: each FunctionTransformer receives the previous step's output.
# Outlier handling runs before categorical encoding, and the polynomial,
# scaling and PCA steps come after encoding.
preprocessing_steps = [
    ('missing_values', FunctionTransformer(handle_missing_values)),
    ('outliers', FunctionTransformer(handle_outliers)),
    ('categorical_encoding', FunctionTransformer(encode_categorical_features)),
    ('polynomial_features', FunctionTransformer(add_polynomial_features)),
    ('scaling', FunctionTransformer(scale_features)),
    ('pca', FunctionTransformer(pca_reduction))
]

# Create the preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=preprocessing_steps)

# Apply the preprocessing pipeline to the dataset
# The whole frame is passed in as loaded; no target column is separated first.
preprocessed_data = preprocessing_pipeline.fit_transform(dataset)

# Print the shape of the preprocessed data
print("Shape of preprocessed data:", preprocessed_data.shape)
