In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
class Duplicates:
    """Pipeline-compatible stage that removes exact duplicate rows.

    Parameters
    ----------
    duplicates : bool, default True
        Any truthy value enables de-duplication; a falsy value turns the
        stage into a pass-through.

    Attributes
    ----------
    count_duplicates : int
        Number of rows removed by the most recent :meth:`handle` call that
        actually ran de-duplication.
    """

    def __init__(self, duplicates=True):
        self.duplicates = duplicates

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the stage chains in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` (a pandas DataFrame) with duplicate rows removed.

        Delegates to :meth:`handle` so that using the stage inside a
        scikit-learn ``Pipeline`` really applies it instead of silently
        passing the data through.
        """
        return self.handle(X)

    def handle(self, df):
        """Drop duplicate rows from ``df`` and renumber the index from 0.

        The input frame is left untouched; a new frame is returned. When the
        stage is disabled the input object itself is returned.
        """
        if not self.duplicates:
            return df
        rows_before = len(df)
        # ignore_index=True already yields a fresh 0..n-1 index.
        df = df.drop_duplicates(ignore_index=True)
        self.count_duplicates = rows_before - len(df)
        return df


class MissingValues:
    """Pipeline-compatible stage that imputes or deletes missing values.

    Parameters
    ----------
    missing_num : str or None
        Strategy for numeric columns: ``'auto'`` (linear regression followed
        by KNN for whatever is left), ``'linreg'``, ``'knn'``, ``'mean'``,
        ``'median'``, ``'most_frequent'`` or ``'delete'``.
    missing_categ : str or None
        Strategy for non-numeric columns: ``'auto'`` (logistic regression
        followed by KNN), ``'logreg'``, ``'knn'``, ``'most_frequent'`` or
        ``'delete'``.

    Any other truthy value selects no strategy for that column group.

    Attributes
    ----------
    count_missing : int
        Total number of missing cells seen by the last :meth:`handle` call
        for which at least one strategy was configured.
    """

    def __init__(self, missing_num=None, missing_categ=None):
        self.missing_num = missing_num
        self.missing_categ = missing_categ

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the stage chains in a Pipeline."""
        return self

    def transform(self, X):
        """Apply :meth:`handle` to ``X`` (a pandas DataFrame).

        Delegating here is what makes the stage effective inside a
        scikit-learn ``Pipeline``.
        """
        return self.handle(X)

    def handle(self, df, _n_neighbors=5):
        """Return ``df`` with missing values imputed or deleted.

        Parameters
        ----------
        df : pandas.DataFrame
        _n_neighbors : int, default 5
            Neighbour count handed to ``KNNImputer`` for the KNN strategies.

        The input object itself is returned when no strategy is configured
        or when the frame has no missing cells.
        """
        if not (self.missing_num or self.missing_categ):
            return df
        self.count_missing = df.isna().sum().sum()
        if self.count_missing == 0:
            return df

        # Completely empty rows cannot be imputed from anything. The index is
        # renumbered so later positional results line up with the frame.
        df = df.dropna(how='all').reset_index(drop=True)

        num_mode = self.missing_num
        if num_mode == 'auto':
            # Regression only fills rows whose other columns are complete;
            # KNN then takes care of the remainder.
            df = self._lin_regression_impute(df, LinearRegression())
            df = self._impute(df, KNNImputer(n_neighbors=_n_neighbors), type='num')
        elif num_mode == 'linreg':
            df = self._lin_regression_impute(df, LinearRegression())
        elif num_mode == 'knn':
            df = self._impute(df, KNNImputer(n_neighbors=_n_neighbors), type='num')
        elif num_mode in ('mean', 'median', 'most_frequent'):
            df = self._impute(df, SimpleImputer(strategy=num_mode), type='num')
        elif num_mode == 'delete':
            df = self._delete(df, type='num')

        categ_mode = self.missing_categ
        if categ_mode == 'auto':
            df = self._log_regression_impute(df, LogisticRegression())
            df = self._impute(df, KNNImputer(n_neighbors=_n_neighbors), type='categ')
        elif categ_mode == 'logreg':
            df = self._log_regression_impute(df, LogisticRegression())
        elif categ_mode == 'knn':
            df = self._impute(df, KNNImputer(n_neighbors=_n_neighbors), type='categ')
        elif categ_mode == 'most_frequent':
            df = self._impute(df, SimpleImputer(strategy=categ_mode), type='categ')
        elif categ_mode == 'delete':
            df = self._delete(df, type='categ')
        return df

    @staticmethod
    def _warn_skipped(feature, err):
        """Report a column whose imputation failed instead of hiding it."""
        import warnings
        warnings.warn(
            'Imputation skipped for feature "{}": {}'.format(feature, err),
            RuntimeWarning,
            stacklevel=3,
        )

    @staticmethod
    def _impute_numeric_column(column, imputer):
        """Return ``column`` with its gaps filled by ``imputer``."""
        values = column.to_numpy(dtype=float, na_value=np.nan).reshape(-1, 1)
        # Re-attach the original index so the assignment cannot misalign.
        filled = pd.Series(imputer.fit_transform(values)[:, 0], index=column.index)
        # A column holding only whole numbers stays integral after imputing.
        if (column.fillna(-9999) % 1 == 0).all():
            filled = filled.round().astype('Int64')
        return filled

    @staticmethod
    def _impute_categorical_column(column, imputer):
        """Return ``column`` with gaps filled, imputing on integer codes."""
        to_code = {value: code for code, value in enumerate(column.dropna().unique())}
        codes = column.map(to_code)  # missing entries stay NaN
        values = codes.to_numpy(dtype=float, na_value=np.nan).reshape(-1, 1)
        filled = pd.Series(imputer.fit_transform(values)[:, 0], index=column.index)
        filled = filled.round().astype('Int64')
        from_code = {code: value for value, code in to_code.items()}
        return filled.map(from_code)

    def _impute(self, df, imputer, type):
        """Fill gaps column by column with a scikit-learn style imputer.

        ``type`` is ``'num'`` for numeric columns; any other value targets
        the non-numeric columns. A column that cannot be imputed is left
        exactly as it was and a ``RuntimeWarning`` is emitted.
        """
        cols_num = df.select_dtypes(include=np.number).columns
        numeric = type == 'num'
        targets = [c for c in df.columns if (c in cols_num) == numeric]
        for feature in targets:
            column = df[feature]
            if not column.isna().any():
                continue
            try:
                if numeric:
                    df[feature] = MissingValues._impute_numeric_column(column, imputer)
                else:
                    df[feature] = MissingValues._impute_categorical_column(column, imputer)
            except Exception as err:  # best effort: keep the column untouched
                MissingValues._warn_skipped(feature, err)
        return df

    @staticmethod
    def _encode_categoricals(df, cols_num):
        """Replace non-numeric columns by integer codes (NaN is preserved).

        Returns the frame and a ``{column: {code: original value}}`` dict
        used to undo the encoding.
        """
        decoders = {}
        for feature in df.columns:
            if feature in cols_num:
                continue
            to_code = {value: code for code, value in enumerate(df[feature].dropna().unique())}
            decoders[feature] = {code: value for value, code in to_code.items()}
            df[feature] = df[feature].map(to_code)
        return df, decoders

    @staticmethod
    def _decode_categoricals(df, decoders):
        """Map integer codes back to the original category values."""
        for feature, from_code in decoders.items():
            df[feature] = df[feature].map(from_code)
        return df

    @staticmethod
    def _split_for_regression(df, feature):
        """Split rows into (known, missing) for ``feature``.

        Only rows whose *other* columns are all present are used, because
        the regressors cannot work with incomplete predictors.
        """
        others = [c for c in df.columns if c != feature]
        complete = df.dropna(subset=others)
        is_missing = complete[feature].isna()
        return complete[~is_missing], complete[is_missing], others

    def _lin_regression_impute(self, df, model):
        """Predict missing numeric values from all other columns.

        ``model`` is a regressor wrapped in ``StandardScaler -> model``. The
        target is fitted on a log scale first; when scikit-learn rejects the
        log target (non-positive values produce NaN/inf) the raw target is
        used instead.
        """
        cols_num = list(df.select_dtypes(include=np.number).columns)
        df, decoders = MissingValues._encode_categoricals(df, cols_num)
        for feature in cols_num:
            try:
                known, missing, others = MissingValues._split_for_regression(df, feature)
                if missing.empty:
                    continue
                pipe = make_pipeline(StandardScaler(), model)
                target = known[feature]
                used_log = True
                try:
                    pipe.fit(known[others], np.log(target))
                except ValueError:
                    used_log = False
                    pipe.fit(known[others], target)
                pred = pipe.predict(missing[others])
                if used_log:
                    pred = np.exp(pred)
                # Keep whole-number columns whole.
                if (df[feature].fillna(-9999) % 1 == 0).all():
                    pred = np.round(pred)
                df.loc[missing.index, feature] = pred
            except Exception as err:  # best effort: keep the column untouched
                MissingValues._warn_skipped(feature, err)
        return MissingValues._decode_categoricals(df, decoders)

    def _log_regression_impute(self, df, model):
        """Predict missing categorical values from all other columns.

        Non-numeric columns are integer-encoded, each one is predicted with
        ``StandardScaler -> model`` (a classifier) and the codes are mapped
        back to the original categories afterwards.
        """
        cols_num = list(df.select_dtypes(include=np.number).columns)
        df, decoders = MissingValues._encode_categoricals(df, cols_num)
        for feature in decoders:
            try:
                known, missing, others = MissingValues._split_for_regression(df, feature)
                if missing.empty:
                    continue
                pipe = make_pipeline(StandardScaler(), model)
                pipe.fit(known[others], known[feature].astype('int'))
                # Predictions are class codes and are used as they are.
                df.loc[missing.index, feature] = pipe.predict(missing[others])
            except Exception as err:  # best effort: keep the column untouched
                MissingValues._warn_skipped(feature, err)
        return MissingValues._decode_categoricals(df, decoders)

    def _delete(self, df, type):
        """Drop rows with a gap in numeric (``'num'``) or other columns.

        The returned frame has a fresh 0..n-1 index.
        """
        cols_num = df.select_dtypes(include=np.number).columns
        numeric = type == 'num'
        subset = [c for c in df.columns if (c in cols_num) == numeric]
        if subset:
            df = df.dropna(subset=subset)
        return df.reset_index(drop=True)


from scipy.stats import zscore
class Outliers:
    """Pipeline-compatible stage that removes rows containing outliers.

    Parameters
    ----------
    method : {'zscore', 'iqr', 'manual'} or None
        Detection rule. Any other value leaves the data untouched.
    threshold : float or None
        Cut-off for the rule. ``None`` selects 3.0 for ``'zscore'`` (absolute
        z-score) and 1.5 for ``'iqr'`` (multiple of the inter-quartile range).
    manual_dict : dict or None
        For ``'manual'``: ``{column: iterable of values}``; rows holding one
        of the listed values in that column are removed.

    Attributes
    ----------
    count_outliers : int
        Number of outlying cells flagged by the last :meth:`handle` call
        that had a method configured.

    Notes
    -----
    Columns are processed one after another on the already-filtered frame.
    Missing values are never treated as outliers, so their rows are kept.
    The original row labels are preserved.
    """

    def __init__(self, method=None, threshold=None, manual_dict=None):
        self.method = method
        self.threshold = threshold
        self.manual_dict = manual_dict

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the stage chains in a Pipeline."""
        return self

    def transform(self, X):
        """Apply :meth:`handle` to ``X`` (a pandas DataFrame)."""
        return self.handle(X)

    def handle(self, df):
        """Return ``df`` without the rows flagged by the configured method."""
        if not self.method:
            return df
        self.count_outliers = 0
        if self.method == 'zscore':
            df, self.count_outliers = self._zscore_outliers(df)
        elif self.method == 'iqr':
            df, self.count_outliers = self._iqr_outliers(df)
        elif self.method == 'manual':
            df, self.count_outliers = self._manual_outliers(df)
        return df

    def _zscore_outliers(self, df):
        """Drop rows whose absolute z-score exceeds the threshold."""
        limit = getattr(self, 'threshold', None)
        if limit is None:
            limit = 3.0
        outlier_count = 0
        for feature in df.select_dtypes(include=np.number).columns:
            values = df[feature].to_numpy(dtype=float, na_value=np.nan)
            if values.size == 0:
                continue
            # nan_policy='omit' keeps one gap from turning every score into NaN.
            scores = np.abs(np.asarray(zscore(values, nan_policy='omit'), dtype=float))
            flagged = scores > limit  # NaN scores compare False and are kept
            outlier_count += int(flagged.sum())
            df = df[~flagged]
        return df, outlier_count

    def _iqr_outliers(self, df):
        """Drop rows outside ``[q1 - k*iqr, q3 + k*iqr]`` for any numeric column."""
        multiplier = getattr(self, 'threshold', None)
        if multiplier is None:
            multiplier = 1.5
        outlier_count = 0
        for feature in df.select_dtypes(include=np.number).columns:
            column = df[feature]
            q1 = column.quantile(0.25)
            q3 = column.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - (multiplier * iqr)
            upper_bound = q3 + (multiplier * iqr)
            flagged = (column < lower_bound) | (column > upper_bound)
            # Nullable dtypes yield <NA> for gaps; a gap is not an outlier.
            flagged = flagged.fillna(False).astype(bool)
            outlier_count += int(flagged.sum())
            df = df[~flagged]
        return df, outlier_count

    def _manual_outliers(self, df):
        """Drop rows holding a value listed in ``manual_dict`` for a column."""
        rules = getattr(self, 'manual_dict', None) or {}
        outlier_count = 0
        for feature, values in rules.items():
            if feature not in df.columns:
                continue
            flagged = df[feature].isin(values)
            outlier_count += int(flagged.sum())
            df = df[~flagged]
        return df, outlier_count


class Adjust:
    """Pipeline-compatible stage for datetime extraction, rounding and scaling.

    Parameters
    ----------
    scaler : {'minmax', 'standard', 'robust', 'maxabs', 'quantile', 'power'} or None
        Scaler applied to the numeric columns. A falsy value disables scaling.
    extract_datetime : str or bool, default False
        Finest datetime component to extract: ``'M'`` (day, month), ``'Y'``
        (+ year), ``'h'`` (+ hour), ``'m'`` (+ minute), ``'s'`` or ``'auto'``
        (+ second). Any other truthy value extracts the day only.
    round_values : bool, default False
        Cast whole-number columns to ``Int64`` and normalise float columns.
    """

    # Output column name -> ``Series.dt`` attribute, from coarsest request up.
    _DATETIME_PARTS = (
        ('Day', 'day'),
        ('Month', 'month'),
        ('Year', 'year'),
        ('Hour', 'hour'),
        ('Minute', 'minute'),
        ('Sec', 'second'),
    )
    # How many of the parts above each granularity code asks for.
    _DATETIME_DEPTH = {'M': 2, 'Y': 3, 'h': 4, 'm': 5, 's': 6, 'auto': 6}
    # Scaler name -> class name inside ``sklearn.preprocessing``.
    _SCALERS = {
        'minmax': 'MinMaxScaler',
        'standard': 'StandardScaler',
        'robust': 'RobustScaler',
        'maxabs': 'MaxAbsScaler',
        'quantile': 'QuantileTransformer',
        'power': 'PowerTransformer',
    }

    def __init__(self, scaler=None, extract_datetime=False, round_values=False):
        self.scaler = scaler
        self.extract_datetime = extract_datetime
        self.round_values = round_values

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the stage chains in a Pipeline."""
        return self

    def transform(self, X):
        """Apply :meth:`handle` to ``X`` (a pandas DataFrame)."""
        return self.handle(X)

    def handle(self, df):
        """Run datetime extraction, rounding and scaling, in that order.

        Columns are added to / replaced on ``df`` itself, which is also
        returned.

        Raises
        ------
        ValueError
            If ``scaler`` is set to a name that is not supported.
        """
        scaler = None
        if self.scaler:
            try:
                scaler_class = Adjust._SCALERS[self.scaler]
            except (KeyError, TypeError):
                raise ValueError(
                    'Unsupported scaler {!r}; expected one of {}.'.format(
                        self.scaler, sorted(Adjust._SCALERS))
                ) from None
            scaler = getattr(preprocessing, scaler_class)()
        if self.extract_datetime:
            df = self._convert_datetime(df)
        if self.round_values:
            df = self._round_values(df)
        if scaler is not None:
            # Scalers only understand numbers; text and datetime columns
            # are left as they are.
            cols_num = list(df.select_dtypes(include=np.number).columns)
            if cols_num and len(df):
                scaled = np.asarray(scaler.fit_transform(df[cols_num].astype('float64')))
                for position, feature in enumerate(cols_num):
                    df[feature] = scaled[:, position]
        return df

    def _convert_datetime(self, df):
        """Parse datetime-like columns and add their components as columns.

        Every non-numeric column that pandas can parse is converted to a
        datetime column. The components are written to fixed column names
        (``Day``, ``Month``, ...), so with several datetime columns the last
        one in frame order determines their content.
        """
        granularity = self.extract_datetime
        depth = 1
        if isinstance(granularity, str):
            depth = Adjust._DATETIME_DEPTH.get(granularity, 1)
        cols_num = df.select_dtypes(include=np.number).columns
        candidates = [c for c in df.columns if c not in cols_num]
        for feature in candidates:
            try:
                parsed = pd.to_datetime(df[feature])
            except (ValueError, TypeError, OverflowError):
                continue  # not a datetime column
            df[feature] = parsed
            if not pd.api.types.is_datetime64_any_dtype(parsed):
                continue  # parsed to plain objects; no ``.dt`` accessor
            for label, attribute in Adjust._DATETIME_PARTS[:depth]:
                df[label] = getattr(parsed.dt, attribute)
        return df

    @staticmethod
    def _count_decimals(value):
        """Return the number of digits after the decimal point of ``value``."""
        # Positional formatting avoids scientific notation such as '1e-05',
        # which has no decimal point to count from.
        text = np.format_float_positional(value, trim='-')
        return len(text.partition('.')[2])

    def _round_values(self, df):
        """Cast whole-number columns to ``Int64`` and tidy float columns.

        Float columns are rounded to the largest number of decimals found
        among their own finite values, so no information is lost.
        """
        for feature in df.select_dtypes(include=np.number).columns:
            column = df[feature]
            if (column.fillna(-9999) % 1 == 0).all():
                try:
                    df[feature] = column.astype('Int64')
                except (TypeError, ValueError, OverflowError):
                    pass  # e.g. values beyond the int64 range: keep as is
                continue
            try:
                as_float = column.astype(float)
            except (TypeError, ValueError):
                continue
            finite = as_float[np.isfinite(as_float)]
            decimals = max((Adjust._count_decimals(v) for v in finite), default=0)
            df[feature] = as_float.round(decimals=decimals)
        return df


class EncodeCateg:
    """Pipeline-compatible stage that encodes categorical columns.

    Parameters
    ----------
    encode_categ : list, bool or None
        ``[strategy]`` encodes every non-numeric column; ``[strategy, cols]``
        restricts encoding to ``cols`` (column names or positions).
        ``strategy`` is ``'onehot'``, ``'label'`` or ``'auto'`` (one-hot up
        to 10 distinct values, label encoding up to 20, nothing beyond).
        Any other truthy value behaves like ``['auto']``; a falsy value
        disables the stage.

    Notes
    -----
    Encoded columns are appended; the source columns are kept. Columns that
    pandas can parse as datetimes, and numeric columns, are never encoded.
    """

    def __init__(self, encode_categ=None):
        self.encode_categ = encode_categ

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the stage chains in a Pipeline."""
        return self

    def transform(self, X):
        """Apply :meth:`handle` to ``X`` (a pandas DataFrame)."""
        return self.handle(X)

    def handle(self, df):
        """Return ``df`` extended by the encoded columns.

        A column whose encoding fails is skipped with a ``RuntimeWarning``.
        """
        if not self.encode_categ:
            return df
        import warnings

        spec = self.encode_categ if isinstance(self.encode_categ, list) else ['auto']
        strategy = spec[0]
        cols_num = df.select_dtypes(include=np.number).columns
        # A list (not a set) keeps the output column order reproducible.
        cols_categ = [c for c in df.columns if c not in cols_num]
        target_cols = cols_categ if len(spec) == 1 else spec[1]

        for feature in target_cols:
            if feature in cols_categ:
                name = feature
            elif isinstance(feature, (int, np.integer)) and not isinstance(feature, bool):
                name = df.columns[feature]  # positional reference
            else:
                name = feature
            if name not in cols_categ:
                continue  # numeric or unknown column: nothing to encode

            try:
                pd.to_datetime(df[name])
            except (ValueError, TypeError, OverflowError):
                pass  # not a datetime column, so go on and encode it
            else:
                continue  # datetime columns are not encoded

            try:
                if strategy == 'auto':
                    distinct = df[name].nunique()
                    if distinct <= 10:
                        df = self._to_onehot(df, name)
                    elif distinct <= 20:
                        df = self._to_label(df, name)
                elif strategy == 'onehot':
                    df = self._to_onehot(df, name)
                elif strategy == 'label':
                    df = self._to_label(df, name)
            except Exception as err:  # best effort: skip just this column
                warnings.warn(
                    'Encoding skipped for feature "{}": {}'.format(name, err),
                    RuntimeWarning,
                    stacklevel=2,
                )
        return df

    def _to_onehot(self, df, feature, limit=10):
        """Append one indicator column per category of ``feature``.

        A hint is printed when more than ``limit`` columns are created.
        """
        one_hot = pd.get_dummies(df[feature], prefix=feature)
        if one_hot.shape[1] > limit:
            print('ONEHOT encoding for feature "{}" creates {} new features. Consider LABEL encoding instead.'.format(feature, one_hot.shape[1]))
        df = df.join(one_hot)
        return df

    def _to_label(self, df, feature):
        """Append ``<feature>_lab`` holding integer codes of the categories.

        Codes follow the sorted order of the distinct values. Missing values
        stay missing in the label column instead of becoming a category.
        """
        codes, _ = pd.factorize(df[feature], sort=True)
        labels = pd.Series(codes, index=df.index).astype('int64')
        missing = codes < 0  # factorize marks missing values with -1
        if missing.any():
            labels = labels.astype('Int64').mask(missing)
        df[feature + '_lab'] = labels
        return df

In [None]:
class CleanData:
    """Front end that runs the cleaning stages over one DataFrame.

    Parameters
    ----------
    input_data : pandas.DataFrame
        The data to clean. It is never modified.
    mode : str, default 'auto'
        With ``'auto'`` every option below is replaced by its automatic
        setting; with any other value the options are used as passed.
    duplicates, missing_num, missing_categ, encode_categ, extract_datetime, outliers
        Options forwarded to ``Duplicates``, ``MissingValues``,
        ``EncodeCateg``, ``Adjust`` and ``Outliers`` respectively.
    outlier_param : float, default 1.5
        Threshold handed to the outlier stage.
    """

    def __init__(self, input_data, mode='auto', duplicates=False, missing_num=False, missing_categ=False,
                 encode_categ=False, extract_datetime=False, outliers=False, outlier_param=1.5):
        self.input_data = input_data
        # Only the automatic mode overrides what the caller asked for.
        if mode == 'auto':
            duplicates, missing_num, missing_categ, outliers, encode_categ, extract_datetime = 'auto', 'auto', 'auto', 'winz', ['auto'], 's'
        self.mode = mode
        self.duplicates = duplicates
        self.missing_num = missing_num
        self.missing_categ = missing_categ
        self.outliers = outliers
        self.encode_categ = encode_categ
        self.extract_datetime = extract_datetime
        self.outlier_param = outlier_param

    def _clean_data(self, input_data):
        """Run all stages on ``input_data`` and return the cleaned copy.

        ``input_data`` defaults to the frame given at construction time when
        ``None`` is passed. Order: duplicates, missing values, outliers,
        datetime extraction, categorical encoding, rounding.

        Each stage is built as its own object because the stage methods read
        attributes (``method``, ``threshold``, ``scaler``, ...) and call
        helpers that exist only on the stage classes.
        """
        source = self.input_data if input_data is None else input_data
        # Work on a private copy so the caller's frame stays untouched.
        df = source.copy().reset_index(drop=True)

        df = Duplicates(duplicates=self.duplicates).handle(df)
        df = MissingValues(missing_num=self.missing_num,
                           missing_categ=self.missing_categ).handle(df)

        # Outliers.handle ignores method names it does not implement, which
        # includes the 'winz' value selected by the automatic mode.
        outlier_stage = Outliers(method=self.outliers)
        outlier_stage.threshold = self.outlier_param
        df = outlier_stage.handle(df)

        adjust_stage = Adjust(extract_datetime=self.extract_datetime, round_values=True)
        if self.extract_datetime:
            df = adjust_stage._convert_datetime(df)
        df = EncodeCateg(encode_categ=self.encode_categ).handle(df)
        df = adjust_stage._round_values(df)
        return df

In [None]:
# Load the raw data set from disk.
dataset = pd.read_csv('dp_data1.csv')

# Configure the cleaner; every stage is switched on.
cleaner = CleanData(
    input_data=dataset,
    mode='auto',
    duplicates=True,
    missing_num=True,
    missing_categ=True,
    outliers=True,
    encode_categ=True,
    extract_datetime=True,
)

# Run the cleaning stages and keep the result.
cleaned_data = cleaner._clean_data(dataset)

AttributeError: 'CleanData' object has no attribute '_lin_regression_impute'

In [None]:
# Load the raw CSV; the bare name displays the frame when run in a notebook.
dataset = pd.read_csv('dp_data1.csv')
dataset

# Show every column next to its position so columns can be picked by number.
print("Column indexes:")
for i, column in enumerate(dataset.columns):
    print("{} : {}".format(i, column))

# Ask for the target position and a comma-separated list of feature positions.
target_index = int(input("Enter the index of the target variable: "))
feature_indexes_str = input("Enter the indexes of the features (comma-separated): ")
feature_indexes = list(map(int, map(str.strip, feature_indexes_str.split(','))))

# Translate the positions into column labels.
target_variable = dataset.columns[target_index]
features = [dataset.columns[position] for position in feature_indexes]
print()

# Keep only the chosen features, with the target as the last column.
data = dataset[[*features, target_variable]]
data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Feature_1   1000 non-null   float64
 1   Feature_2   1000 non-null   float64
 2   Feature_3   1000 non-null   float64
 3   Feature_4   1000 non-null   int64  
 4   Feature_5   1000 non-null   int64  
 5   Feature_6   1000 non-null   object 
 6   Feature_7   1000 non-null   int64  
 7   Feature_8   1000 non-null   float64
 8   Feature_9   1000 non-null   float64
 9   Feature_10  1000 non-null   float64
 10  Feature_11  1000 non-null   int64  
 11  Feature_12  1000 non-null   float64
 12  Feature_13  1000 non-null   float64
 13  Feature_14  1000 non-null   float64
 14  Feature_15  1000 non-null   int64  
 15  Feature_16  1000 non-null   float64
 16  Feature_17  782 non-null    float64
 17  Feature_18  1000 non-null   float64
 18  Feature_19  746 non-null    object 
 19  Feature_20  1000 non-null   

In [None]:
from sklearn.pipeline import Pipeline

# Chain the cleaning stages; they are applied in the order listed.
preprocessing_pipeline = Pipeline(
    steps=[
        ('missing_values', MissingValues(missing_num='knn', missing_categ='logreg')),
        ('outliers', Outliers(method='zscore')),
        ('adjust', Adjust(scaler='standard', extract_datetime=True, round_values=True)),
        ('encode_categ', EncodeCateg(encode_categ=['auto'])),
        ('duplicates', Duplicates()),
    ]
)

# Fit every stage on the raw data, transform it, then summarise the result.
cleaned_dataset = preprocessing_pipeline.fit_transform(dataset)
cleaned_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Feature_1   1000 non-null   float64
 1   Feature_2   1000 non-null   float64
 2   Feature_3   1000 non-null   float64
 3   Feature_4   1000 non-null   int64  
 4   Feature_5   1000 non-null   int64  
 5   Feature_6   1000 non-null   object 
 6   Feature_7   1000 non-null   int64  
 7   Feature_8   1000 non-null   float64
 8   Feature_9   1000 non-null   float64
 9   Feature_10  1000 non-null   float64
 10  Feature_11  1000 non-null   int64  
 11  Feature_12  1000 non-null   float64
 12  Feature_13  1000 non-null   float64
 13  Feature_14  1000 non-null   float64
 14  Feature_15  1000 non-null   int64  
 15  Feature_16  1000 non-null   float64
 16  Feature_17  782 non-null    float64
 17  Feature_18  1000 non-null   float64
 18  Feature_19  746 non-null    object 
 19  Feature_20  1000 non-null   

In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# ML models: ready-to-fit estimator instances, because cross_val_score clones
# and fits the object it is given and cannot work with a bare class.
regression_models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()]
classification_models = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier()]

# Problem selection: a numeric target means regression, a text, categorical or
# boolean target means classification.
target_dtype = data[target_variable].dtype
target_is_bool = pd.api.types.is_bool_dtype(target_dtype)

models = None
if pd.api.types.is_numeric_dtype(target_dtype) and not target_is_bool:
    models = regression_models
elif (
    target_is_bool
    or pd.api.types.is_object_dtype(target_dtype)
    or pd.api.types.is_string_dtype(target_dtype)
    or isinstance(target_dtype, pd.CategoricalDtype)
):
    # The removed ``np.object`` alias is no longer needed for this check.
    models = classification_models
else:
    raise ValueError("Unsupported target variable type. Please ensure the target variable is numeric or categorical.")

In [None]:
# Compare the candidates by their mean 5-fold cross-validation score and keep
# the best one (higher is better for the default scorers).
best_model = None
best_score = -float('inf')

# The split into predictors and target is the same for every candidate.
cv_features = cleaned_dataset.drop(columns=[target_variable])
cv_target = cleaned_dataset[target_variable]

for model in models:
    # Accept both estimator classes and ready-made instances.
    estimator = model() if isinstance(model, type) else model
    scores = cross_val_score(estimator, cv_features, cv_target, cv=5)
    avg_score = scores.mean()
    if avg_score > best_score:
        best_score = avg_score
        best_model = estimator

if best_model is None:
    # Happens when the candidate list is empty or every score is NaN.
    raise RuntimeError("No candidate model produced a valid cross-validation score.")

print(f"The best model is {type(best_model).__name__} with an average cross-validation score of {best_score:.2f}.")

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

# Create features representing characteristics of the dataset and models
# NOTE(review): the first four entries are computed per row (axis=1) while the
# last one is computed over all of X; np.column_stack needs columns of equal
# length, so confirm the type and shape of X before relying on this.
features = np.column_stack([
    X.mean(axis=1),
    X.var(axis=1),
    X.min(axis=1),
    X.max(axis=1),
    X.mean() / X.var(),
])

# Create labels representing the performance of each model
# For regression, use mean squared error; for classification, use accuracy
# NOTE(review): every score below is measured on the data the model was fitted
# on, which favours flexible models such as decision trees.

# Polynomial regression = linear regression on degree-2 polynomial features.
# The error has to be computed on that model's predictions, not on the
# expanded feature matrix itself.
X_poly = PolynomialFeatures(degree=2).fit_transform(X)

labels_regression = np.array([
    mean_squared_error(y_regression, LinearRegression().fit(X, y_regression).predict(X)),
    mean_squared_error(y_regression, LinearRegression().fit(X_poly, y_regression).predict(X_poly)),
    mean_squared_error(y_regression, DecisionTreeRegressor().fit(X, y_regression).predict(X)),
])

labels_classification = np.array([
    accuracy_score(y_classification, LogisticRegression().fit(X, y_classification).predict(X)),
    accuracy_score(y_classification, DecisionTreeClassifier().fit(X, y_classification).predict(X)),
])

# Split data for training and testing
# NOTE(review): `features` has one row per row of X, whereas
# `labels_regression` holds three values (one per model); train_test_split
# requires both to have the same number of samples - TODO confirm the intended
# pairing of meta-features and labels.
X_train, X_test, y_train, y_test = train_test_split(features, labels_regression, test_size=0.2, random_state=42)

# Build a simple neural network
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(features.shape[1],)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=8, validation_data=(X_test, y_test))

# Use the trained neural network to predict the best model for a new dataset
new_data = np.array([[X.mean(), X.var(), X.min(), X.max(), X.mean() / X.var()]])
predicted_performance = model.predict(new_data)

# Identify the model with the lowest predicted performance (mean squared error)
# NOTE(review): the network ends in Dense(1) and is given a single row, so the
# prediction appears to hold one value only and argmin would always be 0 -
# verify how a per-model prediction is supposed to be obtained.
best_model_index = np.argmin(predicted_performance)
best_model = ["Linear Regression", "Polynomial Regression", "Decision Tree Regression"][best_model_index]

print(f"The best fitting model for the given dataset is: {best_model}")
