In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import skew

In [None]:
#@title Preprocessing database

# Column names treated as identifiers / bookkeeping metadata rather than features.
# `remove_irrelevant_columns` drops every DataFrame column whose name matches one
# of these entries exactly (case-sensitive), which is why each spelling variant is
# listed explicitly.
irrelevant_columns = [
    "ID", "id", "Index", "index", "Serial Number", "serial_number",
    "Address", "address", "Href", "href", "Timestamp", "timestamp",
    "Creation Date", "creation_date", "Last Updated Date", "last_updated_date",
    # The trailing comma after "year" matters: without it Python concatenates the
    # adjacent literals into "yearRow ID" and both names vanish from the list.
    "Version", "version", "Checksum", "checksum", "Year", "year",
    "Row ID", "row_id", "Record ID", "record_id", "Customer ID", "customer_id",
    "Client ID", "client_id", "Account ID", "account_id", "Transaction ID", "transaction_id",
    "Email", "email", "Phone Number", "phone_number", "Website", "website",
    "Fax", "fax", "IP Address", "ip_address", "MAC Address", "mac_address",
    "Social Security Number", "social_security_number", "Driver's License", "drivers_license",
    # Underscore-separated / underscore-suffixed spellings of the names above.
    "ID_", "id_", "Index_", "index_", "Serial_Number", "serial_number_",
    "Address_", "address_", "Href_", "href_", "Timestamp_", "timestamp_",
    "Creation_Date", "creation_date_", "Last_Updated_Date", "last_updated_date_",
    "Version_", "version_", "Checksum_", "checksum_",
    "Row_ID", "row_id_", "Record_ID", "record_id_", "Customer_ID", "customer_id_",
    "Client_ID", "client_id_", "Account_ID", "account_id_", "Transaction_ID", "transaction_id_",
    "Email_", "email_", "Phone_Number", "phone_number_", "Website_", "website_",
    "Fax_", "fax_", "IP_Address", "ip_address_", "MAC_Address", "mac_address_",
    "Social_Security_Number", "social_security_number_", "Driver's_License", "drivers_license_",
]

# Lower-cased ordinal labels mapped to their integer rank. `encode_objects`
# lower-cases a column's values and looks them up here.
# Every label appears exactly once: a repeated key in a dict literal is silently
# overridden by its last occurrence, so labels shared by several scales are
# defined under the first scale only and merely mentioned under the others.
ordinal_data = {
    # Generic level
    'low': 1,
    'medium': 2,
    'moderate': 2,
    'high': 3,
    # Education Level
    'high school diploma': 1,
    "associate's degree": 2,
    "bachelor's degree": 3,
    "master's degree": 4,
    'doctorate degree': 5,
    # Income Level
    'low income': 1,
    'middle income': 2,
    'high income': 3,
    # Customer Satisfaction
    'very dissatisfied': 1,
    'dissatisfied': 2,
    'neutral': 3,
    'satisfied': 4,
    'very satisfied': 5,
    # Likert Scale
    'strongly disagree': 1,
    'disagree': 2,
    'neither agree nor disagree': 3,
    'agree': 4,
    'strongly agree': 5,
    # Job Seniority
    'entry-level': 1,
    'mid-level': 2,
    'senior-level': 3,
    'executive-level': 4,
    # Severity of Illness/Condition ('moderate' is defined under "Generic level")
    'mild': 1,
    'severe': 3,
    # Temperature
    'cold': 1,
    'warm': 2,
    'hot': 3,
    'very hot': 4,
    # Customer Rating
    '1 star': 1,
    '2 stars': 2,
    '3 stars': 3,
    '4 stars': 4,
    '5 stars': 5,
    # Likelihood of Purchase
    'very unlikely': 1,
    'unlikely': 2,
    'likely': 4,
    'very likely': 5,
    # Degree of Agreement: reuses 'strongly disagree', 'disagree', 'neutral',
    # 'agree' and 'strongly agree' defined above (same ranks).
    # Pain Scale
    'no pain': 1,
    'mild pain': 2,
    'moderate pain': 3,
    'severe pain': 4,
    'extreme pain': 5,
    # Likelihood of Recommendation
    'very unlikely to recommend': 1,
    'unlikely to recommend': 2,
    'likely to recommend': 4,
    'very likely to recommend': 5,
    # Quality Ratings
    'poor quality': 1,
    'fair quality': 2,
    'good quality': 3,
    'very good quality': 4,
    'excellent quality': 5,
    # Customer Service Experience
    'very poor': 1,
    'poor': 2,
    'average': 3,
    'good': 4,
    'excellent': 5,
    # Ease of Use
    'very difficult': 1,
    'difficult': 2,
    'easy': 4,
    'very easy': 5,
    # Likelihood of Churn
    'very unlikely to churn': 1,
    'unlikely to churn': 2,
    'likely to churn': 4,
    'very likely to churn': 5,
    # Satisfaction with Product/Service
    'not satisfied': 1,
    'slightly satisfied': 2,
    'moderately satisfied': 3,
    'extremely satisfied': 5,
    # Risk Levels
    'low risk': 1,
    'moderate risk': 2,
    'high risk': 3,
    # Performance Ratings
    'below expectations': 1,
    'meeting expectations': 2,
    'exceeding expectations': 3,
}

In [None]:
#@title Data Preprocessing

def remove_irrelevant_columns(data, irrelevant_columns):
  """Drop, in place, every column of `data` whose name is listed in `irrelevant_columns`.

  Matching is an exact membership test on the column label. The DataFrame that
  was passed in is modified and also returned.
  """
  unwanted = []
  for name in data.columns:
    if name in irrelevant_columns:
      unwanted.append(name)

  if unwanted:
    data.drop(columns=unwanted, inplace=True)

  return data


def remove_duplicate_values(data):
  """Remove fully duplicated rows from `data` in place and return it.

  The first occurrence of each row is kept and the surviving rows keep their
  original index labels (the index is not renumbered).
  """
  has_repeats = data.duplicated().any()
  if not has_repeats:
    return data

  data.drop_duplicates(inplace=True)
  return data


def remove_constant_values(data):
  """Drop, in place, every column that holds exactly one distinct value.

  Distinct values are counted with `Series.nunique()`, whose default ignores
  missing entries, so a column with one value plus NaNs is also dropped.
  Returns the same DataFrame object.
  """
  single_valued = []
  for name in data.columns:
    if data[name].nunique() == 1:
      single_valued.append(name)

  if single_valued:
    data.drop(columns=single_valued, inplace=True)

  return data


def remove_string_numerical(data):
  """Convert object columns made up solely of digit strings to numeric dtype.

  A column qualifies when its dtype is `object` and `str.isnumeric()` holds for
  all of its values; strings such as "1.5" or "-3" therefore do not qualify.
  The conversion is written back into `data`, which is also returned.
  """
  def _is_digit_text(name):
    column = data[name]
    return column.dtype == 'object' and column.str.isnumeric().all()

  targets = [name for name in data.columns if _is_digit_text(name)]
  if not targets:
    return data

  data[targets] = data[targets].apply(pd.to_numeric)
  return data


def remove_object_numerical(data):
  """Extract the first number embedded in each object column into `<col>_numeric`.

  For every object-dtype column, each value is converted with `str()` and
  searched for the first unsigned integer or decimal (e.g. "1.4L" -> 1.4).
  When at least one value in the column yields a number, a float column named
  `<col>_numeric` is added, with NaN where nothing was found. Columns that
  contain no digits at all (plain text categories) get no companion column,
  so the frame is not filled with empty all-missing columns.

  The source columns are left untouched. `data` is modified in place and returned.
  """
  import re

  number_pattern = re.compile(r'\d+\.\d+|\d+')

  def _first_number(value):
    # Leftmost match, i.e. the same value `re.findall(...)[0]` would give.
    found = number_pattern.search(str(value))
    return float(found.group()) if found else float('nan')

  # The column list is materialised before the loop, so the `_numeric` columns
  # added below are never themselves scanned.
  for col in data.select_dtypes(include=['object']).columns:
    extracted = data[col].map(_first_number).astype(float)
    if extracted.notna().any():
      data[col + '_numeric'] = extracted

  return data


def missing_values(data, threshold=0.5, k_neighbors=5):
  """Impute missing values in place and return `data`.

  Numeric columns are imputed with `KNNImputer(n_neighbors=k_neighbors)` when
  any column of the frame has a missing fraction above `threshold`, otherwise
  with the column mean. Object columns are imputed with their most frequent value.

  A column group (numeric / object) is only refitted when it actually contains
  missing values, so integer columns are not needlessly turned into floats, and
  an empty group is never handed to an imputer.
  """
  null_mask = data.isnull()
  if not null_mask.any().any():
    return data

  numerical_cols = data.select_dtypes(include=['number']).columns
  categorical_cols = data.select_dtypes(include=['object']).columns

  if len(numerical_cols) > 0 and null_mask[numerical_cols].any().any():
    # The switch to KNN is decided on the missing fraction of *all* columns,
    # not only the numeric ones.
    if (null_mask.mean() > threshold).any():
      imputer = KNNImputer(n_neighbors=k_neighbors)
    else:
      imputer = SimpleImputer(strategy='mean')
    data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

  if len(categorical_cols) > 0 and null_mask[categorical_cols].any().any():
    mode_imputer = SimpleImputer(strategy='most_frequent')
    data[categorical_cols] = mode_imputer.fit_transform(data[categorical_cols])

  return data


# def convert_datetime(data):
#     object_cols = data.select_dtypes(include=['object']).columns

#     if len(object_cols) > 0:
#         for col in object_cols:
#             try:
#                 data[col] = pd.to_datetime(data[col])
#                 data[col + '_numeric'] = data[col].astype('int64') // 10**9
#                 data.drop(columns=[col], inplace=True)
#             except (ValueError, TypeError):
#                 pass

#     return data


def encode_objects(data):
  categorical_columns = data.select_dtypes(include=['object']).columns

  if len(categorical_columns) > 0:
    for col in categorical_columns:
      unique_values_count = data[col].nunique()

      if unique_values_count == 2:
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(data[col])

      elif unique_values_count <= 7:
          if all(value in ordinal_data for value in data[col].str.lower()):
            data[col] = data[col].str.lower().map(ordinal_data)
          elif any(word in data[col].str.lower() for word in ['low', 'medium', 'moderate', 'high']):
            data[col] = data[col].apply(lambda x: ordinal_data[x.lower()] if x.lower() in ordinal_data else x)
          else:
            encoder = OneHotEncoder(sparse_output=False, drop='first')
            encoded_values = encoder.fit_transform(data[[col]])
            col_names = [f"{col}_{value}" for value in encoder.categories_[0][1:]]
            df = pd.DataFrame(encoded_values, columns=col_names)
            data = pd.concat([data, df], axis=1)
            data.drop(columns=[col], inplace=True)

  return data


def adjust_values(data):
  """Standardise the numeric columns of `data` in place and return it.

  `StandardScaler.fit_transform` returns a new array instead of modifying its
  input, so the scaled values are written back into the frame. Non-numeric
  columns are left untouched, and a frame with no numeric columns or no rows
  is returned unchanged.

  NOTE: every numeric column is scaled, including a target/label column if it
  is still part of `data`; separate it beforehand when that is not wanted.
  """
  numeric_cols = data.select_dtypes(include=['number']).columns
  if len(numeric_cols) == 0 or len(data) == 0:
    return data

  scaler = StandardScaler()
  data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

  return data


# Master function: runs every preprocessing step in a fixed order.
def automatic_data_preprocessing(data, threshold=0.5, k_neighbors=5, target=None):
    """Run the full preprocessing pipeline and return the processed DataFrame.

    Steps, in order: drop columns listed in `irrelevant_columns`, drop duplicate
    rows, drop constant columns, convert digit-only text columns to numbers,
    extract embedded numbers from text columns, impute missing values, encode
    object columns, and finally call `adjust_values`.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is copied first, so the caller's object is never
        modified and running the pipeline again on the same input is safe.
    threshold, k_neighbors :
        Forwarded to `missing_values`.
    target : optional column label
        When given and still present after encoding, this column is held out
        of the final `adjust_values` step and re-attached unchanged as the
        last column.
    """
    # Several steps modify their argument in place while `encode_objects` may
    # return a new object; without a copy the caller's frame would be left
    # half-processed.
    data = data.copy()

    data = remove_irrelevant_columns(data, irrelevant_columns)
    data = remove_duplicate_values(data)
    data = remove_constant_values(data)
    data = remove_string_numerical(data)
    data = remove_object_numerical(data)
    data = missing_values(data, threshold, k_neighbors)
    # data = convert_datetime(data)  # datetime conversion is currently disabled
    data = encode_objects(data)

    held_out = None
    if target is not None and target in data.columns:
        held_out = data.pop(target)

    data = adjust_values(data)

    if held_out is not None:
        data[target] = held_out

    return data

In [None]:
#@title Feature selection
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

def feature_selection(X, y, k=10, problem_type=None):
    """Select the `k` best features of `X` with a univariate filter (`SelectKBest`).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels are used to name the selected features.
    y : array-like
        Target values.
    k : int or 'all'
        Number of features to keep. An integer larger than the number of
        columns in `X` is reduced to that number.
    problem_type : str
        'classification' (scored with `f_classif`) or 'regression' (scored with
        `f_regression`); matched case-insensitively.

    Returns
    -------
    (X_new, selected_features_names)
        The reduced feature array and the list of selected column names.

    Raises
    ------
    ValueError
        If `problem_type` is missing or not one of the two supported values.
    """
    # `problem_type` has a default only because a parameter without one cannot
    # follow `k=10`; leaving it out still ends in the ValueError below.
    kind = problem_type.lower() if isinstance(problem_type, str) else problem_type

    if kind == 'classification':
        score_func = f_classif
    elif kind == 'regression':
        score_func = f_regression
    else:
        raise ValueError("Invalid problem type. Use 'Classification' or 'Regression' !!")

    if k != 'all':
        k = min(k, X.shape[1])

    selector = SelectKBest(score_func=score_func, k=k)
    X_new = selector.fit_transform(X, y)
    selected_features_indices = selector.get_support(indices=True)
    selected_features_names = X.columns[selected_features_indices].tolist()

    return X_new, selected_features_names


# Backward-compatible alias: the function has also been called under this name.
feature_selection_filter_method = feature_selection

# Example usage for classification
data_c = pd.read_csv('stroke_data.csv')
cdata = automatic_data_preprocessing(data_c)
X_train = cdata.drop('stroke', axis=1)
y_train = cdata['stroke']

# The selector defined in this cell is named `feature_selection`.
X_train_selected, selected_features = feature_selection(X_train, y_train, problem_type='classification')

print("Selected Features (Classification):", selected_features)

# Example usage for regression
data_r = pd.read_csv('audiA1_price_data.csv')
rdata = automatic_data_preprocessing(data_r)
X_train_r = rdata.drop('Price(£)', axis=1)
y_train_r = rdata['Price(£)']

X_train_selected_r, selected_features_r = feature_selection(X_train_r, y_train_r, problem_type='regression')

print("\nSelected Features (Regression):", selected_features_r)

Selected Features (Classification): ['age', 'hypertension', 'heart_disease', 'ever_married', 'Residence_type', 'avg_glucose_level', 'bmi', 'work_type_Self-employed', 'work_type_children', 'smoking_status_formerly smoked']

Selected Features (Regression): ['Mileage(miles)', 'Transmission', 'Fuel', 'Number_of_Owners', 'MileageRank', 'PriceRank', 'Engine_numeric', 'Engine_1.4L', 'Engine_1.5L', 'Engine_1.6L']


In [None]:
# Audit of the raw data: duplicate rows, columns with nulls, dtypes present and
# the number of distinct values per column.
dataset = pd.read_csv('audiA1_price_data.csv')

num_duplicates = dataset.duplicated().sum()
print('Number of Duplicate Values:', num_duplicates)
print()
null_columns = dataset.columns[dataset.isnull().any()]
print('Features with Null Values:', null_columns.tolist())
print()
print('Data Types:', dataset.dtypes.unique())
print()
for column_name in dataset.columns:
    unique_values = dataset[column_name].nunique()
    print(f"Unique values in '{column_name}': {unique_values}")
print()
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line.
dataset.info()

Number of Duplicate Values: 0

Features with Null Values: []

Data Types: [dtype('int64') dtype('float64') dtype('O')]

Unique values in 'index': 471
Unique values in 'Year': 9
Unique values in 'Type': 1
Unique values in 'Mileage(miles)': 416
Unique values in 'Engine': 6
Unique values in 'PS': 28
Unique values in 'Transmission': 2
Unique values in 'Fuel': 2
Unique values in 'Number_of_Owners': 7
Unique values in 'href': 471
Unique values in 'PPY': 406
Unique values in 'MileageRank': 471
Unique values in 'PriceRank': 471
Unique values in 'PPYRank': 471
Unique values in 'Score': 306
Unique values in 'Price(£)': 356

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             471 non-null    int64  
 1   Year              471 non-null    float64
 2   Type              471 non-null    object 
 3   Mileage(miles)    471 non-null   

In [None]:
# Same audit as for the raw data, run on the output of the preprocessing pipeline.
preprocessed_data = automatic_data_preprocessing(dataset)

num_duplicate = preprocessed_data.duplicated().sum()
print('Number of Duplicate Values:', num_duplicate)
print()
null_column = preprocessed_data.columns[preprocessed_data.isnull().any()]
print('Features with Null Values:', null_column.tolist())
print()
print('Data Types:', preprocessed_data.dtypes.unique())
print()
for column_name in preprocessed_data.columns:
    unique_values = preprocessed_data[column_name].nunique()
    print(f"Unique values in '{column_name}': {unique_values}")
print()
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line.
preprocessed_data.info()

Number of Duplicate Values: 0

Features with Null Values: []

Data Types: [dtype('float64') dtype('int64')]

Unique values in 'Mileage(miles)': 416
Unique values in 'PS': 28
Unique values in 'Transmission': 2
Unique values in 'Fuel': 2
Unique values in 'Number_of_Owners': 7
Unique values in 'PPY': 406
Unique values in 'MileageRank': 471
Unique values in 'PriceRank': 471
Unique values in 'PPYRank': 471
Unique values in 'Score': 306
Unique values in 'Price(£)': 356
Unique values in 'Engine_numeric': 6
Unique values in 'Transmission_numeric_numeric': 1
Unique values in 'Fuel_numeric_numeric': 1
Unique values in 'Engine_1.2L': 2
Unique values in 'Engine_1.4L': 2
Unique values in 'Engine_1.5L': 2
Unique values in 'Engine_1.6L': 2
Unique values in 'Engine_2.0L': 2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Mi

  data[col] = pd.to_datetime(data[col])
  data[col] = pd.to_datetime(data[col])
  data[col] = pd.to_datetime(data[col])
