In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler  
from sklearn.impute import SimpleImputer 
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA

In [30]:
def load_basket(filepath):
    basket = pd.read_csv(filepath)
    return basket

def load_info(filepath):
    info = pd.read_csv(filepath)
    return info

customer_info= load_info('customer_info.csv')
customer_basket = load_basket('customer_basket.csv')
customer_info.head()

Unnamed: 0.1,Unnamed: 0,customer_id,customer_name,customer_gender,customer_birthdate,kids_home,teens_home,number_complaints,distinct_stores_visited,lifetime_spend_groceries,lifetime_spend_electronics,typical_hour,lifetime_spend_vegetables,lifetime_spend_nonalcohol_drinks,lifetime_spend_alcohol_drinks,lifetime_spend_meat,lifetime_spend_fish,lifetime_spend_hygiene,lifetime_spend_videogames,lifetime_spend_petfood,lifetime_total_distinct_products,percentage_of_products_bought_promotion,year_first_transaction,loyalty_card_number,latitude,longitude
0,0,29930,April Clark,female,01/15/1972 02:27 PM,2.0,2.0,1.0,4.0,7789.0,5601.0,13.0,726.0,962.0,1213.0,1598.0,1894.0,457.0,412.0,428.0,386.0,0.158741,2018.0,,38.721807,-9.125534
1,1,6813,Bsc. Paul Ketchum,male,07/31/1944 10:53 AM,0.0,1.0,0.0,4.0,8653.0,35.0,14.0,792.0,102.0,104.0,741.0,346.0,394.0,75.0,226.0,73.0,1.22789,2013.0,971840.0,38.734668,-9.163533
2,2,39451,Mary Downing,female,11/13/1989 02:11 PM,2.0,3.0,0.0,7.0,15605.0,4275.0,14.0,1585.0,980.0,1872.0,1323.0,1971.0,920.0,335.0,192.0,319.0,0.101598,2011.0,,38.787126,-9.147077
3,3,21557,Manuel Kueny,male,08/09/1976 06:23 AM,0.0,0.0,1.0,1.0,13440.0,16366.0,14.0,28.0,269.0,1855.0,939.0,785.0,139.0,679.0,270.0,221.0,0.259943,2009.0,,38.741816,-9.1597
4,4,16415,Phd. Curtis Tharp,male,07/11/1966 08:12 AM,1.0,1.0,1.0,5.0,49250.0,3197.0,14.0,258.0,726.0,547.0,983.0,1492.0,1046.0,112.0,144.0,244.0,0.317822,2012.0,925367.0,38.785921,-9.149221


In [None]:
def load_basket(filepath):
    basket = pd.read_csv(filepath)
    return basket

def load_info(filepath):
    info = pd.read_csv(filepath)
    return info

customer_info= load_info('customer_info.csv')
customer_basket = load_basket('customer_basket.csv')



def missing_values(df, strategy='median'):
    handled_missing = df.copy()
    
    # create new df for numeric and categorical columns
    num_cols = df.select_dtypes(include=['number']).columns
    cat_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns

    # Use simple imputer to impute numeric columns by median
    if len(num_cols) > 0:
        num_imputer = SimpleImputer(strategy= 'median')
        handled_missing[num_cols] = num_imputer.fit_transform(df[num_cols])
    
    # Use simple imputer to impute categorical columns by most frequent
    if len(cat_cols) > 0:
        cat_imputer = SimpleImputer(strategy= 'most frequent')
        handled_missing[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

    return handled_missing

def encoding():
    return df


def scalling(df, scaler = 'robust'):
    if scaler == 'minmax':
        scaler = MinMaxScaler()
    elif scaler == 'robust':
        scaler = RobustScaler()
    else:     
        scaler = StandardScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return df_scaled


def preprocess(path):
    df = load_info(path)
    df = missing_values(df)
    df = scalling(df)
    return df

In [None]:
def feature_selection(path, method, threshold=0.01, n_components=3, correlation_threshold=0.9):
    df = preprocess(path)
    original_features = df.columns.tolist()

    result_dict = {feature: False for feature in original_features} 

    if method == "variance":
        selected_features = df.columns[VarianceThreshold(threshold=threshold).fit(df).get_support()]
        result_dict.update({f: True for f in selected_features})
        return pd.Series(result_dict, name="Keep (Variance)")
    
    elif method == 'correlation':
        corr_matrix = df.corr().abs()
        to_drop = set()

        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                col1 = corr_matrix.columns[i]
                col2 = corr_matrix.columns[j]
                if corr_matrix.iloc[i, j] > correlation_threshold:
                    to_drop.add(col2)  
        df_selected = df.drop(columns=list(to_drop))
        for f in df.columns:
            result_dict[f] = f not in to_drop
        return pd.Series(result_dict, name="Keep (Correlation)")

    elif method == "pca":
        pca = PCA(n_components=n_components)
        pca.fit(df)
        matrix = pd.DataFrame(pca.components_.T, index=df.columns, columns=[f"PC{i+1}" for i in range(n_components)])
        
        # A feature is important if it contributes highly to a PC
        importance_threshold = 0.3 
        important_features = set()

        for col in matrix.columns:
            important_features.update(matrix[matrix[col].abs() >= importance_threshold].index.tolist())

        for f in df.columns:
            result_dict[f] = f in important_features
        return pd.Series(result_dict, name="Important in PCA")

    else:
        raise ValueError("Choose method from 'variance', 'correlation', or 'pca'")

In [60]:
var_report = feature_selection("customer_info.csv", method="variance")
corr_report = feature_selection("customer_info.csv", method="correlation")
pca_report = feature_selection("customer_info.csv", method="pca", n_components=5)

summary = pd.concat([var_report, corr_report, pca_report], axis=1)
print(summary)
summary

  df['customer_birthdate'] = pd.to_datetime(df['customer_birthdate'], errors='coerce')
  df['customer_birthdate'] = pd.to_datetime(df['customer_birthdate'], errors='coerce')
  df['customer_birthdate'] = pd.to_datetime(df['customer_birthdate'], errors='coerce')


                                         Keep (Variance)  ...  Important in PCA
Unnamed: 0                                          True  ...             False
kids_home                                           True  ...              True
teens_home                                          True  ...             False
number_complaints                                   True  ...             False
distinct_stores_visited                             True  ...              True
lifetime_spend_groceries                            True  ...              True
lifetime_spend_electronics                          True  ...              True
typical_hour                                        True  ...             False
lifetime_spend_vegetables                           True  ...              True
lifetime_spend_nonalcohol_drinks                    True  ...             False
lifetime_spend_alcohol_drinks                       True  ...             False
lifetime_spend_meat                     

Unnamed: 0,Keep (Variance),Keep (Correlation),Important in PCA
Unnamed: 0,True,True,False
kids_home,True,True,True
teens_home,True,True,False
number_complaints,True,True,False
distinct_stores_visited,True,True,True
lifetime_spend_groceries,True,True,True
lifetime_spend_electronics,True,True,True
typical_hour,True,True,False
lifetime_spend_vegetables,True,True,True
lifetime_spend_nonalcohol_drinks,True,True,False


In [44]:
preprocess('customer_info.csv')

  df['customer_birthdate'] = pd.to_datetime(df['customer_birthdate'], errors='coerce')


Unnamed: 0.1,Unnamed: 0,kids_home,teens_home,number_complaints,distinct_stores_visited,lifetime_spend_groceries,lifetime_spend_electronics,typical_hour,lifetime_spend_vegetables,lifetime_spend_nonalcohol_drinks,lifetime_spend_alcohol_drinks,lifetime_spend_meat,lifetime_spend_fish,lifetime_spend_hygiene,lifetime_spend_videogames,lifetime_spend_petfood,lifetime_total_distinct_products,percentage_of_products_bought_promotion,year_first_transaction,latitude,longitude,age
0,-1.000000,1.0,1.0,0.0,0.5,-0.383664,0.594066,0.000000,0.330430,1.183721,0.545650,1.004418,1.242105,-0.266968,0.289720,0.569307,1.526012,-0.203400,0.428571,-0.753102,0.887667,-0.032258
1,-0.999941,-1.0,0.0,-1.0,0.5,-0.342682,-0.657002,0.142857,0.409733,-0.816279,-0.645542,-0.257732,-0.387368,-0.361991,-0.760125,-0.430693,-0.283237,2.460565,-0.285714,-0.379844,-0.183792,0.838710
2,-0.999883,1.0,2.0,-1.0,2.0,-0.012926,0.296022,0.142857,1.362571,1.225581,1.253491,0.599411,1.323158,0.431373,0.049844,-0.599010,1.138728,-0.345781,-0.571429,1.142705,0.280225,-0.612903
3,-0.999824,-1.0,-1.0,0.0,-1.0,-0.115619,3.013711,0.142857,-0.508261,-0.427907,1.235231,0.033873,0.074737,-0.746606,1.121495,-0.212871,0.572254,0.048761,-0.857143,-0.172381,-0.075719,-0.193548
4,-0.999765,0.0,0.0,0.0,1.0,1.582967,0.053720,0.142857,-0.231901,0.634884,-0.169710,0.098675,0.818947,0.621418,-0.644860,-0.836634,0.705202,0.192977,-0.428571,1.107723,0.219761,0.129032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34055,0.999765,0.0,0.0,-1.0,-0.5,2.202967,0.515172,0.428571,0.434965,0.174419,0.879699,-0.337261,1.024211,0.624434,0.186916,-0.272277,0.236994,-0.194453,0.428571,0.088765,1.500281,-0.903226
34056,0.999824,0.0,0.0,-1.0,0.0,0.009795,0.188357,0.000000,0.061280,0.686047,-0.011815,0.418262,-0.153684,0.475113,-0.174455,0.029703,1.485549,0.931401,0.571429,-0.079368,-0.342687,-0.451613
34057,0.999883,0.0,-1.0,0.0,-0.5,-0.059126,1.309508,0.571429,-0.058877,-0.444186,-0.237379,-0.599411,0.797895,-0.401207,1.510903,-1.004950,0.612717,0.735651,0.428571,0.582766,-0.432914,-0.967742
34058,0.999941,0.0,0.0,0.0,0.5,1.021831,0.391998,-0.571429,-0.307600,0.074419,0.000000,0.463918,0.200000,0.684766,0.255452,0.311881,0.167630,0.484980,0.000000,-0.443864,0.024242,0.322581
