In [1]:
import os
os.chdir('../')
import pandas as pd

data = pd.read_csv('Datasets/analysis_data.csv')

In [2]:
# Imputation
data['education_level'].fillna(data['education_level'].mode()[0],inplace=True)
data['online_shopping_freq'].fillna(data['online_shopping_freq'].median(),inplace=True)
data['utility_payment_count'].fillna(data['utility_payment_count'].median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['education_level'].fillna(data['education_level'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['online_shopping_freq'].fillna(data['online_shopping_freq'].median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will 

In [3]:
# Splitting data and getting dummies
X_train = data.drop(labels='monthly_spend',axis=1)
y = data.monthly_spend
categorical_variables=['gender','marital_status','education_level','region','employment_status','card_type']
X= pd.get_dummies(X_train, 
                         prefix_sep = '_', 
                         columns = categorical_variables, 
                         drop_first = True)


In [5]:
len(X.columns)

28

In [7]:
# Forward
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
sfs = SequentialFeatureSelector(LinearRegression(),
                                  k_features='best',       
                                  forward=True,
                                  floating=False,          
                                  scoring='r2',
                                  cv=5)
sfs = sfs.fit(X, y)
pd.DataFrame(sfs.get_metric_dict()).T.loc[:,['feature_names','avg_score']].sort_values('avg_score', ascending = False)

Unnamed: 0,feature_names,avg_score
25,"(owns_home, has_auto_loan, credit_score, credi...",0.77001
26,"(customer_id, owns_home, has_auto_loan, credit...",0.770007
27,"(customer_id, age, owns_home, has_auto_loan, c...",0.77
28,"(customer_id, age, owns_home, has_auto_loan, a...",0.769989
24,"(owns_home, has_auto_loan, credit_score, credi...",0.769968
23,"(owns_home, has_auto_loan, credit_score, credi...",0.769868
22,"(owns_home, has_auto_loan, credit_limit, tenur...",0.769619
21,"(owns_home, has_auto_loan, credit_limit, tenur...",0.769177
20,"(owns_home, has_auto_loan, credit_limit, tenur...",0.768713
19,"(owns_home, has_auto_loan, credit_limit, tenur...",0.767725


In [9]:
len(sfs.k_feature_names_)

25

In [11]:
# Backward
bfs = SequentialFeatureSelector(LinearRegression(),
          k_features='best',       
          forward=False,
          floating=False,          
          scoring='r2',
          cv=5)
bfs = bfs.fit(X, y)
pd.DataFrame(sfs.get_metric_dict()).T.loc[:,['feature_names','avg_score']].sort_values('avg_score', ascending = False)

Unnamed: 0,feature_names,avg_score
25,"(owns_home, has_auto_loan, credit_score, credi...",0.77001
26,"(customer_id, owns_home, has_auto_loan, credit...",0.770007
27,"(customer_id, age, owns_home, has_auto_loan, c...",0.77
28,"(customer_id, age, owns_home, has_auto_loan, a...",0.769989
24,"(owns_home, has_auto_loan, credit_score, credi...",0.769968
23,"(owns_home, has_auto_loan, credit_score, credi...",0.769868
22,"(owns_home, has_auto_loan, credit_limit, tenur...",0.769619
21,"(owns_home, has_auto_loan, credit_limit, tenur...",0.769177
20,"(owns_home, has_auto_loan, credit_limit, tenur...",0.768713
19,"(owns_home, has_auto_loan, credit_limit, tenur...",0.767725


In [13]:
len(bfs.k_feature_names_)

25

In [14]:
sfs.k_feature_names_==bfs.k_feature_names_

True

In [15]:
# Lets try Stepwise

stfs = SequentialFeatureSelector(LinearRegression(),
          k_features='best',       
          forward=True,
          floating=True,           
          scoring='r2',
          cv=5)
stfs = stfs.fit(X, y)
pd.DataFrame(sfs.get_metric_dict()).T.loc[:,['feature_names','avg_score']].sort_values('avg_score', ascending = False)

Unnamed: 0,feature_names,avg_score
25,"(owns_home, has_auto_loan, credit_score, credi...",0.77001
26,"(customer_id, owns_home, has_auto_loan, credit...",0.770007
27,"(customer_id, age, owns_home, has_auto_loan, c...",0.77
28,"(customer_id, age, owns_home, has_auto_loan, a...",0.769989
24,"(owns_home, has_auto_loan, credit_score, credi...",0.769968
23,"(owns_home, has_auto_loan, credit_score, credi...",0.769868
22,"(owns_home, has_auto_loan, credit_limit, tenur...",0.769619
21,"(owns_home, has_auto_loan, credit_limit, tenur...",0.769177
20,"(owns_home, has_auto_loan, credit_limit, tenur...",0.768713
19,"(owns_home, has_auto_loan, credit_limit, tenur...",0.767725


In [18]:
sfs.k_feature_names_==stfs.k_feature_names_

True

In [20]:
# All variations give same results

variables=bfs.k_feature_names_
variables=list(variables)
variables

['owns_home',
 'has_auto_loan',
 'credit_score',
 'credit_limit',
 'tenure',
 'num_transactions',
 'avg_transaction_value',
 'online_shopping_freq',
 'reward_points_balance',
 'travel_frequency',
 'utility_payment_count',
 'num_children',
 'num_credit_cards',
 'gender_male',
 'marital_status_single',
 'education_level_graduate',
 'education_level_high school',
 'region_northeast',
 'region_south',
 'region_west',
 'employment_status_self-employed',
 'employment_status_student',
 'employment_status_unemployed',
 'card_type_platinum',
 'card_type_standard']

In [21]:
# Fitting Regression model

reg=LinearRegression()
reg.fit(X[variables],y)
reg.predict(X[variables])

array([1501.68295728, 2123.32655683, 1733.85735472, ..., 1620.03274761,
       1545.07616861, 1938.50058269])

In [22]:
# Evaluation
y_hat=reg.predict(X[variables])
check_file=pd.DataFrame({'monthly_spend_hat':y_hat})
check_file.to_csv('Evaluation/submission3.csv',index=False)

In [23]:
# PREDICTION
scoring_data = pd.read_csv('Datasets/scoring_data.csv')
# Imputation


scoring_data['education_level'].fillna(data['education_level'].mode()[0],inplace=True)
scoring_data['online_shopping_freq'].fillna(data['online_shopping_freq'].median(),inplace=True)
scoring_data['utility_payment_count'].fillna(data['utility_payment_count'].median(),inplace=True)

X_test = pd.get_dummies(scoring_data, 
                         prefix_sep = '_', 
                         columns = categorical_variables, 
                         drop_first = True)

pred = reg.predict(X_test[variables])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  scoring_data['education_level'].fillna(data['education_level'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  scoring_data['online_shopping_freq'].fillna(data['online_shopping_freq'].median(),inplace=True)
The behavior will change in pandas 3.0. This inpl

In [24]:
submission_file = pd.DataFrame({'customer_id': scoring_data.customer_id, 'monthly_spend': pred})
submission_file.to_csv('Submissions/submission_file_3.csv',index = False)