Dataset Reading

In [2]:
import os

import pandas as pd

# Every path in this notebook ('Datasets/...', 'Submissions/...') is written
# relative to the directory one level above the starting working directory.
# Only step up when the data folder is not already visible: an unconditional
# chdir climbs one more level each time this cell is re-run, which breaks the
# relative reads below.
if not os.path.isdir('Datasets'):
    os.chdir('../')

# Training / analysis table; the target column is split off in the next cell.
data = pd.read_csv('Datasets/analysis_data.csv')

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer  # must precede the IterativeImputer import
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector

# Split the target from the predictors.
y = data['monthly_spend']
X = data.drop(columns=['monthly_spend'])

# Columns that get one-hot encoded; everything else is passed through as-is.
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 1. One-hot encode the categorical columns and pass the rest through.
#    verbose_feature_names_out=False keeps the plain names ("gender_female",
#    original column names) instead of prefixing them with the transformer name.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
    ],
    remainder='passthrough',  # Keep the non-categorical columns as they are
    verbose_feature_names_out=False,
)
X_encoded = preprocessor.fit_transform(X)

# 2. Label the encoded matrix with the names reported by the fitted transformer.
#    Asking the transformer guarantees one label per output column, in output
#    order, even when a passthrough column is not numeric (e.g. bool/datetime);
#    rebuilding the list by hand from select_dtypes('number') does not.
new_cols = list(preprocessor.get_feature_names_out())
X_processed = pd.DataFrame(X_encoded, columns=new_cols, index=X.index)

# 3. Create the Iterative Imputer (MICE) with a small random forest as the
#    per-column estimator.
estimator = RandomForestRegressor(n_estimators=10, random_state=2121)
mice_imputer = IterativeImputer(
    estimator=estimator,
    max_iter=10,              # Number of imputation cycles (T)
    initial_strategy='mean',  # Initial placeholder strategy
    random_state=2121
)

# 4. Fit the imputer and transform the data.
#    The imputer works on all columns, using each as a predictor for the others.
df_imputed_array = mice_imputer.fit_transform(X_processed)

# Convert the resulting NumPy array back to a DataFrame, keeping labels and index.
df_imputed = pd.DataFrame(
    df_imputed_array,
    columns=X_processed.columns,
    index=X_processed.index,
)

# 5. Forward floating feature selection with a linear model, scored by
#    5-fold cross-validated R^2; 'best' keeps the best-scoring subset size.
# NOTE(review): encoder, imputer and selector are all fitted on the full table,
# so the CV scores below are not free of preprocessing leakage — confirm this is acceptable.
stfs = SequentialFeatureSelector(
    LinearRegression(),
    k_features='best',
    forward=True,
    floating=True,
    scoring='r2',
    cv=5,
)
stfs = stfs.fit(df_imputed, y)

# Candidate subsets ranked by mean CV score (best first).
(
    pd.DataFrame(stfs.get_metric_dict()).T
    .loc[:, ['feature_names', 'avg_score']]
    .sort_values('avg_score', ascending=False)
)



Unnamed: 0,feature_names,avg_score
31,"(gender_female, gender_male, marital_status_ma...",0.770015
29,"(gender_female, gender_male, marital_status_ma...",0.770015
30,"(gender_female, gender_male, marital_status_ma...",0.770015
32,"(gender_female, gender_male, marital_status_ma...",0.770014
27,"(gender_female, marital_status_married, educat...",0.770014
26,"(gender_female, marital_status_single, educati...",0.770014
28,"(gender_female, marital_status_married, educat...",0.770014
33,"(gender_female, gender_male, marital_status_ma...",0.770011
34,"(gender_female, gender_male, marital_status_ma...",0.770004
25,"(gender_female, marital_status_single, educati...",0.770003


In [6]:
from sklearn.metrics import root_mean_squared_error

# Keep only the columns chosen by the sequential feature selector.
X_train = df_imputed[list(stfs.k_feature_names_)]

# Ordinary least squares on the selected subset.
reg = LinearRegression()
reg = reg.fit(X_train, y)

# Predictions on the same rows the model was fitted on.
y_hat = reg.predict(X_train)

# In-sample RMSE: the model is evaluated on its own training rows, so this is
# an optimistic figure rather than a hold-out estimate.
root_mean_squared_error(y, y_hat)

257.3931557148718

Prediction on Scoring Data

In [7]:
scoring_data = pd.read_csv('Datasets/scoring_data.csv')

# Encode with the transformer that was fitted on the training data (no refit).
scoring_data_encoded = preprocessor.transform(scoring_data)

# Label the encoded matrix with the training-time column layout.
# The transformer's output order is fixed when it is fitted, so the labels are
# taken from the processed training frame rather than re-derived from the
# scoring frame's own dtypes/column order, which may not match. Reusing them
# also avoids overwriting the training-time `categorical_columns`,
# `numeric_cols` and `new_cols` variables.
scoring_data_processed = pd.DataFrame(
    scoring_data_encoded,
    columns=X_processed.columns,
    index=scoring_data.index,
)

# Fill missing values with the already-fitted MICE imputer (transform only).
scoring_data_imputed_array = mice_imputer.transform(scoring_data_processed)
scoring_data_pred = pd.DataFrame(
    scoring_data_imputed_array,
    columns=scoring_data_processed.columns,
    index=scoring_data_processed.index,
)

# Keep exactly the features the selector chose, in the order the model was fitted on.
scoring_data_pred = scoring_data_pred[list(stfs.k_feature_names_)]

In [8]:
# Predict the target for the prepared scoring rows with the fitted model `reg`.
pred=reg.predict(scoring_data_pred)

In [10]:
# Make sure the output folder exists so the write below cannot fail on a
# missing directory after all the expensive steps have already run.
os.makedirs('Submissions', exist_ok=True)

# One row per scoring customer: its id and the predicted monthly spend.
submission_file = pd.DataFrame({'customer_id': scoring_data.customer_id, 'monthly_spend': pred})
submission_file.to_csv('Submissions/submission_file_6.csv', index=False)