In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import numpy as np
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l1, l2, l1_l2
from sklearn.feature_selection import SelectFdr, f_regression, VarianceThreshold, SequentialFeatureSelector, SelectKBest
from sklearn.neural_network import MLPRegressor
# from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

In [97]:
# Print the installed TensorFlow version so the environment used for the run is on record.
import tensorflow as tf
print(tf.__version__)

2.15.0


In [2]:
import os

# Directory that holds train.csv / test.csv.
# Set the IML_DATA_DIR environment variable to point somewhere else; the original
# author's local folder is kept as the fallback so existing setups behave as before.
DATA_DIR = os.environ.get("IML_DATA_DIR", "C:\\Users\\faizan\\Documents\\IMLChallenge02")

# Only switch directories when the target actually exists. On any other machine the
# hard-coded path is missing and an unconditional chdir would raise FileNotFoundError
# before a single cell of the analysis could run.
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)
else:
    print(f"Data directory {DATA_DIR!r} not found; staying in {os.getcwd()!r}")

In [60]:
# Load the training and test tables from the current working directory.
train_df, test_df = map(pd.read_csv, ('./train.csv', './test.csv'))

In [61]:
# Correlation can only be computed on numeric dtypes, so restrict the frame first.
train_numeric = train_df.select_dtypes(include='number')

# Pearson-correlation screen: keep every numeric column whose absolute correlation
# with the target exceeds the cut-off. If 'price_doc' is numeric it correlates
# perfectly with itself and is therefore part of the resulting index as well.
correlation_threshold = 0.1  # Adjust this threshold based on your analysis
correlation_with_target = train_numeric.corrwith(train_df['price_doc']).abs()
relevant_features = correlation_with_target.loc[
    correlation_with_target.gt(correlation_threshold)
].index

In [62]:
# How many columns passed the correlation screen. The target column is still part of
# this index at this point; it is dropped when X_train_full is built.
relevant_features.shape

(255,)

In [63]:
# Target vector.
y_train_full = train_df['price_doc']

# Feature matrix: the screened columns, minus the target that the screen also selected.
X_train_full = train_df.loc[:, relevant_features].drop(columns=['price_doc'])

In [73]:
# Separate numerical and categorical columns.
# X_train_full is built from the numeric-only correlation screen, so categorical_cols
# is empty for this data; the categorical branch is kept so the preprocessor keeps
# working if object columns are ever added to the feature set.
numerical_cols = X_train_full.select_dtypes(include=[np.number]).columns
categorical_cols = X_train_full.select_dtypes(include=['object']).columns

# Per-dtype transformers. They are defined once here and reused below, so the
# ColumnTransformer no longer carries a second, separately configured copy.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its transformer; columns are selected by name.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [74]:
# Create MLP Regressor
mlp_regressor = MLPRegressor(
    hidden_layer_sizes=(64, 32, 16, 8),
    activation='relu',
    solver='adam',
    alpha=0.05,  # L2 regularization (adjust as needed)
    max_iter=100,  # Maximum number of iterations (adjust as needed)
    # Native stopping rule: training ends once the loss has not improved for this many
    # consecutive epochs. This is the sklearn counterpart of a patience-10 callback on
    # the training loss, and is spelled out so the intent is visible.
    n_iter_no_change=10,
    # Fix weight initialisation and batch shuffling so reruns give the same model,
    # matching the seeded train/validation split below.
    random_state=42,
)

# Create a pipeline with preprocessor and the MLP Regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', mlp_regressor)
])

# Keras EarlyStopping callback. MLPRegressor cannot consume Keras callbacks (its
# stopping rule is configured above), so this object has no effect on the sklearn
# pipeline; the name is kept defined for cells that fit a Keras model.
early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

In [52]:

# Split data into training and validation sets first, so everything below is sized
# and fit on the training rows only.
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Feature selection using f_regression (currently disabled in the step list below)
selector = SelectKBest(score_func=f_regression, k=50)  # Adjust k as needed

# Feature steps that run in front of the network.
feature_steps = [
    ('preprocessor', preprocessor),
    # ('pca', PCA(n_components=50)),
    ('variance_filter', VarianceThreshold(threshold=0.05)),
    # ('selector', selector),
]

# The network has to be sized to what the feature steps emit, not to the raw frame:
# VarianceThreshold (and PCA / SelectKBest, if enabled) can change the column count,
# and a mismatch would make model.fit fail on the input shape.
n_model_inputs = Pipeline(list(feature_steps)).fit_transform(X_train, y_train).shape[1]

# Input layer
input_layer = Input(shape=(n_model_inputs,))

# First dense layer with ReLU activation and L1 regularization
dense_layer1 = Dense(units=64, activation='relu', kernel_regularizer=l1(0.01))(input_layer)

# Second dense layer with ReLU activation and L1 regularization
dense_layer2 = Dense(units=32, activation='relu', kernel_regularizer=l1(0.01))(dense_layer1)

# Third dense layer with ReLU activation (no regularization)
dense_layer3 = Dense(units=16, activation='relu')(dense_layer2)

# Fourth dense layer with ReLU activation (no regularization)
dense_layer4 = Dense(units=8, activation='relu')(dense_layer3)

# Output layer with linear activation (single regression output)
output_layer = Dense(units=1, activation='linear')(dense_layer4)

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Create a pipeline with the feature steps and the Keras model as the final step.
# The feature steps are refit on whatever data pipeline.fit receives.
pipeline = Pipeline(feature_steps + [('regressor', model)])

# EarlyStopping callback on the training loss. It only takes effect when it is passed
# to the Keras fit call (e.g. regressor__callbacks=[early_stopping]).
early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)


In [75]:
# Fit the model
# `pipeline` is assigned in both the MLPRegressor cell and the Keras cell, so this call
# fits whichever of the two was defined most recently. No extra fit arguments are passed.
pipeline.fit(X_train, y_train)



In [66]:
# Fit the pipeline (Keras variant).
# The regressor__* keywords are forwarded to the final step's fit method and are Keras
# Model.fit arguments, so `pipeline` must be the Keras pipeline here; running this
# against the MLPRegressor pipeline raises the TypeError about 'epochs'.
# The EarlyStopping callback is passed explicitly: constructing it alone does nothing.
pipeline.fit(
    X_train,
    y_train,
    regressor__epochs=20,
    regressor__batch_size=40,
    regressor__verbose=1,
    regressor__callbacks=[early_stopping],
)


TypeError: BaseMultilayerPerceptron.fit() got an unexpected keyword argument 'epochs'

In [76]:
# Predict on the held-out validation rows with the currently fitted pipeline.
y_pred_val = pipeline.predict(X_val)

# Root-mean-squared error between the validation targets and the predictions.
rmse_val = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred_val))
print(f'Validation RMSE: {rmse_val}')


Validation RMSE: 13248631.318552667


In [77]:

# Work on a copy of the test frame so the loaded table itself is left untouched.
X_test = test_df.copy()

# Predict and collapse the result to one dimension.
predicted_price = pipeline.predict(X_test).flatten()

# Submission table: the test row identifiers next to the predicted prices.
submission_df = test_df[['row ID']].assign(price_doc=predicted_price)

# Write the submission without the pandas index column.
submission_df.to_csv('prediction_neuralNetwork.csv', index=False)


In [59]:
# Drop the references to both loaded frames in one statement.
del train_df, test_df