In [197]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PowerTransformer

In [2]:
import os

# Directory that holds train.csv / test.csv. The location can be overridden
# per machine through the IML_DATA_DIR environment variable; when the variable
# is unset the original hard-coded location is used, so existing setups keep
# working unchanged.
DATA_DIR = os.environ.get(
    "IML_DATA_DIR",
    "C:\\Users\\faizan\\Documents\\IMLChallenge02",
)

# All later relative paths ('./train.csv', 'prediction_lgbm.csv', ...) resolve
# against this directory.
os.chdir(DATA_DIR)

In [63]:
# Read the train and test tables (in that order) from the working directory.
train_df, test_df = map(pd.read_csv, ('./train.csv', './test.csv'))

In [4]:
# Display the (rows, columns) dimensions of the training frame as loaded.
train_df.shape

(181507, 272)

In [64]:
# Listwise deletion: keep only rows that have no missing value in any column.
train_df = train_df.dropna()

# NOTE(review): rows removed from the test frame here receive no prediction in
# the submission file written at the end of the notebook (it is built from
# this filtered frame) - confirm that dropping test rows is intended.
test_df = test_df.dropna()

In [166]:
# Minimum absolute Pearson correlation with the target for a column to be kept.
correlation_threshold = 0.05

# Only numeric columns can take part in the correlation screen.
train_numeric = train_df.select_dtypes(include='number')

# |corr(column, price_doc)| for every numeric column.
correlation_with_target = train_numeric.corrwith(train_df['price_doc']).abs()

# Names of the columns whose absolute correlation exceeds the threshold.
# Columns with an undefined (NaN) correlation fail the comparison and are left
# out. The target column is dropped again by the cell that builds the feature
# matrix from this index.
passes_screen = correlation_with_target > correlation_threshold
relevant_features = correlation_with_target.loc[passes_screen].index

In [167]:
# Display how many columns passed the correlation screen.
relevant_features.shape

(256,)

In [175]:
# Share of the rows that is retained for modelling.
fraction = 0.2

# Target vector, and the screened predictors with the target column removed.
y_train_full = train_df['price_doc']
X_train_full = train_df[relevant_features].drop(columns='price_doc')

# train_test_split doubles as a seeded random sampler here: the "train" side
# of the split is kept as the subsample and the "test" side (1 - fraction of
# the rows) is thrown away.
X_train_full, _, y_train_full, _ = train_test_split(
    X_train_full,
    y_train_full,
    test_size=1 - fraction,
    random_state=42,
)


In [169]:
# Display the (rows, columns) dimensions of the subsampled feature matrix.
# NOTE(review): the recorded output appears to come from an earlier execution
# (its execution count precedes that of the subsampling cell) - re-run to
# refresh it.
X_train_full.shape

(127054, 255)

In [210]:

# Partition the feature columns by dtype.
numerical_cols = X_train_full.select_dtypes(include='number').columns
categorical_cols = X_train_full.select_dtypes(include='object').columns

# Numeric branch: a single PowerTransformer step with default settings.
# Disabled alternatives from earlier drafts: MinMaxScaler, PCA(n_components=50).
numerical_transformer = Pipeline(steps=[('power_transform', PowerTransformer())])

# Column-wise preprocessing: transform the numeric columns and explicitly
# discard any object-typed ones.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('drop_cat', 'drop', categorical_cols),
])

In [211]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold

# Configure the LightGBM regressor (it is only fitted later, via the pipeline).
lgbm_model = LGBMRegressor(
    n_estimators=1250,
    learning_rate=0.004,
    max_depth=7,
    # LightGBM documents force_col_wise as a bool parameter, so pass a real
    # boolean rather than a string spelling of one.
    force_col_wise=True,
)

# Chain preprocessing and the regressor so both are fitted and applied
# together. Disabled experiments from earlier drafts: SelectKBest(f_regression,
# k=200), VarianceThreshold(0.01 / 0.05), PCA(n_components=100).
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lgbm_model),
])

# Hold out 20% of the subsampled rows for validation (seeded for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
)

In [172]:
# Display the (rows, columns) dimensions of the training split.
# NOTE(review): the recorded output appears stale - its row count does not
# match the number of training points LightGBM reports when fitting below.
X_train.shape

(101643, 255)

In [212]:
# Fit the preprocessing + regressor pipeline on the training split only; the
# validation split is kept aside for scoring.
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Total Bins 65025
[LightGBM] [Info] Number of data points in the train set: 29040, number of used features: 255
[LightGBM] [Info] Start training from score 14789676.385292


In [207]:
# Display the (rows, columns) dimensions of the validation split.
X_val.shape

(7261, 255)

In [213]:
# Predict the hold-out split with the fitted pipeline.
y_pred_val = pipeline.predict(X_val)

# Root-mean-squared error between the validation targets and the predictions.
val_mse = mean_squared_error(y_val, y_pred_val)
rmse_val = np.sqrt(val_mse)
print(f'Validation RMSE: {rmse_val}')

Validation RMSE: 12464871.457827695


In [214]:
# Work on a copy of the test frame so the original stays untouched.
X_test = test_df.copy()

# Predict with the fitted pipeline and make sure the result is one-dimensional.
predicted_price = pipeline.predict(X_test).flatten()

# Pair each test row's identifier with its predicted price.
submission_columns = {
    'row ID': test_df['row ID'],
    'price_doc': predicted_price,
}
submission_df = pd.DataFrame(submission_columns)

# Write the submission to the working directory without the index column.
submission_df.to_csv('prediction_lgbm.csv', index=False)



In [62]:
# Unbind both raw frames so their memory can be reclaimed by the kernel.
del train_df, test_df