In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFdr, f_regression, VarianceThreshold, SequentialFeatureSelector, SelectKBest

In [3]:
import os

# Folder that holds train.csv, test.csv and sample_submission.csv.
# Set the IML_DATA_DIR environment variable to point somewhere else; the
# default is the original author's local folder.
DATA_DIR = os.environ.get("IML_DATA_DIR", "C:\\Users\\faizan\\Documents\\IMLChallenge02")

# Only switch directories when the folder actually exists. On any other
# machine the notebook then falls back to the current working directory
# (e.g. CSVs placed next to the notebook) instead of crashing here.
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)

In [4]:
# Read the training data, the test data and the sample submission from the
# current working directory, in that order.
train_df, test_df, sample_df = map(
    pd.read_csv,
    ('./train.csv', './test.csv', './sample_submission.csv'),
)

In [4]:
# Display the training frame's dimensions as (rows, columns).
train_df.shape

(181507, 272)

In [5]:
# Minimum absolute Pearson correlation with the target for a column to be kept.
correlation_threshold = 0.05  # Adjust this threshold based on your analysis

# Restrict the frame to numeric columns; correlation is only computed on these.
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
train_numeric = train_df.loc[:, numeric_cols]

# Absolute correlation of every numeric column against 'price_doc'.
correlation_with_target = train_numeric.corrwith(train_df['price_doc']).abs()

# Names of the columns whose correlation exceeds the threshold.
is_relevant = correlation_with_target > correlation_threshold
relevant_features = correlation_with_target.loc[is_relevant].index


In [5]:
# Number of columns that passed the correlation filter. The target
# 'price_doc' is itself part of this index; it is dropped when the feature
# matrix is built.
relevant_features.shape

(256,)

In [6]:
# Feature matrix: the correlation-selected columns without the target.
X_train_full = train_df[relevant_features].drop(columns=['price_doc'])
# Target vector.
y_train_full = train_df.loc[:, 'price_doc']

In [7]:
# Separate numerical and categorical columns.
# NOTE(review): X_train_full is built from the numeric-only correlation
# filter, so categorical_cols is expected to be empty here and the one-hot
# branch below is effectively inactive - confirm this is intended.
numerical_cols = X_train_full.select_dtypes(include=[np.number]).columns
categorical_cols = X_train_full.select_dtypes(include=['object']).columns

# Numeric columns are standardised.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical columns are one-hot encoded; categories unseen during fit are
# encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer. Columns not listed are dropped
# (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Full pipeline: preprocessing -> PCA (22 components) -> drop components with
# variance <= 0.05 -> ordinary least squares.
# random_state is fixed on PCA so that the fit is reproducible when the
# randomized SVD solver is selected.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=22, random_state=42)),
    ('variance_filter', VarianceThreshold(threshold=0.05)),
    ('regressor', LinearRegression())
])

# Hold out 20% of the rows for validation (seeded for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

In [8]:
# Fit the whole pipeline (preprocessing, reduction, regression) on the training split.
model.fit(X=X_train, y=y_train)

In [9]:
# Validation Prediction
y_pred_val = model.predict(X_val)

# Print the number of features produced by the preprocessor. This is the
# width going INTO PCA, not the number of components the regressor sees.
# A single row is enough to read the output width, so the whole training set
# is not transformed a second time.
num_features_used = model.named_steps['preprocessor'].transform(X_train.iloc[:1]).shape[1]
print(f'Number of Features Used: {num_features_used}')

# RMSE calculation for the validation set
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
print(f'Validation RMSE: {rmse_val}')

# Preparing the test set for final prediction: hand the model exactly the
# columns it was trained on (same names, same order), leaving out the id
# column and any other unused columns. A missing column raises a KeyError
# here rather than deep inside the pipeline.
X_test = test_df[X_train.columns]

# Final Prediction for submission
predicted_price = model.predict(X_test)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'row ID': test_df['row ID'],
    'price_doc': predicted_price
})

# Save the DataFrame to a CSV file
submission_df.to_csv('prediction_linearRegression1.csv', index=False)

Number of Features Used: 255
Validation RMSE: 13490956.98875595


In [23]:
# Release the raw frames; they are no longer needed after the submission is written.
del train_df, test_df