In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectFdr, f_regression, VarianceThreshold, SequentialFeatureSelector, SelectKBest

In [4]:
import os
os.chdir("C:\\Users\\faizan\\Documents\\IMLChallenge02")

In [5]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
sample_df = pd.read_csv('./sample_submission.csv')

In [11]:
# Select features and target variable
features = train_df.drop(['price_doc'], axis=1)
target = train_df['price_doc']

# Identify categorical columns
categorical_cols = features.select_dtypes(include=['object']).columns

# Identify numerical columns
numerical_cols = features.select_dtypes(include=['number']).columns

# Create transformers
numerical_transformer = Pipeline(steps=[('numerical', 'passthrough')])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Integrate feature selection with SelectKBest
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=100)),
    ('regressor', LinearRegression())
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=2, random_state=100)


In [12]:
# Train the model
model.fit(X_train, y_train)

# Print the number of features used
num_features_used = model.named_steps['preprocessor'].transform(X_train).shape[1]
print(f'Number of Features Used: {num_features_used}')

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Now, load the test data
test_data = pd.read_csv('test.csv')

# Make predictions on the test data
test_predictions = model.predict(test_data)

# Create a new DataFrame with 'row ID' and 'price_doc' columns
output_df = pd.DataFrame({'row ID': test_data['row ID'], 'price_doc': test_predictions})

# Save the output to a new CSV file
output_df.to_csv('prediction_linearRegression.csv', index=False)

Number of Features Used: 2214
Mean Squared Error: 546195090437316.5


In [6]:
X = train_df.drop(['price_doc'], axis=1)
y = train_df['price_doc']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=[np.number]).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Variance-based filter
variance_filter = VarianceThreshold(threshold=0.10)  # Adjust the threshold as needed

# Create preprocessor with a variance-based filter and one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', variance_filter, numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ])

# Create pipeline with preprocessor, PCA, and Linear Regression
model = Pipeline([
    ('preprocessor', preprocessor),
    ('svd', TruncatedSVD(n_components=100)),  # Adjust the number of components as needed
    ('regressor', LinearRegression())
])


In [7]:
# Model Training
model.fit(X_train, y_train)

In [8]:
# Validation Prediction
y_pred_val = model.predict(X_val)

# Print the number of features used
num_features_used = model.named_steps['preprocessor'].transform(X_train).shape[1]
print(f'Number of Features Used: {num_features_used}')

# RMSE calculation for the validation set
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
print(f'Validation RMSE: {rmse_val}')

# Preparing the test set for final prediction
X_test = test_df.copy()

# # Feature selection and transformation for test set
# X_test_transformed = model.named_steps['preprocessor'].transform(X_test)
# X_test_selected = model.named_steps['feature_selection'].transform(X_test_transformed)

# Final Prediction for submission
predicted_price = model.predict(X_test)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'row ID': test_df['row ID'],
    'price_doc': predicted_price
})

# Save the DataFrame to a CSV file
submission_df.to_csv('prediction_linearRegression.csv', index=False)

Number of Features Used: 2208
Validation RMSE: 13774654.570801115


In [51]:
# Model Training
model.fit(X_train, y_train)

In [20]:
# Data Preparation
train_df.fillna(method='ffill', inplace=True)
test_df.fillna(method='ffill', inplace=True)

  train_df.fillna(method='ffill', inplace=True)
  test_df.fillna(method='ffill', inplace=True)


DUMMY ENCODING

In [47]:
# Convert categorical variables to dummy variables
train_df_dummies = pd.get_dummies(train_df)
test_df_dummies = pd.get_dummies(test_df)

In [48]:
# Ensure that both datasets have the same dummy variables
train_df_dummies, test_df_dummies = train_df_dummies.align(test_df_dummies, join='left', axis=1)
test_df_dummies.fillna(0, inplace=True)

In [49]:
# Split train data into features and target
# Drop 'price_doc' only if 'id' is not in the DataFrame
if 'id' in train_df_dummies.columns:
    X = train_df_dummies.drop(['price_doc', 'id'], axis=1)
else:
    X = train_df_dummies.drop(['price_doc'], axis=1)
y = train_df_dummies['price_doc']

LABEL ENCODING

In [4]:
# Label encoding for categorical variables
label_encoder = LabelEncoder()

for col in train_df.select_dtypes(include=['object']).columns:
    train_df[col] = label_encoder.fit_transform(train_df[col])
    test_df[col] = label_encoder.transform(test_df[col])

    # Extract features and target variable
X = train_df.drop(['price_doc'], axis=1)
y = train_df['price_doc']

In [5]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

STANDARD SCALER

In [6]:
# Normalize numerical variables
scaler = StandardScaler()
numerical_cols = X_train.select_dtypes(include=[np.number]).columns
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])

MIN MAX SCALER

In [51]:
# Normalize numerical variables using MinMaxScaler
scaler = MinMaxScaler()
numerical_cols = X_train.select_dtypes(include=[np.number]).columns
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])

# Apply the same scaler to validation and test sets
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
X_test = test_df_dummies.copy()
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

FEATURE SELECTION (FORWARD SELECTION)

In [None]:
# Forward Selection using mlxtend
feature_selector = SequentialFeatureSelector(LinearRegression(),
                                             k_features='best',
                                             forward=True,
                                             floating=False,
                                             scoring='neg_mean_squared_error',
                                             cv=5)

feature_selector.fit(X_train, y_train)

# Selected features
selected_features = list(X_train.columns[list(feature_selector.k_feature_idx_)])

In [13]:
# Model Training
model = LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)

Validation prediction when forward selection is applied

In [None]:
# Validation Prediction
X_val[selected_features] = scaler.transform(X_val[selected_features])  # Normalize validation set
y_pred_val = model.predict(X_val[selected_features])

In [64]:
# Validation Prediction
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])  # Normalize validation set
y_pred_val = model.predict(X_val)

In [65]:
# RMSE calculation for the validation set
import numpy as np
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
print(f'Validation RMSE: {rmse_val}')

Validation RMSE: 22891288.332712375


In [14]:
# Validation Prediction
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])  # Normalize validation set
y_pred_val = model.predict(X_val)

# RMSE calculation for the validation set
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
print(f'Validation RMSE: {rmse_val}')

# Preparing the test set for final prediction
X_test = test_df.copy()
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])  # Normalize test set

# Ensure the test set has the same columns as the training set
X_test = X_test.reindex(columns=X_train.columns).fillna(0)

# Final Prediction for submission
predicted_price = model.predict(X_test)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'row ID': test_df['row ID'],
    'price_doc': predicted_price
})

# Save the DataFrame to a CSV file
submission_df.to_csv('prediction_linearRegression.csv', index=False)

Validation RMSE: 622921576.0661032


In [None]:
# Preparing the test set for final prediction
X_test = test_df_dummies.copy()
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])  # Normalize test set

# Ensure the test set has the same columns as the training set
X_test = X_test.reindex(columns=X_train.columns).fillna(0)

# Final Prediction for submission
predicted_price = model.predict(X_test)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'row ID': test_df['row ID'],
    'price_doc': predicted_price
})

# Save the DataFrame to a CSV file
submission_df.to_csv('prediction_linearRegression.csv', index=False)

Final pred when using forward selection

In [None]:
# Preparing the test set for final prediction
X_test = test_df.copy()
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])  # Normalize test set

# Ensure the test set has the same columns as the training set
X_test = X_test[selected_features].fillna(0)

# Final Prediction for submission
predicted_price = model.predict(X_test)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'row ID': test_df['row ID'],
    'price_doc': predicted_price
})
# Save the DataFrame to a CSV file
submission_df.to_csv('prediction_linearRegression.csv', index=False)

In [41]:

# Preparing the test set for final prediction
# Drop 'id' only if it is in the DataFrame
if 'id' in test_df_dummies.columns:
    X_test = test_df_dummies.drop(['id'], axis=1)
else:
    X_test = test_df_dummies.copy()

# Ensure the test set has the same columns as the training set
X_test = X_test.reindex(columns=X_train.columns).fillna(0)

# Rest of your code for prediction and creating submission file...


In [42]:
# Final Prediction for submission
predicted_price = model.predict(X_test)

In [25]:
test_df.columns

Index(['row ID', 'full_sq', 'life_sq', 'floor', 'product_type', 'sub_area',
       'area_m', 'raion_popul', 'green_zone_part', 'indust_part',
       ...
       'cafe_count_5000_price_1500', 'cafe_count_5000_price_2500',
       'cafe_count_5000_price_4000', 'cafe_count_5000_price_high',
       'big_church_count_5000', 'church_count_5000', 'mosque_count_5000',
       'leisure_count_5000', 'sport_count_5000', 'market_count_5000'],
      dtype='object', length=272)

In [1]:
# Create submission DataFrame
submission_df = pd.DataFrame({
    'row ID': test_df['row ID'],
    'price_doc': predicted_price
})

# Save the DataFrame to a CSV file
submission_df.to_csv('C:\\Users\\AKE\\Desktop\\IMLChallenge02\\prediction_linearRegression.csv', index=False)

NameError: name 'pd' is not defined

In [2]:
del train_df
del test_df

NameError: name 'train_df' is not defined