In [1]:
import pandas as pd
import numpy as np

# load the training data
raw_train_data = pd.read_csv('my_train.csv')

for col in raw_train_data.columns:
    if pd.api.types.is_integer_dtype(raw_train_data[col]):
        raw_train_data[col] = raw_train_data[col].astype(float)

# convert the dtypes Series to a dictionary
data_types = raw_train_data.dtypes.apply(lambda x: x.name).to_dict()

# now load the development data with the same data types
raw_dev_data = pd.read_csv('my_dev.csv', dtype=data_types)

raw_train_data = raw_train_data.drop(raw_train_data.columns[0], axis=1)
raw_dev_data = raw_dev_data.drop(raw_dev_data.columns[0], axis=1)

print("Train Data Shape:", raw_train_data.shape)
print("Dev Data Shape:", raw_dev_data.shape)

raw_train_data.head()


Train Data Shape: (1314, 80)
Dev Data Shape: (146, 80)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,Inside,...,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0
1,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,FR2,...,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0
2,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,Inside,...,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0
3,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,Corner,...,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0
4,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,FR2,...,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0


In [2]:
!for i in `seq 2 80`; do cat my_train.csv | cut -d ',' -f $i | sort | uniq | wc -l; done | awk '{s+=$1-1} END {print s}'

7227


# Part 2: Naive Binarization

In [3]:
from sklearn.preprocessing import OneHotEncoder

train_data = raw_train_data.copy().astype(str)
dev_data = raw_dev_data.copy().astype(str)

X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_dev = dev_data.iloc[:, :-1]
y_dev = dev_data.iloc[:, -1]

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train)

# more preprocessing
X_train = encoder.transform(X_train)
X_dev = encoder.transform(X_dev)

# convert target to float and take log
y_train = np.log(y_train.astype(float))
y_dev = np.log(y_dev.astype(float))

# print shapes
print("Shape of X_train:", X_train.shape)
print("Shape of Y_train:", y_train.shape)
print("Shape of X_dev:", X_dev.shape)
print("Shape of Y_dev:", y_dev.shape)

print("Feature dimension size:", len(encoder.get_feature_names_out()))


Shape of X_train: (1314, 7226)
Shape of Y_train: (1314,)
Shape of X_dev: (146, 7226)
Shape of Y_dev: (146,)
Feature dimension size: 7226


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error

model = LinearRegression()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_dev = model.predict(X_dev)

In [5]:
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
dev_rmse = np.sqrt(mean_squared_error(y_dev, y_pred_dev))

print("Train RMSE:", train_rmse)
print("Dev RMSE:", dev_rmse)

Train RMSE: 8.138076860823108e-15
Dev RMSE: 0.15233237053005208


# Part 3: Smart Binarization

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

train_data = raw_train_data.copy()
dev_data = raw_dev_data.copy()

X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_dev = dev_data.iloc[:, :-1].astype(X_train.dtypes.to_dict())
y_dev = dev_data.iloc[:, -1]

# define the processors for numerical and categorical columns
cat_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
num_processor = MinMaxScaler(feature_range=(0, 2))


# identify categorical and numerical columns
cat_cols = X_train.select_dtypes(include=['object']).columns
num_cols = X_train.select_dtypes(exclude=['object']).columns

print("Number of categorical columns:", len(cat_cols))
print("Number of numerical columns:", len(num_cols))


# define transformers
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()) 
])


# create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# print shapes
print("Shape of X_train:", X_train.shape)
print("Shape of Y_train:", y_train.shape)
print("Shape of X_dev:", X_dev.shape)
print("Shape of Y_dev:", y_dev.shape)

print("Feature dimension size:", len(encoder.get_feature_names_out()))

Number of categorical columns: 43
Number of numerical columns: 36
Shape of X_train: (1314, 79)
Shape of Y_train: (1314,)
Shape of X_dev: (146, 79)
Shape of Y_dev: (146,)
Feature dimension size: 7226


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# create a pipeline that first preprocesses the data, then fits a linear model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])
# train the model
model.fit(X_train, np.log(y_train))  # Using log of y_train for RMSLE

y_pred_train = model.predict(X_train)   # predict on the training set
y_pred_dev = model.predict(X_dev)       # predict on the development set

# calculate RMSLE
train_rmsle = np.sqrt(mean_squared_error(np.log(y_train), y_pred_train))
dev_rmsle = np.sqrt(mean_squared_error(np.log(y_dev), y_pred_dev))
print("Train RMSLE:", train_rmsle)
print("Dev RMSLE:", dev_rmsle)

Train RMSLE: 0.09222022836049133
Dev RMSLE: 0.12409504198160849


# Part 4: Experimentation

In [8]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np


# Define a range of alpha values to test
alpha_values = np.logspace(-3, 3, 7)

# Dictionary to store RMSLE for each alpha
rmsle_scores = {}

for alpha in alpha_values:
    # Create a Ridge Regression model with the current alpha
    ridge_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', Ridge(alpha=alpha))
    ])

    # Train the model
    ridge_model.fit(X_train, np.log(y_train))  # Using log transformation for the target

    # Predict on the development set
    y_pred_dev = ridge_model.predict(X_dev)

    # Calculate RMSLE and store it
    rmsle = np.sqrt(mean_squared_error(np.log(y_dev), y_pred_dev))
    rmsle_scores[alpha] = rmsle
    print("Training model with alpha = {}, Dev RMSLE = {}".format(alpha, rmsle))

# Find the best alpha and its corresponding RMSLE
best_alpha = min(rmsle_scores, key=rmsle_scores.get)
best_rmsle = rmsle_scores[best_alpha]
print("Best alpha:", best_alpha)
print("Best RMSLE on dev set:", best_rmsle)



Training model with alpha = 0.001, Dev RMSLE = 0.12410816587281105
Training model with alpha = 0.01, Dev RMSLE = 0.12419736996477485
Training model with alpha = 0.1, Dev RMSLE = 0.12493386537196319
Training model with alpha = 1.0, Dev RMSLE = 0.12807766794586814
Training model with alpha = 10.0, Dev RMSLE = 0.12758433865586125
Training model with alpha = 100.0, Dev RMSLE = 0.12810140975489417
Training model with alpha = 1000.0, Dev RMSLE = 0.13867804740978126
Best alpha: 0.001
Best RMSLE on dev set: 0.12410816587281105


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# create a pipeline that first preprocesses the data, then fits a linear model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', Ridge(alpha=0.001))])
# train the model
model.fit(X_train, np.log(y_train))  # Using log of y_train for RMSLE

y_pred_train = model.predict(X_train)   # predict on the training set
y_pred_dev = model.predict(X_dev)       # predict on the development set

# calculate RMSLE
train_rmsle = np.sqrt(mean_squared_error(np.log(y_train), y_pred_train))
dev_rmsle = np.sqrt(mean_squared_error(np.log(y_dev), y_pred_dev))
print("Train RMSLE:", train_rmsle)
print("Dev RMSLE:", dev_rmsle)

Train RMSLE: 0.09224811532127092
Dev RMSLE: 0.12410816587281105
