# Baseline Linear Regression Model with NO Data Preprocessing

In [19]:
import os
import numpy as np
import pandas as pd
import datetime
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Paths for dataset
DATA_PATH = "dataset"
TRAIN_FILE = 'train.csv'
BUILDING_METADATA_FILE = 'building_metadata.csv'
WEATHER_TRAIN_FILE = 'weather_train.csv'

# Read the data files
train_data = pd.read_csv(os.path.join(DATA_PATH, TRAIN_FILE))
building_data = pd.read_csv(os.path.join(DATA_PATH, BUILDING_METADATA_FILE))
weather_data = pd.read_csv(os.path.join(DATA_PATH, WEATHER_TRAIN_FILE))

# Merge the datasets
train_data = train_data.merge(building_data, on='building_id', how='left')
train_data = train_data.merge(weather_data, on=['site_id', 'timestamp'], how='left')

def fill_nan_with_previous(data):
    """Fill NaN values with the previous non-NaN value in each column."""
    for column in data.columns:
        if data[column].dtype == np.float64 or data[column].dtype == np.float32:
            data[column].fillna(method='pad', inplace=True)
    return data

def reduce_memory_usage(data, use_float16=False):
    """Reduce memory usage by converting data types within a dataframe."""
    start_mem = data.memory_usage().sum() / 1024**2
    print(f"Initial memory usage: {start_mem:.2f} MB")

    for column in data.columns:
        if is_datetime(data[column]) or is_categorical_dtype(data[column]):
            continue
        col_type = data[column].dtype
        if col_type != object:
            c_min, c_max = data[column].min(), data[column].max()
            if str(col_type)[:3] == "int":
                data[column] = pd.to_numeric(data[column], downcast='integer')
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[column] = data[column].astype(np.float16)
                else:
                    data[column] = pd.to_numeric(data[column], downcast='float')

        else:
            data[column] = data[column].astype('category')

    end_mem = data.memory_usage().sum() / 1024**2
    print(f"Reduced memory usage: {end_mem:.2f} MB, decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%")
    return data

# Optimize memory usage
train_data = reduce_memory_usage(train_data, use_float16=True)
train_data_dropped = train_data.copy()

# Fill NaN values with the previous non-NaN value in each column
train_data = fill_nan_with_previous(train_data)



# Drop rows with NaN values
print(f"Number of records Before dropping: {train_data_dropped.shape[0]}")
train_data_dropped.dropna(inplace=True)
print(f"Number of records After dropping: {train_data_dropped.shape[0]}")

# Convert timestamp to UNIX and drop unnecessary columns
train_data['timestamp'] = train_data['timestamp'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').timestamp())
train_data.drop(columns=['primary_use', 'year_built', 'floor_count', 'air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed'], inplace=True)
train_data_dropped['timestamp'] = train_data_dropped['timestamp'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').timestamp())
train_data_dropped.drop(columns=['primary_use', 'year_built', 'floor_count', 'air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed'], inplace=True)

# Prepare target variable
target_variable = np.log1p(train_data["meter_reading"])
target_variable_drop = np.log1p(train_data_dropped["meter_reading"])
train_data_dropped.drop(columns=["meter_reading"], inplace=True)
train_data.drop(columns=["meter_reading"], inplace=True)

Initial memory usage: 2467.79 MB


  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):
  if is_datetime(data[column]) or is_categorical_dtype(data[column]):


Reduced memory usage: 713.66 MB, decreased by 71.1%


  data[column].fillna(method='pad', inplace=True)


Number of records Before dropping: 20216100
Number of records After dropping: 321728


# Linear Regression Baseline with Datacleaning and No Data Preprocessing
Filling NAN values with last value.

In [20]:
# K-fold linear regression for filled data
kf = KFold(n_splits=5, shuffle=True)
rmse_scores = []

for train_idx, test_idx in kf.split(train_data):
    X_train, X_test = train_data.iloc[train_idx], train_data.iloc[test_idx]
    y_train, y_test = target_variable.iloc[train_idx], target_variable.iloc[test_idx]

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
    print(f"Fold RMSE: {rmse}")

print(f"Average RMSE: {np.mean(rmse_scores)}")

Fold RMSE: 1.9777923212738855
Fold RMSE: 1.9763479763448124
Fold RMSE: 1.9784587539393268
Fold RMSE: 1.9776286575058237
Fold RMSE: 1.9784522755747833
Average RMSE: 1.9777359969277264


# Linear Regression Baseline with Datacleaning and No Data Preprocessing
Dropping NAN values

Number of records Before dropping: 20216100

Number of records After dropping: 321728

In [22]:
# K-fold linear regression for dropped data
kf = KFold(n_splits=5, shuffle=True)
rmse_scores = []

for train_idx, test_idx in kf.split(train_data_dropped):
    X_train, X_test = train_data_dropped.iloc[train_idx], train_data_dropped.iloc[test_idx]
    y_train, y_test = target_variable_drop.iloc[train_idx], target_variable_drop.iloc[test_idx]

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
    print(f"Fold RMSE: {rmse}")

print(f"Average RMSE: {np.mean(rmse_scores)}")


Fold RMSE: 1.2740955691508031
Fold RMSE: 1.2787556994143219
Fold RMSE: 1.275769019550566
Fold RMSE: 1.2737226984807124
Fold RMSE: 1.2748829638527512
Average RMSE: 1.275445190089831
