In [81]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Read data

- Read data

In [56]:
# Location of the training CSV (absolute, machine-specific path).
train_csv_path = 'C:/Users/Ananya/Downloads/sem 2/DSP/train.csv'
data = pd.read_csv(train_csv_path)

- Show head of data (first 5 rows)

In [57]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Feature Selection


In [197]:

    # Column names used by the following cells, grouped by how they are pre-processed.
    continuous_features = ['LotArea', 'YearBuilt']  # numeric columns (fed to the StandardScaler below)
    categorical_features = ['Neighborhood', 'BldgType']  # label columns (fed to the OneHotEncoder below)
    target = 'SalePrice'  # name of the column intended as the prediction target
 

# Training

- Create X and y

In [198]:
# Split dataset into features and target.
# These are cell-level statements on purpose: X and y must exist in the
# notebook namespace for the train/test split right below to work on a
# freshly restarted kernel.
X = data[continuous_features + categorical_features]
y = data[target]

# Split into training and test sets (20% held out; fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



### Feature Processing

In [199]:

    # Define pre-processing for categorical features.
    # Dense output is needed because the encoded block is combined with
    # np.hstack below. The keyword that requests dense output was renamed from
    # `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was
    # removed in 1.4, so try the new spelling first and fall back to the old one.
    try:
        onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

    # Define pre-processing for continuous features
    scaler = StandardScaler()

    # Fit both transformers on the training split only, then transform it
    X_train_categorical = onehot.fit_transform(X_train[categorical_features])
    X_train_continuous = scaler.fit_transform(X_train[continuous_features])

    # Combine processed features (categorical columns first, then continuous)
    X_train_processed = np.hstack([X_train_categorical, X_train_continuous])

    



### Model Training and Inference

In [200]:
# Fit a linear regression on the processed training matrix
model = LinearRegression().fit(X_train_processed, y_train)

# Persist the fitted pre-processors in the current working directory
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(onehot, 'Encoder.joblib')

# Read the persisted pre-processors back (encoder first, then scaler)
loaded_onehot, loaded_scaler = joblib.load('Encoder.joblib'), joblib.load('scaler.joblib')

# Run the held-out split through the reloaded pre-processors
X_test_categorical = loaded_onehot.transform(X_test[categorical_features])
X_test_continuous = loaded_scaler.transform(X_test[continuous_features])

# Same column layout as the training matrix: categorical block, then continuous block
X_test_processed = np.hstack([X_test_categorical, X_test_continuous])

# Predict on the held-out split
y_pred = model.predict(X_test_processed)

# Show the header line followed by the predictions
print("Predicted prices for the test data:", y_pred, sep="\n")


Predicted prices for the test data:
[135616. 318528. 142976. 136768. 232448. 104064. 220992. 132800.  83072.
 141632. 146752. 150080. 128064. 327872. 193216. 141120. 197312. 145344.
 100608. 210816. 136000. 250112. 210176. 140032. 204800. 130624. 201088.
 159680. 193856. 181504. 101760. 327616. 181696. 143424. 254080. 153088.
 135872. 207616. 320576. 176192. 125632. 193344. 159680. 335168. 135744.
  98240. 150336. 137728. 336192. 149120. 135872. 158784. 109440. 205504.
 143296. 214144. 192960. 195840. 150848. 140736. 118912. 145088. 327104.
 257472. 234560. 131712. 157248. 227648. 173632. 173312. 153920. 140352.
 144448. 115392. 336448. 210816. 317824. 315328. 145728. 176064.  99840.
 101568. 151936. 149568. 149568. 126464. 250432. 204224. 143296. 214336.
 162368. 181696. 161728. 187520. 121408. 200896. 118208. 174016. 211584.
 195392. 161984. 207552. 166848. 120640. 266304. 144192. 155392. 317632.
 200192. 160192. 127488. 183488. 132672. 111808. 211904. 153408. 165504.
 157760. 128704


## Model Evaluation


In [201]:
# Score the test-set predictions with the Root Mean Squared Logarithmic Error
def compute_rmsle(y_true: np.ndarray, y_pred: np.ndarray, precision: int = 2)-> float:
    """Return the Root Mean Squared Logarithmic Error, rounded.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        precision: Number of decimal places kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_true, y_pred)``,
        rounded to ``precision`` decimals.

    Raises:
        Whatever ``mean_squared_log_error`` raises for invalid input
        (scikit-learn documents that the metric does not accept negative values).
    """
    rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    return round(rmsle, precision)

rmsle = compute_rmsle(y_test, y_pred)
# The metric is computed from log-errors, so label it as RMSLE rather than RMSE
print("Root-Mean-Squared-Log-Error (RMSLE) =", rmsle)





Root-Mean-Squared-Error (RMSE) = 0.27


# Model Training

In [205]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import os

# Define constants
# Column groups handed to build_model / make_predictions in the example usage below.
CONTINUOUS_FEATURES = ['LotArea', 'YearBuilt']
CATEGORICAL_FEATURES = ['Neighborhood', 'BldgType']
# Column name handed to build_model as `target`.
TARGET = 'SalePrice'
# Directory build_model writes the .joblib artifacts to and make_predictions reads them from.
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
MODEL_PATH = 'C:/Users/Ananya/dsp-ananya-gownivari-ravindrareddy/models/'

# Evaluation metric: Root Mean Squared Logarithmic Error (RMSLE)
def compute_rmsle(y_true: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the RMSLE between ``y_true`` and ``y_pred``, rounded to ``precision`` decimals."""
    mean_sq_log_err = mean_squared_log_error(y_true, y_pred)
    return round(np.sqrt(mean_sq_log_err), precision)

def _make_dense_onehot_encoder() -> OneHotEncoder:
    """Create a dense-output OneHotEncoder that ignores categories unseen during fit.

    The keyword requesting dense output was renamed from ``sparse`` to
    ``sparse_output`` in scikit-learn 1.2 and the old name was removed in 1.4,
    so the new spelling is tried first with a fallback for older releases.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def build_model(data: pd.DataFrame, continuous_features: list, categorical_features: list, target: str) -> dict:
    """Train a linear regression, persist it with its pre-processors, and score it.

    The data is split 80/20 (``random_state=42``). A StandardScaler and a
    OneHotEncoder are fitted on the training split only, a LinearRegression is
    trained on the combined features, and all three fitted objects are written
    to ``MODEL_PATH`` as ``scaler.joblib``, ``Encoder.joblib`` and
    ``model.joblib``.

    Args:
        data: Frame containing the feature columns and the target column.
        continuous_features: Names of the columns to standardise.
        categorical_features: Names of the columns to one-hot encode.
        target: Name of the column to predict.

    Returns:
        ``{'rmsle': <float>}`` with the RMSLE measured on the held-out 20% split.
    """
    # Split dataset into features and target
    X = data[continuous_features + categorical_features]
    y = data[target]

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the pre-processors on the training split only and transform it
    onehot = _make_dense_onehot_encoder()
    X_train_categorical = onehot.fit_transform(X_train[categorical_features])

    scaler = StandardScaler()
    X_train_continuous = scaler.fit_transform(X_train[continuous_features])

    # Combine processed features (categorical block first, then continuous);
    # make_predictions relies on the same column order.
    X_train_processed = np.hstack([X_train_categorical, X_train_continuous])

    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train_processed, y_train)

    # Save the model and preprocessors; create the target directory first so
    # the dumps do not fail when it does not exist yet.
    os.makedirs(MODEL_PATH, exist_ok=True)
    joblib.dump(scaler, os.path.join(MODEL_PATH, 'scaler.joblib'))
    joblib.dump(onehot, os.path.join(MODEL_PATH, 'Encoder.joblib'))
    joblib.dump(model, os.path.join(MODEL_PATH, 'model.joblib'))

    # Transform the test data with the already-fitted pre-processors
    X_test_categorical = onehot.transform(X_test[categorical_features])
    X_test_continuous = scaler.transform(X_test[continuous_features])
    X_test_processed = np.hstack([X_test_categorical, X_test_continuous])

    # Make predictions
    y_pred = model.predict(X_test_processed)

    # Compute performance metrics
    rmsle = compute_rmsle(y_test, y_pred)
    return {'rmsle': rmsle}

def make_predictions(input_data: pd.DataFrame, continuous_features: list, categorical_features: list) -> np.ndarray:
    """Predict with the artifacts stored under ``MODEL_PATH``.

    Loads ``model.joblib``, ``Encoder.joblib`` and ``scaler.joblib``, applies
    the encoder to ``categorical_features`` and the scaler to
    ``continuous_features`` of ``input_data``, stacks the two blocks
    (categorical first) and returns the model's predictions.
    """
    def _load_artifact(filename):
        # Every artifact lives directly under MODEL_PATH
        return joblib.load(os.path.join(MODEL_PATH, filename))

    # Load order: model, encoder, scaler
    regressor = _load_artifact('model.joblib')
    encoder = _load_artifact('Encoder.joblib')
    standardiser = _load_artifact('scaler.joblib')

    # Pre-process each column group with its fitted transformer
    encoded_block = encoder.transform(input_data[categorical_features])
    scaled_block = standardiser.transform(input_data[continuous_features])

    # Stack the blocks (categorical, then continuous) and predict
    return regressor.predict(np.hstack([encoded_block, scaled_block]))

# Example Usage
if __name__ == "__main__":
    # Train on the labelled CSV and report the hold-out score
    train_data = pd.read_csv('C:/Users/Ananya/Downloads/sem 2/DSP/train.csv')
    performance = build_model(
        data=train_data,
        continuous_features=CONTINUOUS_FEATURES,
        categorical_features=CATEGORICAL_FEATURES,
        target=TARGET,
    )
    print(f'Model Performance: {performance}')

    # Score the second CSV with the artifacts build_model just saved
    new_data = pd.read_csv('C:/Users/Ananya/Downloads/sem 2/DSP/test.csv')
    predictions = make_predictions(
        input_data=new_data,
        continuous_features=CONTINUOUS_FEATURES,
        categorical_features=CATEGORICAL_FEATURES,
    )

    # Attach the predictions as a new column and show only that column
    new_data['PredictedSalePrice'] = predictions
    print(new_data.loc[:, ['PredictedSalePrice']])




Model Performance: {'rmsle': 0.27}
      PredictedSalePrice
0               152384.0
1               152896.0
2               197824.0
3               193792.0
4               262080.0
...                  ...
1454             83072.0
1455            102656.0
1456            155456.0
1457            174272.0
1458            174208.0

[1459 rows x 1 columns]
