In [99]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


# Read data

In [101]:
data = pd.read_csv('D:/SEM2/DSP/train.csv')

- Show head of data (first 5 rows)

In [103]:
data.head () 

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [156]:
X = data.drop(columns=["SalePrice"])  # Features
y = data["SalePrice"]  # Target variable

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Feature Selection

In [158]:
# Continuous Features
continuous_features = ["LotArea", "YearBuilt"]  # Example continuous features
# Categorical Features
categorical_features = ["Neighborhood", "BldgType"]  # Example categorical features

 Feature Processing

In [162]:
# Create persistent transformers
scaler = StandardScaler()
onehot = OneHotEncoder(handle_unknown='ignore')

# Fit transformers
X_train_continuous_scaled = scaler.fit(X_train[continuous_features])
X_train_categorical_encoded = onehot.fit(X_train[categorical_features])

# Transform the data
X_train_continuous_scaled = scaler.transform(X_train[continuous_features])
X_train_categorical_encoded = onehot.transform(X_train[categorical_features])
# Concatenate transformed features
X_train_processed = np.concatenate([X_train_continuous_scaled, X_train_categorical_encoded.toarray()], axis=1)

 Model Inferenece

In [168]:
#Train the model
model = LinearRegression()
model.fit(X_train_processed, y_train)

# Save the encoder objects
joblib.dump(scaler, 'C:/Users/ASUS/dsp-anmol-jadhav/models/scaler.joblib')
joblib.dump(onehot, 'C:/Users/ASUS/dsp-anmol-jadhav/models/Encoder.joblib')

# Load the encoder objects
loaded_scaler = joblib.load('C:/Users/ASUS/dsp-anmol-jadhav/models/scaler.joblib')
loaded_onehot = joblib.load('C:/Users/ASUS/dsp-anmol-jadhav/models/Encoder.joblib')
# Transform the test data
X_test_continuous_scaled = loaded_scaler.transform(X_test[continuous_features])
X_test_categorical_encoded = loaded_onehot.transform(X_test[categorical_features])
X_test_processed = np.concatenate([X_test_continuous_scaled, X_test_categorical_encoded.toarray()], axis=1)

# Predicting the house prices of the test data using the trained model
y_pred = model.predict(X_test_processed)

# Display the predicted prices
print("Predicted prices for the test data:")
print(y_pred)


Predicted prices for the test data:
[135794.25332878 318545.57470216 143490.76839424 134556.01397672
 232586.1546261  108233.92559488 220317.85245327 130896.41748433
  87345.45043966 141210.22259921 148051.10135966 150202.27974852
 129130.47207225 328115.10446675 193307.10453913 141017.42882002
 197435.20696208 145273.32018106 101645.40278421 210458.41112953
 136205.28949577 249890.91308842 209709.56312304 139884.87987529
 204386.12818702 131127.15992849 201219.43356442 160353.75545514
 194069.90136051 181553.11921168  99876.82085978 327744.72311758
 179159.78434361 143339.87552252 253911.86859511 153049.21255909
 135817.80699281 207191.60473505 320601.77449226 176639.60136106
 123562.61773121 193480.70150798 159621.05789008 335360.25704785
 135879.13784495  99049.85640943 150221.76871281 137819.83811145
 336386.02100295 149086.20728859 135817.80699281 158899.10559427
 113678.02751062 205898.11106952 143105.86003966 213701.29572352
 193087.79045954 196124.471966   150163.75112318 14067

In [65]:


    
Model Evaluation

In [170]:
# Evaluate the model
def compute_rmsle(y_true: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
    return round(rmsle, precision)

rmsle = compute_rmsle(y_test, y_pred)
print("Root-Mean-Squared-Error (RMSE) =", rmsle)

Root-Mean-Squared-Error (RMSE) = 0.27


In [None]:
Model Training

In [182]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

# Function to build the model
def build_model(data: pd.DataFrame) -> dict:
    X = data.drop(columns=["SalePrice"])  # Features
    y = data["SalePrice"]  # Target variable

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Continuous Features
    continuous_features = ["LotArea", "YearBuilt"]  
    # Categorical Features
    categorical_features = ["Neighborhood", "BldgType"]
    
    # Create and fit transformers
    scaler = StandardScaler()
    onehot = OneHotEncoder(handle_unknown='ignore')
    
    X_train_continuous_scaled = scaler.fit_transform(X_train[continuous_features])
    X_train_categorical_encoded = onehot.fit_transform(X_train[categorical_features])

    # Concatenate transformed features
    X_train_processed = np.concatenate([X_train_continuous_scaled, X_train_categorical_encoded.toarray()], axis=1)

    # Train the model
    model = LinearRegression()
    model.fit(X_train_processed, y_train)

    # Save transformers
    joblib.dump(scaler, 'C:/Users/ASUS/dsp-anmol-jadhav/models/scaler.joblib')
    joblib.dump(onehot, 'C:/Users/ASUS/dsp-anmol-jadhav/models/Encoder.joblib')
    joblib.dump(model, 'C:/Users/ASUS/dsp-anmol-jadhav/models/model.joblib')

    # Evaluate the model
    y_pred = model.predict(X_train_processed)
    rmsle = compute_rmsle(y_train, y_pred)
    
    return {'rmse': rmsle}

# Function to make predictions
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    # Load transformers
    scaler = joblib.load('C:/Users/ASUS/dsp-anmol-jadhav/models/scaler.joblib')
    onehot = joblib.load('C:/Users/ASUS/dsp-anmol-jadhav/models/encoder.joblib')
    
    # Transform input data
    continuous_features = ["LotArea", "YearBuilt"]
    categorical_features = ["Neighborhood", "BldgType"]
    X_test_continuous_scaled = scaler.transform(input_data[continuous_features])
    X_test_categorical_encoded = onehot.transform(input_data[categorical_features])
    X_test_processed = np.concatenate([X_test_continuous_scaled, X_test_categorical_encoded.toarray()], axis=1)
    
    # Load trained model
    loaded_model = joblib.load('C:/Users/ASUS/dsp-anmol-jadhav/models/model.joblib')

    # Make predictions
    predicted_prices = loaded_model.predict(X_test_processed)
    return predicted_prices

# Evaluation function
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

# Example usage:
if _name_ == "_main_":
    data = pd.read_csv('train.csv')
    model_performance = build_model(data)
    print("Model performance:", model_performance)

NameError: name '_name_' is not defined