In [13]:
import pandas as pd

In [15]:
import numpy as np

In [17]:
# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [19]:
train_data

Unnamed: 0,LotArea,OverallQual,YearBuilt,GrLivArea,FullBath,BedroomAbvGr,SalePrice
0,7849,3,2044,1898,3,1,288448
1,8009,8,2059,850,3,5,411569
2,8940,9,2080,1934,3,4,359252
3,11786,1,2009,948,2,3,263051
4,11766,3,2036,2442,3,3,118565
...,...,...,...,...,...,...,...
95,11244,9,2070,2483,3,3,338072
96,14608,1,2016,2248,2,2,340724
97,14075,5,2031,3505,3,4,296955
98,12664,6,2031,3052,3,4,275109


In [20]:
test_data

Unnamed: 0,LotArea,OverallQual,YearBuilt,GrLivArea,FullBath,BedroomAbvGr
0,8920,6,2035,1118,3,5
1,8505,4,2050,3844,2,2
2,12349,1,2008,1379,2,4
3,14299,9,2078,1349,2,3
4,14709,5,2009,2659,2,3
...,...,...,...,...,...,...
95,9793,7,2073,3431,3,4
96,10858,1,2019,1308,3,5
97,14457,7,2052,2238,2,4
98,8484,7,2059,2086,2,5


In [21]:
# Explore the data
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
print("Missing values in the trained data:", train_data.isnull().sum().sort_values(ascending = False))

Train data shape: (100, 7)
Test data shape: (100, 6)
Missing values in the trained data: LotArea         0
OverallQual     0
YearBuilt       0
GrLivArea       0
FullBath        0
BedroomAbvGr    0
SalePrice       0
dtype: int64


In [22]:
# Drop irrelevant features (e.g., 'Id') if they exist
if 'Id' in train_data.columns:
    train_data.drop(columns=['Id'], inplace=True)

# Save test data IDs for final submission if 'Id' exists
if 'Id' in test_data.columns:
    test_data_ids = test_data['Id']  # Store test IDs separately
    test_data.drop(columns=['Id'], inplace=True)
else:
    test_data_ids = None  # Handle case where 'Id' is missing


In [23]:
# Fill missing values
# For numerical columns, use the median
train_data.fillna(train_data.median(), inplace = True)
test_data.fillna(test_data.median(), inplace = True)

In [29]:
# One-hot encode categorical features
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data, drop_first=True)

In [30]:
# Align train and test data to have the same features, excluding SalePrice
train_data, test_data = train_data.align(test_data, join = 'left', axis = 1)

#This ensures that both the train and test datasets have the same columns (features) after one-hot encoding.

In [33]:
# Ensure the target variable 'SalePrice' is not part of the test data
test_data = test_data.drop(columns=['SalePrice'], errors='ignore')

In [35]:
# Separate features (X) and target variable (y)
x = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']                    

In [36]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

In [38]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
test_data = scaler.transform(test_data)

NameError: name 'X_train' is not defined

### **Feature Scaling with `StandardScaler`**
```python
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
test_data = scaler.transform(test_data)
```
This code **scales the numerical features** in `X_train`, `X_val`, and `test_data` using **Standardization (Z-score normalization)**.

---

### **Why Feature Scaling?**
Machine learning models (especially gradient-based ones like **Linear Regression, Logistic Regression, SVMs, and Neural Networks**) perform better when numerical features have **similar scales**. Otherwise, features with larger values can dominate learning.  

**StandardScaler** ensures that:
- Each feature has a **mean of 0** and a **standard deviation of 1**.
- All numerical features contribute **equally** to model training.

---

### **Breaking Down the Code**
#### **Step 1: Create a `StandardScaler` object**
```python
scaler = StandardScaler()
```
- `StandardScaler()` is from `sklearn.preprocessing`.
- It computes the **mean** and **standard deviation** of each feature.

#### **Step 2: Fit and Transform `X_train`**
```python
X_train = scaler.fit_transform(X_train)
```
- `fit_transform(X_train)` does two things:
  1. **Computes mean and standard deviation** of `X_train`.
  2. **Applies transformation**:  
     \[
     X_{\text{scaled}} = \frac{X - \mu}{\sigma}
     \]
     Where:
     - \( X \) = original feature value  
     - \( \mu \) = mean of feature  
     - \( \sigma \) = standard deviation  

#### **Step 3: Transform `X_val` and `test_data`**
```python
X_val = scaler.transform(X_val)
test_data = scaler.transform(test_data)
```
- We **only transform** `X_val` and `test_data`, using the **same mean and standard deviation from `X_train`**.
- We **do not use `.fit_transform()`** here because:
  - **Fitting on validation/test data would cause data leakage.**
  - We need to apply the same scaling as `X_train` for consistency.

---

### **Example Before and After Scaling**
#### **Original Data (`X_train`)**
| Feature  | Value |
|----------|-------|
| Age      | 50    |
| Salary   | 100000 |
| Height   | 175   |

#### **After Standard Scaling (`X_train`)**
| Feature  | Scaled Value |
|----------|-------------|
| Age      | 0.12        |
| Salary   | 1.56        |
| Height   | -0.43       |

Now, all features are on the **same scale**.

---

### **Key Takeaways**
✅ **Feature scaling is important** for ML models that rely on gradient-based optimization.  
✅ `StandardScaler()` **scales numerical features** by making them have a mean of `0` and standard deviation of `1`.  
✅ `fit_transform(X_train)` **computes and applies scaling** based on training data.  
✅ `transform(X_val)` and `transform(test_data)` **apply the same scaling** without recomputing, preventing data leakage.  

This step **improves model performance** and **speeds up convergence**! 🚀

In [39]:
# Train a Linear Regression model

from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

In [41]:
# Train a Decision Tree Regressor
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)


NameError: name 'X_train' is not defined

### **Training a Decision Tree Regressor**
```python
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)
```
This code trains a **Decision Tree Regressor** to predict a target variable `y_train` based on features in `X_train`.

---

### **Step-by-Step Explanation**
#### **1️⃣ Create a Decision Tree Regressor**
```python
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
```
- `DecisionTreeRegressor()` is from `sklearn.tree`.
- It creates a decision tree model that learns patterns in the data to make predictions.

**Parameters:**
- `max_depth=5`:  
  ✅ Limits the depth of the tree to **5 levels** to prevent overfitting.  
  ✅ A deeper tree captures more details but may overfit.  
  ✅ A shallow tree generalizes better but might underfit.
  
- `random_state=42`:  
  ✅ Ensures reproducibility by setting a fixed random seed.

---

#### **2️⃣ Train the Model**
```python
dt_model.fit(X_train, y_train)
```
- **Fitting** means the model learns patterns from `X_train` (features) and `y_train` (target).
- It recursively splits data into **smaller regions** based on the best feature-value splits.
- The model minimizes the error (e.g., Mean Squared Error) at each split.

---

### **How a Decision Tree Works?**
1. **Select the Best Feature to Split On**  
   - The model picks a feature that **best separates the data** (based on minimizing variance).
   
2. **Split Data Recursively**  
   - The dataset is split into branches based on feature values.
   - This process continues until a **stopping criterion** (like `max_depth=5`) is met.

3. **Make Predictions**  
   - Each leaf node contains the **average target value** of the samples in that region.

---

### **Example: Predicting House Prices 🏡**
#### **Training Data (`X_train` & `y_train`)**
| Square Feet | Bedrooms | Price (y_train) |
|------------|---------|----------------|
| 1200       | 2       | 250,000        |
| 1800       | 3       | 350,000        |
| 2200       | 4       | 450,000        |

#### **How the Decision Tree Learns?**
1. **Splits data** at `Square Feet ≤ 1800`
   - Left branch: Small houses → Average Price = 250,000
   - Right branch: Larger houses → Further splits

2. **Splits further** based on `Bedrooms`
   - More branches refine price predictions.

#### **Final Tree (if `max_depth=5`)**
```
         Square Feet ≤ 1800?
        /                  \
   Yes (250K)         No (Bedrooms ≤ 3?)
                     /            \
                Yes (350K)     No (450K)
```
Now, if we give a **new house (2000 sq ft, 3 beds)**, the tree predicts **₹350,000**.

---

### **Key Takeaways**
✅ `DecisionTreeRegressor` **learns patterns** by splitting data based on feature values.  
✅ `max_depth=5` **limits the complexity**, preventing overfitting.  
✅ `.fit(X_train, y_train)` trains the model to make predictions.  
✅ **Used for regression problems**, like predicting house prices, stock values, or sales revenue.

🚀 Now, you can use `dt_model.predict(X_val)` to test the model!

In [44]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Train Decision Tree Regressor
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(x_train, y_train)

# Define evaluation function
def evaluate_model(model, x, y_true, model_name):
    y_pred = model.predict(x)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} - MSE: {mse:.2f}, R²: {r2:.2f}")
    return y_pred

# Evaluate models
print("\nModel Evaluation on Validation Set:")
lr_predictions = evaluate_model(lr_model, x_val, y_val, "Linear Regression")
dt_predictions = evaluate_model(dt_model, x_val, y_val, "Decision Tree")



Model Evaluation on Validation Set:
Linear Regression - MSE: 26848543386.19, R²: -0.28
Decision Tree - MSE: 38754344244.01, R²: -0.85


Let's break down the code **line by line** to understand how it works:

---

## **Importing Required Libraries**
```python
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
```
- `DecisionTreeRegressor` → Creates and trains a **Decision Tree model** for regression tasks.
- `mean_squared_error` → Measures how far the predicted values are from the actual values.
- `r2_score` → Evaluates how well the model explains the variance in the data.
- `StandardScaler` → Used for **feature scaling** (though it's not used in this code snippet).

---

## **Training a Decision Tree Regressor**
```python
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(x_train, y_train)
```
- Creates a **Decision Tree Regressor model**.
- `max_depth=5` → Limits the depth of the tree to **prevent overfitting**.
- `random_state=42` → Ensures **reproducibility** by making the random splits the same each time.
- `.fit(x_train, y_train)` → Trains the model using the **training features (`x_train`)** and **target variable (`y_train`)**.

---

## **Defining an Evaluation Function**
```python
def evaluate_model(model, x, y_true, model_name):
    y_pred = model.predict(x)  # Predict target values
    mse = mean_squared_error(y_true, y_pred)  # Compute Mean Squared Error
    r2 = r2_score(y_true, y_pred)  # Compute R² Score
    
    print(f"{model_name} - MSE: {mse:.2f}, R²: {r2:.2f}")  # Print evaluation metrics
    
    return y_pred  # Return predicted values
```
- **Purpose:** Evaluates a trained model on a given dataset (`x`).
- `model.predict(x)` → Uses the trained model to make predictions.
- `mean_squared_error(y_true, y_pred)` → Calculates the **Mean Squared Error (MSE)**:
  - Measures the **average squared difference** between actual and predicted values.
  - Lower MSE = **Better model**.
- `r2_score(y_true, y_pred)` → Calculates the **R² score**:
  - Ranges from `-∞ to 1` (closer to `1` is better).
  - Represents the proportion of variance explained by the model.
- `print(f"{model_name} - MSE: {mse:.2f}, R²: {r2:.2f}")` → Displays **MSE and R² score**.
- `return y_pred` → Returns predicted values for further use.

---

## **Evaluating Models**
```python
print("\nModel Evaluation on Validation Set:")
lr_predictions = evaluate_model(lr_model, x_val, y_val, "Linear Regression")
dt_predictions = evaluate_model(dt_model, x_val, y_val, "Decision Tree")
```
- **Prints a heading** to indicate model evaluation.
- Calls `evaluate_model()` **for Linear Regression (`lr_model`)**:
  - **Error:** `lr_model` is not defined! Ensure you trained a `LinearRegression()` model before using it.
- Calls `evaluate_model()` **for Decision Tree (`dt_model`)**:
  - Evaluates the trained Decision Tree on **validation data (`x_val, y_val`)**.
  - Prints **MSE and R² score** for performance comparison.

---

### **Example Output (Hypothetical)**
```
Model Evaluation on Validation Set:
Linear Regression - MSE: 5000.23, R²: 0.85
Decision Tree - MSE: 4500.76, R²: 0.89
```
- **Lower MSE is better** (Decision Tree performed slightly better).
- **Higher R² is better** (Decision Tree explains 89% of variance, better than Linear Regression).

---

### **Key Fixes & Next Steps**
✅ Ensure `lr_model` is trained before evaluation:
```python
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
lr_model.fit(x_train, y_train)
```
✅ Make sure `x_train`, `x_val`, `y_train`, and `y_val` **exist and are correctly preprocessed**.

🚀 **Now your code should run correctly!**

In [46]:
# Use the better model (Decision Tree in this case) to predict house prices for the test set
final_predictions = dt_model.predict(test_data)

In [50]:

# Save the predictions to a CSV file
submission = pd.DataFrame({'Id': test_data_ids, 'SalePrice': final_predictions})
submission.to_csv('submission.csv', index=False)

print("\nPredictions saved to 'submission.csv'")


Predictions saved to 'submission.csv'


In [51]:
import pickle

# Save Decision Tree model
with open('dt_model.pkl', 'wb') as file:
    pickle.dump(dt_model, file)

# Save Linear Regression model
with open('lr_model.pkl', 'wb') as file:
    pickle.dump(lr_model, file)

# Ensure scaler exists
if 'scaler' in globals():
    with open("scaler.pkl", "wb") as file:
        pickle.dump(scaler, file)
else:
    print("Error: scaler is not defined. Ensure StandardScaler() was created and fitted before saving.")


In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Drop irrelevant features (e.g., 'Id')
if 'Id' in train_data.columns:
    train_data.drop(columns=['Id'], inplace=True)

if 'Id' in test_data.columns:
    test_data_ids = test_data['Id']
    test_data.drop(columns=['Id'], inplace=True)
else:
    test_data_ids = None

# Fill missing values with median
train_data.fillna(train_data.median(), inplace=True)
test_data.fillna(test_data.median(), inplace=True)

# One-hot encode categorical features
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data, drop_first=True)

# Align train and test data
train_data, test_data = train_data.align(test_data, join='left', axis=1)

# Ensure 'SalePrice' is not in test data
test_data = test_data.drop(columns=['SalePrice'], errors='ignore')

# Separate features and target variable
X = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
test_data = scaler.transform(test_data)

# Train a Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Train a Decision Tree Regressor
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)


In [54]:
from sklearn.preprocessing import StandardScaler



In [59]:
print(scaler)


StandardScaler()


In [60]:
from sklearn.preprocessing import StandardScaler

# Initialize and fit StandardScaler again if not defined
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
test_data = scaler.transform(test_data)

# Save the scaler
import pickle
with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)




In [61]:
import pandas as pd

df = pd.read_csv("train.csv")
print(df.head())  # View first few rows
print(df.columns)  # View all column names


   LotArea  OverallQual  YearBuilt  GrLivArea  FullBath  BedroomAbvGr  \
0     7849            3       2044       1898         3             1   
1     8009            8       2059        850         3             5   
2     8940            9       2080       1934         3             4   
3    11786            1       2009        948         2             3   
4    11766            3       2036       2442         3             3   

   SalePrice  
0     288448  
1     411569  
2     359252  
3     263051  
4     118565  
Index(['LotArea', 'OverallQual', 'YearBuilt', 'GrLivArea', 'FullBath',
       'BedroomAbvGr', 'SalePrice'],
      dtype='object')


In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Explore the data
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
print("Missing values in the trained data:", train_data.isnull().sum().sort_values(ascending=False))

# Drop irrelevant features (e.g., 'Id') if they exist
if 'Id' in train_data.columns:
    train_data.drop(columns=['Id'], inplace=True)

if 'Id' in test_data.columns:
    test_data_ids = test_data['Id']  # Store test IDs separately
    test_data.drop(columns=['Id'], inplace=True)
else:
    test_data_ids = None  # Handle case where 'Id' is missing

# Fill missing values
train_data.fillna(train_data.median(), inplace=True)
test_data.fillna(test_data.median(), inplace=True)

# One-hot encode categorical features
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data, drop_first=True)

# Align train and test data to have the same features
train_data, test_data = train_data.align(test_data, join='left', axis=1)

test_data = test_data.drop(columns=['SalePrice'], errors='ignore')

# Separate features (X) and target variable (y)
x = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

# Split the data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
test_data = scaler.transform(test_data)

# Train a Linear Regression model
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

# Train a Decision Tree Regressor
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(x_train, y_train)

# Define evaluation function
def evaluate_model(model, x, y_true, model_name):
    y_pred = model.predict(x)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} - MSE: {mse:.2f}, R²: {r2:.2f}")
    return y_pred

# Evaluate models
print("\nModel Evaluation on Validation Set:")
lr_predictions = evaluate_model(lr_model, x_val, y_val, "Linear Regression")
dt_predictions = evaluate_model(dt_model, x_val, y_val, "Decision Tree")

# Use the better model (Decision Tree in this case) to predict house prices for the test set
final_predictions = dt_model.predict(test_data)

# Save the predictions to a CSV file
submission = pd.DataFrame({'Id': test_data_ids, 'SalePrice': final_predictions})
submission.to_csv('submission.csv', index=False)

print("\nPredictions saved to 'submission.csv'")


Train data shape: (100, 7)
Test data shape: (100, 6)
Missing values in the trained data: LotArea         0
OverallQual     0
YearBuilt       0
GrLivArea       0
FullBath        0
BedroomAbvGr    0
SalePrice       0
dtype: int64

Model Evaluation on Validation Set:
Linear Regression - MSE: 26848543386.19, R²: -0.28
Decision Tree - MSE: 38754344244.01, R²: -0.85

Predictions saved to 'submission.csv'


In [5]:
train_data["YearBuilt"].describe()


count      10.000000
mean     1973.500000
std        33.440661
min      1915.000000
25%      1947.500000
50%      1984.500000
75%      2000.750000
max      2004.000000
Name: YearBuilt, dtype: float64

In [11]:
train_data["YearBuilt"].value_counts().head(10)


YearBuilt
2003    1
1976    1
2001    1
1915    1
2000    1
1993    1
2004    1
1973    1
1931    1
1939    1
Name: count, dtype: int64