In [1]:
# Step 1: Load the dataset

In [2]:
import os

import pandas as pd

# Load the dataset.
# The CSV location can be overridden with the HOUSING_DATA_PATH environment
# variable, so the notebook runs on machines where the default path below does
# not exist; without the variable the original location is used unchanged.
file_path = os.environ.get('HOUSING_DATA_PATH', 'D:/task5/HousingData.csv')
df = pd.read_csv(file_path)

# Display the first few rows
df.head()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [3]:
#Step 2: Data Cleaning, Feature Engineering, and Splitting the Data

In [4]:
# Print the column labels of the loaded frame to confirm which fields are available
print(df.columns)


Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Show the column labels of the frame
column_labels = df.columns
print(column_labels)

# Report how many missing entries each column contains
missing_per_column = df.isna().sum()
print(missing_per_column)


Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')
CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64


In [6]:
# Discard rows that have no target value
rows_with_target = df.dropna(subset=['MEDV'])

# Fill the remaining gaps in every column with that column's mean.
# NOTE(review): the means are computed over all remaining rows, i.e. before any
# train/test split is made later in the notebook.
column_means = rows_with_target.mean()
df = rows_with_target.fillna(column_means)

In [7]:
# Feature engineering: standardise the features without leaking test-set statistics
X = df.drop('MEDV', axis=1)  # every column except 'MEDV' is a feature
y = df['MEDV']               # 'MEDV' is the target variable

# Split FIRST so the held-out rows never influence the scaler's mean/variance.
# The split is made on the same number of rows with the same seed as before,
# so the train/test partition itself is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both partitions.
# The original row index is kept so the features stay aligned with y_train / y_test.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Kept so any later cell that refers to X_scaled still works; it is the whole
# feature matrix transformed with the training-set statistics.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

In [8]:
#Step 3: Train the Machine Learning Model

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; the fixed seed keeps repeated runs identical
model = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit the forest on the training partition
model.fit(X=X_train, y=y_train)


In [10]:
#Step 4: Evaluate the Model (Evaluate the model using the updated target variable.)

In [11]:

import numpy as np
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# Predict the target for the held-out test features
y_pred = model.predict(X=X_test)


In [12]:
# Root mean squared error: square root of the mean squared test error
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

RMSE: 2.871192074970351


In [13]:
# Mean absolute error between the test targets and the predictions
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'MAE: {mae}')

MAE: 2.067480392156863


In [14]:
# Coefficient of determination of the predictions on the test partition
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R²: {r2}')

R²: 0.8875859995747116


In [15]:
# Step 5: Save the model

In [16]:
import pickle

from sklearn.pipeline import Pipeline

# The forest was trained on StandardScaler output, but the /predict endpoint
# later in this notebook feeds client-supplied values straight into whatever
# object is unpickled. Persisting the forest alone would therefore score
# unscaled inputs with a model fitted on scaled ones. Bundle the already-fitted
# scaler and forest so that `.predict(raw_features)` applies the same
# standardisation that was used during training.
serving_pipeline = Pipeline([('scaler', scaler), ('model', model)])

# Save the fitted preprocessing + model bundle to a file
model_path = 'D:/task5/model.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(serving_pipeline, file)


In [21]:
#!python app.py

In [18]:
#Step 6: Create an API with Flask

In [22]:
pip install Flask





In [20]:
import os
import pickle

import numpy as np
from flask import Flask, request, jsonify

app = Flask(__name__)

# Load the trained model.
# The location can be overridden with the MODEL_PATH environment variable; the
# default is the original relative path. Note that the training step of this
# notebook writes to 'D:/task5/model.pkl', so the working directory (or
# MODEL_PATH) must point at that file.
# SECURITY: pickle.load executes arbitrary code embedded in the file - only
# load model files that you created yourself.
MODEL_PATH = os.environ.get('MODEL_PATH', 'model.pkl')
with open(MODEL_PATH, 'rb') as file:
    model = pickle.load(file)


@app.route('/predict', methods=['POST'])
def predict():
    """Predict a price for one observation.

    Expects a JSON body of the form ``{"features": [...]}``. The values are
    converted to floats and reshaped into a single row before being passed to
    ``model.predict``.

    Returns:
        ``{"price": <float>}`` on success, or ``{"error": <message>}`` with
        HTTP status 400 when the body is not valid JSON, has no ``features``
        entry, contains non-numeric / non-finite values, or has a different
        number of values than the loaded model reports in ``n_features_in_``.
    """
    # silent=True yields None instead of raising, so every malformed request
    # is answered with the same JSON error shape below.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict) or 'features' not in data:
        return jsonify({'error': "Request body must be JSON with a 'features' list."}), 400

    try:
        features = np.asarray(data['features'], dtype=float).reshape(1, -1)
    except (TypeError, ValueError):
        return jsonify({'error': "'features' must be a non-empty list of numbers."}), 400

    if not np.isfinite(features).all():
        return jsonify({'error': "'features' must not contain null, NaN or infinite values."}), 400

    # Only enforced when the loaded object exposes its expected input width.
    expected_count = getattr(model, 'n_features_in_', None)
    if expected_count is not None and features.shape[1] != expected_count:
        return jsonify({
            'error': f"Expected {expected_count} features, got {features.shape[1]}."
        }), 400

    # Predict the price
    prediction = model.predict(features)

    # Convert the numpy scalar to a built-in float so it is always JSON serialisable
    return jsonify({'price': float(prediction[0])})


if __name__ == '__main__':
    # use_reloader=False: the auto-reloader re-spawns the process, which does
    # not work from inside a notebook kernel.
    # WARNING: debug=True enables the interactive debugger; never expose a
    # server started this way outside the local machine.
    app.run(debug=True, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat
