# Pricing Model using Machine Learning (Colab-ready)

This notebook reproduces the steps described in the internship report and demonstrates two pipelines for pricing: a classifier that predicts whether an offer is accepted (`offer_status`) and a regressor that predicts `price`. The dataset is synthetically generated for demonstration purposes; to use real data, replace the synthetic-generation cell with a load of your own CSV.


In [ ]:

# Optional setup for Google Colab: uncomment the install line below to pull in any
# missing packages. The path is a placeholder; point it at your own requirements.txt
# (presumably on a mounted Google Drive, given the /content/drive prefix; confirm).
# The %pip magic is preferred over !pip so the install targets the running kernel's environment.
# %pip install -r /content/drive/MyDrive/path/to/requirements.txt
print('Notebook ready.')


In [ ]:

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Seed NumPy's legacy global RNG so the np.random.* draws in the data-generation cell
# are repeatable on a fresh Restart & Run All. Re-running an individual cell later
# continues the same random stream and therefore yields different draws.
np.random.seed(42)
print('Libraries loaded.')


In [ ]:

# ------------------ GENERATE_SYNTHETIC_DATA ------------------
# Builds a toy pricing dataset with one row per product. Columns produced:
#   product_id, technical_score (clipped to 10-100), market_segment (A/B/C),
#   competitor_price, cost, demand_index, seasonal_index,
#   price (continuous target), offer_status (0/1 target)
# NOTE: the np.random.* calls must stay in this order, otherwise a given seed
# produces a different dataset.

n = 2000
product_id = np.arange(1, n + 1)

# --- independent feature draws ---
technical_score = np.random.normal(60, 15, n).clip(10, 100)
market_segment = np.random.choice(['A', 'B', 'C'], size=n, p=[0.4, 0.35, 0.25])
competitor_price = np.clip(np.random.normal(50000, 15000, n), 10000, 150000)
cost = competitor_price * np.random.uniform(0.6, 0.9, n)  # cost is 60-90% of the competitor price
demand_index = 100 * np.random.beta(2, 5, n)

# One sine cycle (0 .. 6.28, roughly 2*pi) across the rows gives a mild +/-20% seasonality.
season_phase = np.linspace(0, 6.28, n)
seasonal_index = 1 + 0.2 * np.sin(season_phase)

# --- "recommended price": a random markup on cost, adjusted by quality, demand and season ---
base_price = cost * np.random.uniform(1.05, 1.5, n)
tech_uplift = 1 + (technical_score - 50) / 400
demand_uplift = 1 + demand_index / 500
raw_price = base_price * tech_uplift * demand_uplift * seasonal_index
price = raw_price.round(-2)  # nearest 100 for realism

# --- offer status: 1 = accepted, 0 = rejected; logistic rule plus sampling noise ---
accept_logit = (technical_score - 50) / 10 + (competitor_price - price) / 20000 + (demand_index - 20) / 20
prob_accept = 1 / (1 + np.exp(-accept_logit))
offer_status = (np.random.rand(n) < prob_accept).astype(int)

# Monetary columns are stored as integers; the index-like features stay float.
synthetic_columns = {
    'product_id': product_id,
    'technical_score': technical_score,
    'market_segment': market_segment,
    'competitor_price': competitor_price.astype(int),
    'cost': cost.astype(int),
    'demand_index': demand_index,
    'seasonal_index': seasonal_index,
    'price': price.astype(int),
    'offer_status': offer_status,
}
df = pd.DataFrame(synthetic_columns)

df.head()


In [ ]:

# Basic EDA
# Quick look at the frame: dimensions plus summary statistics for every column.
print('Shape:', df.shape)
display(df.describe(include='all'))

# Histogram of the regression target.
fig_price, ax_price = plt.subplots(figsize=(8, 4))
ax_price.hist(df['price'], bins=40)
ax_price.set_title('Price distribution (synthetic)')
ax_price.set_xlabel('price')
ax_price.set_ylabel('count')
plt.show()

# Class balance of the classification target (bars appear in value_counts order).
status_counts = df['offer_status'].value_counts()
fig_status, ax_status = plt.subplots(figsize=(6, 3))
ax_status.bar(status_counts.index.astype(str), status_counts.values)
ax_status.set_title('Offer status counts (0=rejected,1=accepted)')
plt.show()


In [ ]:

# Preprocessing
# Features exclude the identifier and both targets; each task gets its own target series.
X = df.drop(['product_id', 'offer_status', 'price'], axis=1)
y_clf = df.loc[:, 'offer_status']
y_reg = df.loc[:, 'price']

numeric_features = [
    'technical_score',
    'competitor_price',
    'cost',
    'demand_index',
    'seasonal_index',
]
categorical_features = ['market_segment']

# Numeric columns are standardised; the categorical column is one-hot encoded, and
# categories unseen at fit time are ignored at predict time.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Classification pipeline
offer_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
clf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', offer_classifier),
])

# Stratified 80/20 split keeps the accept/reject ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_clf,
    test_size=0.2,
    random_state=42,
    stratify=y_clf,
)

# fit() returns the pipeline itself, so the prediction can be chained directly.
y_pred = clf_pipeline.fit(X_train, y_train).predict(X_test)

clf_accuracy = accuracy_score(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)
print('Classification accuracy:', clf_accuracy)
print('\nClassification report:\n', clf_report)


In [ ]:

# Regression: predict price
# Same feature columns as the classifier (identifier and both targets dropped);
# the target here is the continuous price.
X_r = df.drop(columns=['product_id','price','offer_status'])
y_r = df['price']

Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_r, y_r, test_size=0.2, random_state=42)

# Give the regressor its own unfitted copy of the preprocessor. Pipeline.fit() fits its
# steps in place without cloning them, so sharing the `preprocessor` instance with
# clf_pipeline would silently refit the classifier's preprocessing on this
# (different, non-stratified) training split.
reg_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                               ('reg', RandomForestRegressor(n_estimators=150, random_state=42))])

reg_pipeline.fit(Xr_train, yr_train)
yr_pred = reg_pipeline.predict(Xr_test)
print('Regression R2:', r2_score(yr_test, yr_pred))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
print('RMSE:', float(np.sqrt(mean_squared_error(yr_test, yr_pred))))
print('MAPE:', mean_absolute_percentage_error(yr_test, yr_pred))


In [ ]:

# Create artifacts folder and save models
# Both fitted pipelines (preprocessing + model) are persisted with joblib so they can be
# reloaded and applied to raw feature frames.
os.makedirs('artifacts', exist_ok=True)
dump(clf_pipeline, 'artifacts/offer_status_classifier.joblib')
dump(reg_pipeline, 'artifacts/price_regressor.joblib')

# Export up to 200 example rows. The size is capped at len(df) because DataFrame.sample
# raises ValueError when asked for more rows than exist (possible once the synthetic data
# is replaced by a smaller real dataset). random_state is pinned so re-running this cell
# writes the same sample.
sample_size = min(200, len(df))
df.sample(n=sample_size, random_state=42).to_csv('artifacts/sample_synthetic_data.csv', index=False)
print('Saved models and a sample CSV in artifacts/.')



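# If you have a real dataset:
# Replace the synthetic generation block with:
# df = pd.read_csv('path_to_your_file.csv')
# and make sure the columns this notebook expects are present (product_id, technical_score,
# market_segment, competitor_price, cost, demand_index, seasonal_index, price, offer_status),
# or adapt numeric_features / categorical_features and the dropped columns accordingly.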
