<a href="https://colab.research.google.com/github/BadgujarHarshal/Demo/blob/main/Lab_Exam_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from itertools import product
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from flask import Flask, request, jsonify
import joblib
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Daily transactions. The date column holds day.month.year text, so it is
# parsed into real timestamps as part of the load.
sales = pd.read_csv('sales_train.csv').assign(
    date=lambda raw: pd.to_datetime(raw['date'], format='%d.%m.%Y')
)

# Lookup tables and the prediction/submission templates.
items = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops = pd.read_csv('shops.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
# Keep only plausible transactions: a positive price below 100000 and a
# non-negative daily count below 1000.
# NOTE(review): rows with a negative item_cnt_day are dropped here - presumably
# these are returns; confirm that excluding them from monthly totals is intended.
valid_rows = (
    sales['item_price'].gt(0)
    & sales['item_price'].lt(100000)
    & sales['item_cnt_day'].ge(0)
    & sales['item_cnt_day'].lt(1000)
)
# Take an explicit copy so the column assignment below writes to an independent
# frame instead of a slice of the unfiltered data.
sales = sales.loc[valid_rows].copy()

# Roll daily counts up to one row per (month, shop, item).
sales['month'] = sales['date'].dt.to_period('M')
monthly_sales = (
    sales.groupby(['month', 'shop_id', 'item_id'])['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

# 'YYYY-MM' strings sort chronologically; sort=True makes the 0-based month
# numbering independent of the row order of monthly_sales.
monthly_sales['month'] = monthly_sales['month'].astype(str)
monthly_sales['date_block_num'] = pd.factorize(monthly_sales['month'], sort=True)[0]

In [4]:
# For every month, pair each shop that sold something with each item that was
# sold that month, so combinations without sales become explicit rows.
grid = []
for block_num, block_sales in monthly_sales.groupby('date_block_num', sort=False):
    shops_in_block = block_sales['shop_id'].unique()
    items_in_block = block_sales['item_id'].unique()
    grid.extend(product([block_num], shops_in_block, items_in_block))

grid_df = pd.DataFrame(grid, columns=['date_block_num', 'shop_id', 'item_id'])

# Grid rows with no matching sales get 0 in every merged column.
data = grid_df.merge(
    monthly_sales,
    how='left',
    on=['date_block_num', 'shop_id', 'item_id'],
).fillna(0)

In [5]:
def lag_feature(df, lags, col):
    """Append lagged copies of ``col`` to ``df``.

    For each ``lag`` in ``lags`` a column named ``<col>_lag_<lag>`` is added. It
    holds the value ``col`` had for the same shop_id/item_id ``lag`` blocks
    earlier, and is NaN where no such earlier row exists.

    Args:
        df: frame with ``date_block_num``, ``shop_id``, ``item_id`` and ``col``.
        lags: iterable of integer offsets, in units of ``date_block_num``.
        col: name of the column to lag.

    Returns:
        A new frame with one extra column per lag; the input is not modified.
    """
    join_keys = ['date_block_num', 'shop_id', 'item_id']
    for lag in lags:
        lagged_name = col + '_lag_' + str(lag)
        # Moving the block number forward by `lag` lines each past value up
        # with the later row that should receive it.
        shifted = (
            df[join_keys + [col]]
            .assign(date_block_num=lambda frame: frame['date_block_num'] + lag)
            .rename(columns={col: lagged_name})
        )
        df = df.merge(shifted, on=join_keys, how='left')
    return df

# Sales counts from one, two and three months back; a missing history is
# treated as zero.
data = lag_feature(data, [1, 2, 3], 'item_cnt_month').fillna(0)

In [6]:
# Mean selling price per shop, item and month.
price_data = sales.groupby(['shop_id', 'item_id', 'month'])['item_price'].mean().reset_index()

# Number the months chronologically. The grouped frame is ordered by shop_id and
# item_id, so months do not first appear in calendar order; factorize without
# sort=True would number them by first appearance and attach prices to the wrong
# month. 'YYYY-MM' strings sort chronologically.
price_data['date_block_num'] = pd.factorize(price_data['month'].astype(str), sort=True)[0]

data = pd.merge(data, price_data[['shop_id', 'item_id', 'date_block_num', 'item_price']],
                on=['shop_id', 'item_id', 'date_block_num'], how='left')

# Assign the filled column back: an in-place fill on the selected column is not
# guaranteed to reach `data` (it does not under pandas Copy-on-Write).
# NOTE(review): after this fill, item_price is non-zero only for rows that had a
# sale in the same month, so as a model input it may leak the target - confirm
# whether only the lagged price should be used as a feature.
data['item_price'] = data['item_price'].fillna(0)

data = lag_feature(data, [1], 'item_price')
# Zero-fill a missing previous-month price, as is done for the count lags and
# for the prediction frame, so training and prediction encode "no history" alike.
data['item_price_lag_1'] = data['item_price_lag_1'].fillna(0)

In [7]:
# The month label is not a model feature; remove it if it is still present.
data = data.drop(columns='month', errors='ignore')

In [8]:
# Time-based split: blocks before 33 are used for fitting, block 33 is held out.
train_rows = data['date_block_num'] < 33
holdout_rows = data['date_block_num'] == 33

X = data.loc[train_rows].drop(columns=['item_cnt_month'])
y = data.loc[train_rows, 'item_cnt_month']
X_test = data.loc[holdout_rows].drop(columns=['item_cnt_month'])
y_test = data.loc[holdout_rows, 'item_cnt_month']

In [9]:
# Sanity check: both feature frames must expose the same columns in the same order.
columns_match = X.columns.equals(X_test.columns)
if not columns_match:
    print("Columns in X and X_test do not match AFTER splitting:")
    for label, frame in (("Columns in X:", X), ("Columns in X_test:", X_test)):
        print(label, frame.columns.tolist())

In [10]:
# Gradient-boosted regressor settings, collected in one place.
xgb_params = {
    'max_depth': 8,
    'n_estimators': 100,
    'learning_rate': 0.1,
    'objective': 'reg:squarederror',
}
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training blocks, then score the held-out block.
model.fit(X, y)
preds = model.predict(X_test)

In [11]:
# Compare the number of hold-out predictions with the number of rows in the test file.
print(f"preds shape: {preds.shape}", f"test shape: {test.shape}", sep="\n")

preds shape: (238084,)
test shape: (214200, 3)


In [12]:
# Look up the block-33 feature row for every (shop_id, item_id) pair in the test file.
# NOTE(review): presumably the test file targets the month after the last training
# block; if so these rows should be built as a new block from lags of block 33
# rather than reusing block 33's own features - confirm.
test['date_block_num'] = 33
predict_data = pd.merge(
    test,
    data.drop('item_cnt_month', axis=1),
    on=['shop_id', 'item_id', 'date_block_num'],
    how='left',
)
# A left merge on unique grid keys must keep exactly one row per test row;
# otherwise the predictions cannot be written back onto `test`.
assert len(predict_data) == len(test), "merge changed the number of test rows"

# Select the training columns in training order. Pairs absent from the grid have
# no features and are filled with 0. Filling while building the frame avoids an
# in-place edit of a derived slice.
# Note: X_test and preds from the hold-out evaluation are overwritten here, so
# y_test no longer lines up with them.
X_test = predict_data[X.columns].fillna(0)
preds = model.predict(X_test)

In [13]:
# Bound the predictions to the range [0, 20] and write the ID/prediction pairs.
clipped_preds = preds.clip(0, 20)
test['item_cnt_month'] = clipped_preds
test.to_csv('submission.csv', columns=['ID', 'item_cnt_month'], index=False)

In [14]:
# Save the ordered feature names as a one-column CSV (its header is the default
# column label), then the fitted model itself.
feature_names = X.columns.to_series()
feature_names.to_csv('model_features.txt', index=False)
joblib.dump(model, 'xgb_model.pkl')

In [15]:
from flask import Flask, request, jsonify
import pandas as pd
import joblib
import xgboost as xgb

# Restore the artifacts written by the training step.
# joblib.load unpickles the file, so it should only ever be pointed at a model
# file produced by a trusted source.
model = joblib.load("xgb_model.pkl")
feature_table = pd.read_csv('model_features.txt')
features = feature_table['0'].tolist()

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """Score one feature row posted as a JSON object.

    The body should map the names listed in ``features`` to numeric values.
    Keys that are not in ``features`` are ignored; features missing from the
    body reach the model as missing values.

    Returns:
        A JSON response ``{'predicted_item_cnt_month': <float>}``, or a JSON
        ``{'error': ...}`` response with status 400 when the body is not a JSON
        object or contains a non-numeric feature value.
    """
    # silent=True yields None for an absent or malformed JSON body instead of
    # raising, so the rejection below covers every bad-payload case.
    input_data = request.get_json(silent=True)
    if not isinstance(input_data, dict):
        return jsonify({'error': 'Request body must be a JSON object of feature values.'}), 400

    try:
        # Build a single row in training column order and require numeric values.
        df = pd.DataFrame([input_data], columns=features).apply(pd.to_numeric)
    except (TypeError, ValueError):
        return jsonify({'error': 'Feature values must be numeric.'}), 400

    prediction = model.predict(df)[0]
    return jsonify({'predicted_item_cnt_month': float(prediction)})