In [1]:
import pandas as pd


In [2]:
data = pd.read_csv('sample.csv')

In [3]:
data

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1000
1,FDW14,OUT017,1000
2,NCN55,OUT010,1000
3,FDQ58,OUT017,1000
4,FDY38,OUT027,1000
...,...,...,...
5676,FDB58,OUT046,1000
5677,FDD47,OUT018,1000
5678,NCO17,OUT045,1000
5679,FDJ26,OUT017,1000


In [4]:
data.columns

Index(['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'], dtype='object')

In [5]:
data.info

<bound method DataFrame.info of      Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0              FDW58            OUT049               1000
1              FDW14            OUT017               1000
2              NCN55            OUT010               1000
3              FDQ58            OUT017               1000
4              FDY38            OUT027               1000
...              ...               ...                ...
5676           FDB58            OUT046               1000
5677           FDD47            OUT018               1000
5678           NCO17            OUT045               1000
5679           FDJ26            OUT017               1000
5680           FDU37            OUT045               1000

[5681 rows x 3 columns]>

In [6]:
print(data.isna().sum())

Item_Identifier      0
Outlet_Identifier    0
Item_Outlet_Sales    0
dtype: int64


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Data preprocessing

In [8]:

# Here, we convert categorical data (Item_Identifier and Outlet_Identifier) into numerical values
data = pd.get_dummies(data, columns=['Item_Identifier', 'Outlet_Identifier'], drop_first=True)


In [9]:
data

Unnamed: 0,Item_Outlet_Sales,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,Item_Identifier_DRB13,Item_Identifier_DRB24,Item_Identifier_DRB25,Item_Identifier_DRB48,Item_Identifier_DRC01,Item_Identifier_DRC12,...,Item_Identifier_NCZ54,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,1000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1000,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5676,1000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5677,1000,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5678,1000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5679,1000,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [10]:
# Split the data into features and target
X = data.drop('Item_Outlet_Sales', axis=1)
y = data['Item_Outlet_Sales']

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# Create a linear regression model
model = LinearRegression()

In [13]:
model

LinearRegression()

In [14]:
# Fit the model on the training data
model.fit(X_train, y_train)

LinearRegression()

In [15]:
# Make predictions on the test data
y_pred = model.predict(X_test)


In [16]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 0.0


In [17]:
# Create a DataFrame for new data with the same one-hot encoded columns as the training data
new_data = pd.DataFrame(columns=X.columns)

# Set the values for the features in new_data
new_data['Item_Identifier_FDQ56'] = [1]  # Example input for Item_Identifier_FDQ56
new_data['Outlet_Identifier_OUT045'] = [1]  # Example input for Outlet_Identifier_OUT045

# Ensure that all other columns are set to 0 (or the appropriate values for your case)
for column in new_data.columns:
    if column not in ['Item_Identifier_FDQ56', 'Outlet_Identifier_OUT045']:
        new_data[column] = 0  # Set to 0 or the appropriate value for your data

# Predict sales for the new data
new_sales_prediction = model.predict(new_data)
print(f'Sales Prediction for New Data: {new_sales_prediction[0]}')


Sales Prediction for New Data: 1000.0


In [18]:
import pickle

# Saving a trained model to a file
with open('sample', 'wb') as model_file:
    pickle.dump(model, model_file)

# Loading a model from a file
with open('sample', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Make predictions with the loaded model
predictions = loaded_model.predict(new_data)


# Model evaluation

In [19]:
import pickle
from sklearn.model_selection import cross_val_score

# Load your dataset and define features (X) and target (y)
X = data.drop('Item_Outlet_Sales', axis=1)
y = data['Item_Outlet_Sales']

# Load your trained model using pickle
with open('sample', 'rb') as model_file:
    model = pickle.load(model_file)

# Perform cross-validation
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = (-(scores)) ** 0.5

# Calculate mean RMSE
mean_rmse = rmse_scores.mean()
print(f'Mean RMSE: {mean_rmse}')


Mean RMSE: 0.0
