In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

In [None]:
# Load the training data from Train.csv (path is relative to the working directory)
# and preview its first rows.
sales_train = pd.read_csv('Train.csv')
sales_train.head()

In [None]:
# Load the test data from Test.csv (path is relative to the working directory)
# and preview its first rows.
sales_test = pd.read_csv('Test.csv')
sales_test.head()

In [None]:
# Dimensions of the training frame as (rows, columns).
sales_train.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
sales_train.info()

In [None]:
# checking missing values

In [None]:
# Number of missing values in each column of the training frame.
sales_train.isnull().sum()

In [None]:
# Missing values in numerical columns are filled with the column mean.
# Missing values in categorical columns are filled with the column mode (most frequent value).

In [None]:
# Mean Item_Weight over the non-missing rows (pandas skips NaN by default).
sales_train['Item_Weight'].mean()

In [None]:
# Fill missing Item_Weight values with the training-set mean.
# - The result is assigned back to the column instead of calling
#   `fillna(..., inplace=True)` on `df[col]`: that form mutates an intermediate
#   Series, which pandas 2.x warns about and which has no effect on the frame
#   once Copy-on-Write is enabled.
# - The test frame is filled with the statistic learned from the training data
#   so both frames receive exactly the same preprocessing.
item_weight_mean = sales_train['Item_Weight'].mean()
sales_train['Item_Weight'] = sales_train['Item_Weight'].fillna(item_weight_mean)
sales_test['Item_Weight'] = sales_test['Item_Weight'].fillna(item_weight_mean)

In [None]:
# Re-check the per-column missing counts after the Item_Weight imputation.
sales_train.isnull().sum()

In [None]:
# Replace the missing values of the "Outlet_Size" column with its mode

In [None]:
# Frequency of each Outlet_Size category; NaN rows are not counted by default.
sales_train['Outlet_Size'].value_counts()

In [None]:
# Fill missing Outlet_Size values with the most frequent training category.
# - `mode()` returns a Series (there can be ties); `[0]` takes its first entry.
# - The result is assigned back to the column instead of calling
#   `fillna(..., inplace=True)` on `df[col]`: that form mutates an intermediate
#   Series, which pandas 2.x warns about and which has no effect on the frame
#   once Copy-on-Write is enabled.
# - The test frame is filled with the category learned from the training data
#   so both frames receive exactly the same preprocessing.
outlet_size_mode = sales_train['Outlet_Size'].mode()[0]
sales_train['Outlet_Size'] = sales_train['Outlet_Size'].fillna(outlet_size_mode)
sales_test['Outlet_Size'] = sales_test['Outlet_Size'].fillna(outlet_size_mode)

In [None]:
# Display the training frame after imputation (pandas truncates long frames in notebook output).
sales_train

In [None]:
# Display the test frame after imputation (pandas truncates long frames in notebook output).
sales_test

In [None]:
# Re-check the per-column missing counts after the Outlet_Size imputation.
sales_train.isnull().sum()

In [None]:
# plots according to numerical data

In [None]:
# Distribution of Item_Weight: histogram with a kernel-density overlay.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is the
# documented replacement and keeps the y-axis on the density scale.
sns.set()
plt.figure(figsize=(6,6))
sns.histplot(sales_train['Item_Weight'], kde=True, stat='density')
plt.show()

In [None]:
# Distribution of Item_Visibility: histogram with a kernel-density overlay.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is the
# documented replacement and keeps the y-axis on the density scale.
sns.set()
plt.figure(figsize=(6,6))
sns.histplot(sales_train['Item_Visibility'], kde=True, stat='density')
plt.show()

In [None]:
# countplot

In [None]:
# Number of rows per Outlet_Establishment_Year.
# The column is passed by keyword: since seaborn 0.12 the only positional
# parameter of `countplot` is `data`, so a bare Series is no longer read as `x`.
sns.set()
plt.figure(figsize=(6,6))
sns.countplot(x='Outlet_Establishment_Year', data=sales_train)
plt.show()

In [None]:
# plots for categorical data

In [None]:
# Number of rows per Item_Fat_Content label (plotted before the labels are normalized).
# The column is passed by keyword: since seaborn 0.12 the only positional
# parameter of `countplot` is `data`, so a bare Series is no longer read as `x`.
sns.set()
plt.figure(figsize=(6,6))
sns.countplot(x='Item_Fat_Content', data=sales_train)
plt.show()

In [None]:
# Number of rows per Item_Type; the figure is extra wide so the category labels fit.
# The column is passed by keyword: since seaborn 0.12 the only positional
# parameter of `countplot` is `data`, so a bare Series is no longer read as `x`.
sns.set()
plt.figure(figsize=(30,6))
sns.countplot(x='Item_Type', data=sales_train)
plt.show()

In [None]:
# Preview the first rows of the training frame before preprocessing.
sales_train.head()

In [None]:
# Data Preprocessing

In [None]:
# Distinct Item_Fat_Content labels and their frequencies before normalization.
sales_train['Item_Fat_Content'].value_counts()

In [None]:
# Collapse the alias spellings of Item_Fat_Content onto two canonical labels.
# Only the Item_Fat_Content column is touched; the frame is modified in place.
fat_content_aliases = {
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular',
}
sales_train.replace({'Item_Fat_Content': fat_content_aliases}, inplace=True)

In [None]:
# Re-check the Item_Fat_Content label frequencies after the alias replacement.
sales_train['Item_Fat_Content'].value_counts()

In [None]:
# Label Encoding  

In [None]:
# LabelEncoder maps each distinct label of a column to an integer code.
encoder = LabelEncoder()

In [None]:
# Integer-encode every categorical column of the training frame.
#
# One LabelEncoder is fitted per column and kept in `label_encoders`, keyed by
# column name. Refitting a single shared encoder would overwrite its learned
# classes on every call, leaving no way to invert the codes or to apply the same
# mapping to another frame. The resulting codes are the same as with a shared
# encoder, and `encoder` still ends up bound to the Outlet_Type encoder.
categorical_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]

label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    sales_train[column] = encoder.fit_transform(sales_train[column])
    label_encoders[column] = encoder

In [None]:
# Preview the first rows of the training frame after label encoding.
sales_train.head()

In [None]:
# Splitting features and target

In [None]:
# Separate the regression target (Item_Outlet_Sales) from the predictor columns.
Y = sales_train['Item_Outlet_Sales']
X = sales_train.drop(columns=['Item_Outlet_Sales'])

In [None]:
# Preview the feature matrix with the notebook's rich table display
# instead of printing the whole frame as plain text.
X.head()

In [None]:
# Preview the first target values instead of printing the whole Series.
Y.head()

In [None]:
# train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Shapes of the full feature matrix, the training part and the held-out part, in that order.
print(*(part.shape for part in (X, X_train, X_test)))

In [None]:
# Model Training

In [None]:
# XGBoost Regressor

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor with every hyperparameter left at the library default.
regressor = XGBRegressor()

In [None]:
regressor.fit(X_train,X_train)

In [None]:
# Model Evaluation

In [None]:
# prediction on training data

In [None]:
training_data_prediction = regressor.predict(X_train)

In [None]:
# R squared value
r2_train = metrics.r2_score(X_train, training_data_prediction)

In [None]:
print('R sqaured value =', r2_train)

In [None]:
# prediction on test data

In [None]:
testing_data_prediction = regressor.predict(X_test)

In [None]:
# R squared value
r2_test = metrics.r2_score(X_test, testing_data_prediction)

In [None]:
print('R sqaured value =', r2_test)