In [41]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [42]:
# Load the training split from its CSV file
train_csv_path = '/content/Train.csv'
train_df = pd.read_csv(train_csv_path)

# Preview the first rows of the training frame
train_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [43]:
# Load the test split from its CSV file
test_csv_path = '/content/Test.csv'
test_df = pd.read_csv(test_csv_path)

# Preview the first rows of the test frame
test_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [44]:
# Count the missing entries in every column of the training frame
train_missing_counts = train_df.isna().sum()
print(train_missing_counts)

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64


In [45]:
# Count the missing entries in every column of the test frame
test_missing_counts = test_df.isna().sum()
print(test_missing_counts)

Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64


In [46]:
def clean_column(df, col_name):
    """Cap outliers and fill missing values of a numeric column, in place.

    Values outside the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) are capped
    to the nearest fence, then the remaining NaNs are replaced by the mean of
    the capped column. The quartiles and the mean are computed from ``df``
    itself. After cleaning, the per-column null counts of ``df`` are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Its ``col_name`` column is overwritten.
    col_name : str
        Name of the numeric column to clean.

    Returns
    -------
    None
    """
    q1 = df[col_name].quantile(0.25)
    q3 = df[col_name].quantile(0.75)
    iqr = q3 - q1
    upper_limit = q3 + (1.5 * iqr)
    lower_limit = q1 - (1.5 * iqr)

    # Vectorised capping; NaNs pass through clip() untouched, exactly as they
    # failed both comparisons in a row-by-row check.
    capped = df[col_name].clip(lower=lower_limit, upper=upper_limit)

    # Assign the result back to the frame. Calling replace(..., inplace=True)
    # on df[col_name] acts on an intermediate selection and does not update
    # df when pandas copy-on-write is active.
    df[col_name] = capped.fillna(capped.mean())

    print(df.isnull().sum())

In [47]:
# Clean Item_Weight in the training frame first, then in the test frame
for frame in (train_df, test_df):
    clean_column(frame, 'Item_Weight')

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64
Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64


In [48]:
# Fill missing Outlet_Size values with the most frequent size in the training frame.
# The filled Series is assigned back to the frame: fillna(inplace=True) on a
# column selection acts on an intermediate object and does not update
# train_df when pandas copy-on-write is active.
outlet_size_mode = train_df['Outlet_Size'].mode()[0]
train_df['Outlet_Size'] = train_df['Outlet_Size'].fillna(outlet_size_mode)
print(train_df.isnull().sum())

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64


In [49]:
# Fill missing Outlet_Size values with the most frequent size in the test frame.
# The filled Series is assigned back to the frame: fillna(inplace=True) on a
# column selection acts on an intermediate object and does not update
# test_df when pandas copy-on-write is active.
outlet_size_mode = test_df['Outlet_Size'].mode()[0]
test_df['Outlet_Size'] = test_df['Outlet_Size'].fillna(outlet_size_mode)
print(test_df.isnull().sum())

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64


In [50]:
# Report how many distinct values each object-typed (categorical) training column holds
for col_name, col_values in train_df.items():
    if col_values.dtype == 'object':
        # dropna=False counts a missing value as its own category, like len(unique())
        print(f"{col_name} has {col_values.nunique(dropna=False)} categories")

Item_Identifier has 1559 categories
Item_Fat_Content has 5 categories
Item_Type has 16 categories
Outlet_Identifier has 10 categories
Outlet_Size has 3 categories
Outlet_Location_Type has 3 categories
Outlet_Type has 4 categories


In [51]:
# Report how many distinct values each object-typed (categorical) test column holds
for col_name, col_values in test_df.items():
    if col_values.dtype == 'object':
        # dropna=False counts a missing value as its own category, like len(unique())
        print(f"{col_name} has {col_values.nunique(dropna=False)} categories")

Item_Identifier has 1543 categories
Item_Fat_Content has 5 categories
Item_Type has 16 categories
Outlet_Identifier has 10 categories
Outlet_Size has 3 categories
Outlet_Location_Type has 3 categories
Outlet_Type has 4 categories


In [52]:
def one_hot_encode(df, col_name):
    """Return a new frame in which ``col_name`` is replaced by indicator columns.

    One indicator column is created per distinct value of ``col_name``, named
    ``<col_name>_<value>``, and placed after the remaining columns. The frame
    passed in is not modified.
    """
    # Build the indicator columns from the categorical column
    indicator_cols = pd.get_dummies(df[col_name], prefix=col_name)

    # Keep every other column and append the indicators to the right
    remaining_cols = df.drop(columns=[col_name])
    return pd.concat([remaining_cols, indicator_cols], axis=1)

In [53]:
# One-hot encode every categorical training column, in this fixed order
for categorical_col in (
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
):
    train_df = one_hot_encode(train_df, categorical_col)

In [54]:
# Preview the first rows of the training frame after one-hot encoding
train_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Fat_Content_LF,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Fat_Content_low fat,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,FDA15,9.3,0.016047,249.8092,1999,3735.138,0,1,0,0,...,0,1,0,1,0,0,0,1,0,0
1,DRC01,5.92,0.019278,48.2692,2009,443.4228,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0
2,FDN15,17.5,0.01676,141.618,1999,2097.27,0,1,0,0,...,0,1,0,1,0,0,0,1,0,0
3,FDX07,19.2,0.0,182.095,1998,732.38,0,0,1,0,...,0,1,0,0,0,1,1,0,0,0
4,NCD19,8.93,0.0,53.8614,1987,994.7052,0,1,0,0,...,1,0,0,0,0,1,0,1,0,0


In [55]:
# One-hot encode every categorical test column, in this fixed order
for categorical_col in (
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
):
    test_df = one_hot_encode(test_df, categorical_col)

In [56]:
# Preview the first rows of the test frame after one-hot encoding
test_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content_LF,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Fat_Content_low fat,Item_Fat_Content_reg,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,FDW58,20.75,0.007565,107.8622,1999,0,1,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,FDW14,8.3,0.038428,87.3198,2007,0,0,0,0,1,...,0,1,0,0,1,0,0,1,0,0
2,NCN55,14.6,0.099575,241.7538,1998,0,1,0,0,0,...,0,1,0,0,0,1,1,0,0,0
3,FDQ58,7.315,0.015388,155.034,2007,0,1,0,0,0,...,0,1,0,0,1,0,0,1,0,0
4,FDY38,12.695633,0.118599,234.23,1985,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1


In [None]:
# Correlation heatmap of the training columns.
# train_df still holds the string column Item_Identifier, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so keep only numeric and
# boolean (one-hot) columns before computing correlations.
numeric_train_df = train_df.select_dtypes(include=['number', 'bool'])

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(numeric_train_df.corr(), ax=ax)
ax.set_title('Correlation between training columns')
plt.show()

In [None]:
#feature selection

# one_hot_encode() replaced each categorical column with indicator columns
# named '<column>_<category>', so the original categorical names no longer
# exist in train_df. Select the numeric columns by name and the encoded
# columns by prefix instead.
numeric_features = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']

# NOTE(review): Outlet_Type was not part of the original selection; add
# 'Outlet_Type_' here if it should be used as a feature.
encoded_prefixes = (
    'Item_Fat_Content_',
    'Item_Type_',
    'Outlet_Identifier_',
    'Outlet_Size_',
    'Outlet_Location_Type_',
)

feature_columns = [
    col for col in train_df.columns
    if col in numeric_features or col.startswith(encoded_prefixes)
]

X = train_df[feature_columns]
y = train_df['Item_Outlet_Sales']

In [None]:
# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Validate the model on the held-out 20% of the training data
y_pred_val = model.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
# Take the square root directly: the `squared=False` argument of
# mean_squared_error has been removed from recent scikit-learn releases.
rmse_val = np.sqrt(mse_val)
r2_val = r2_score(y_val, y_pred_val)

print(f"Validation MSE:  {mse_val:.3f}")
print(f"Validation RMSE: {rmse_val:.3f}")
print(f"Validation R^2:  {r2_val:.3f}")

# Predict on the test set.
# The test frame must expose exactly the columns the model was fitted on, in
# the same order; an indicator column absent from the test data is filled
# with 0, and columns not used for training (e.g. Item_Identifier) are dropped.
X_test = test_df.reindex(columns=X.columns, fill_value=0)
y_pred_test = model.predict(X_test)

# Test.csv has no Item_Outlet_Sales column, so no error metrics can be
# computed for these predictions. Scoring them against the training target
# `y` would compare unrelated rows and fail on the length mismatch.

In [None]:
#random forest model

# Item_Outlet_Sales is a continuous target, so a regressor is required; a
# classifier rejects continuous labels. The model is fitted and evaluated on
# the train/validation split created for the linear model.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_val)

# Accuracy is not defined for regression; report R^2 on the validation set
rf_r2 = r2_score(y_val, y_pred)
print("Validation R^2:", rf_r2)

In [None]:
# Regression diagnostics for the random forest validation predictions.
# Confusion matrices and classification reports only apply to discrete class
# labels, so they are replaced by error metrics suited to a continuous target.
# `y_pred` holds the random forest predictions and `y_val` the matching
# held-out targets from the train/validation split.

# Calculate the mean squared error
mse = mean_squared_error(y_val, y_pred)
print("Mean squared error:", mse)

# Root mean squared error, in the same units as Item_Outlet_Sales
rmse = np.sqrt(mse)
print("Root mean squared error:", rmse)

# Coefficient of determination
r2 = r2_score(y_val, y_pred)
print("R^2 score:", r2)