In [324]:
import pandas as pd 
import numpy as np 
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [325]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Data exploration

In [326]:
df_train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,Y
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,8.23
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,6.09
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,7.65
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,6.6
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,6.9


In [327]:
df_train["X1"].value_counts()

X1
FDP28    8
DRE49    8
NCF42    8
NCQ43    8
FDW24    8
        ..
FDN50    1
FDY43    1
FDM57    1
FDZ50    1
FDX13    1
Name: count, Length: 1553, dtype: int64

In [328]:
df_train["X9"].value_counts()

X9
Medium    1935
Small     1682
High       672
Name: count, dtype: int64

In [329]:
df_train["X3"].value_counts()

X3
Low Fat    3595
Regular    2030
LF          220
reg          81
low fat      74
Name: count, dtype: int64

In [330]:
print(df_train["X2"].isnull().value_counts())
print(df_train["X9"].isnull().value_counts())

X2
False    4994
True     1006
Name: count, dtype: int64
X9
False    4289
True     1711
Name: count, dtype: int64


In [331]:
print("\n" , df_train.isna().sum() / df_train.shape[0] * 100)
print("\n" , df_test.isna().sum() / df_test.shape[0] * 100)



 X1      0.000000
X2     16.766667
X3      0.000000
X4      0.000000
X5      0.000000
X6      0.000000
X7      0.000000
X8      0.000000
X9     28.516667
X10     0.000000
X11     0.000000
Y       0.000000
dtype: float64

 X1      0.000000
X2     18.113357
X3      0.000000
X4      0.000000
X5      0.000000
X6      0.000000
X7      0.000000
X8      0.000000
X9     27.705113
X10     0.000000
X11     0.000000
dtype: float64


## Dropping the unneccesary columns.

drop X9 for lots of nulls and X1 is unuseful

In [332]:
df_train = df_train.drop("X1", axis='columns')
df_train = df_train.drop("X9", axis='columns')

In [333]:
df_test = df_test.drop("X1", axis='columns')
df_test = df_test.drop("X9", axis='columns')

# Data cleaning

### We need to handle columns With missing values

In [334]:
df_train["X3"] = df_train["X3"].replace({"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"})
print("\n" , df_train["X3"].value_counts())

df_test["X3"] = df_test["X3"].replace({"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"})
print("\n" , df_test["X3"].value_counts())


 X3
Low Fat    3889
Regular    2111
Name: count, dtype: int64

 X3
Low Fat    1628
Regular     895
Name: count, dtype: int64


In [335]:
df_train = df_train.drop_duplicates()

In [336]:
df_train.head()

Unnamed: 0,X2,X3,X4,X5,X6,X7,X8,X10,X11,Y
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Tier 1,Supermarket Type1,8.23
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Tier 3,Supermarket Type2,6.09
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Tier 1,Supermarket Type1,7.65
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Tier 3,Grocery Store,6.6
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,Tier 3,Supermarket Type1,6.9


### results after changing the duplicated values "LF ==> Low fat , etc..."

In [337]:
def Showw_Values(data):
    for i in data.columns:
        print("\n \n" , data[i].value_counts())
Showw_Values(df_train)


 
 X2
17.600    65
12.150    64
11.800    53
16.000    50
13.650    50
          ..
5.325      1
6.325      1
6.440      1
7.275      1
6.895      1
Name: count, Length: 410, dtype: int64

 
 X3
Low Fat    3889
Regular    2111
Name: count, dtype: int64

 
 X4
0.000000    360
0.076975      3
0.144338      2
0.076841      2
0.059160      2
           ... 
0.079146      1
0.022075      1
0.013091      1
0.076506      1
0.054920      1
Name: count, Length: 5577, dtype: int64

 
 X5
Fruits and Vegetables    875
Snack Foods              840
Household                643
Frozen Foods             621
Dairy                    487
Canned                   455
Baking Goods             454
Health and Hygiene       342
Soft Drinks              311
Meat                     294
Breads                   182
Hard Drinks              150
Others                   117
Starchy Foods            108
Breakfast                 75
Seafood                   46
Name: count, dtype: int64

 
 X6
196.5768    6
188.1

In [338]:
df_train["X2"].fillna(df_train["X2"].mean() , inplace= True)

df_test["X2"].fillna(df_test["X2"].mean() , inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["X2"].fillna(df_train["X2"].mean() , inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test["X2"].fillna(df_test["X2"].mean() , inplace= True)


In [339]:
print(df_train.isna().sum())

X2     0
X3     0
X4     0
X5     0
X6     0
X7     0
X8     0
X10    0
X11    0
Y      0
dtype: int64


In [340]:
# columns_to_encode = ["X3" , "X5" , "X7"]
# encoder = OneHotEncoder(sparse_output=False, drop=None)  # Use `drop='first'` to avoid multicollinearity

# # Fit and transform the selected columns
# encoded_data = encoder.fit_transform(df_train[columns_to_encode])

# # Convert the encoded data to a DataFrame
# encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(columns_to_encode))

# df_encoded = pd.concat([df_train.drop(columns=columns_to_encode), encoded_df], axis=1)

# # def Encode_col(data , col = Encoding_col):
# #     for i in col:
# #         x = encoder.fit_transform(data[i])
        
# # Encode_col(df_train , Encoding_col)
df_train.head()


Unnamed: 0,X2,X3,X4,X5,X6,X7,X8,X10,X11,Y
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Tier 1,Supermarket Type1,8.23
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Tier 3,Supermarket Type2,6.09
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Tier 1,Supermarket Type1,7.65
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Tier 3,Grocery Store,6.6
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,Tier 3,Supermarket Type1,6.9


In [341]:
# # Encode nominal features (One-Hot Encoding)
# nominal_cols = ['X3', 'X5', 'X11']
# df_train = pd.get_dummies(df_train, columns=nominal_cols)

# # # Encode ordinal features (Ordinal Encoding)
# # ordinal_cols = {'X9': ['Small', 'Medium', 'Large']}
# # for col, order in ordinal_cols.items():
# #     encoder = OrdinalEncoder(categories=[order])
# #     df[col] = encoder.fit_transform(df[[col]])
# df_train.head()

In [342]:
columns_to_encode = ['X3', 'X5', 'X11' , "X7" , "X10"]
encoders = {}
le = LabelEncoder()
for i in columns_to_encode :
    df_train[i] = le.fit_transform(df_train[i])

    df_test[i] = le.transform(df_test[i])

df_train.head()

Unnamed: 0,X2,X3,X4,X5,X6,X7,X8,X10,X11,Y
0,9.3,0,0.016047,4,249.8092,9,1999,0,1,8.23
1,5.92,1,0.019278,14,48.2692,3,2009,2,2,6.09
2,17.5,0,0.01676,10,141.618,9,1999,0,1,7.65
3,19.2,1,0.0,6,182.095,0,1998,2,0,6.6
4,8.93,0,0.0,9,53.8614,1,1987,2,1,6.9


In [343]:
df_test.head()

Unnamed: 0,X2,X3,X4,X5,X6,X7,X8,X10,X11
0,12.618604,0,0.021273,2,229.1326,5,1985,2,3
1,17.35,1,0.027588,3,86.1856,1,1987,2,1
2,9.3,0,0.111782,5,182.3292,0,1998,2,0
3,6.71,1,0.029606,0,65.4142,9,1999,0,1
4,17.1,0,0.129141,6,109.4886,2,2007,1,1


## Data Splitting

In [344]:
x = df_train.drop("Y" , axis = "columns")
y = df_train["Y"]

In [345]:
x_train,x_val,y_train,y_val = train_test_split(x,y , random_state=42 , test_size= 0.2)

In [346]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train,y_train)

In [347]:
print(x_train.dtypes)  # Check feature columns
print(y_train.dtype)   # Check target variable

X2     float64
X3       int32
X4     float64
X5       int32
X6     float64
X7       int32
X8       int64
X10      int32
X11      int32
dtype: object
float64


In [349]:
y_pred = model.predict(df_test)
print(y_pred)

[9.00489568 6.41819241 6.52192273 ... 6.91079312 7.37903348 6.89268005]


In [361]:
sub_example = pd.read_csv("sample_submission.csv")

submission = pd.DataFrame({
    "row_id":sub_example["row_id"],  
    "Y": y_pred  
})


submission.to_csv("submission.csv", index=False)
print("Submission file saved!")

Submission file saved!


In [362]:
submission.head()

Unnamed: 0,row_id,Y
0,0,9.004896
1,1,6.418192
2,2,6.521923
3,3,6.955576
4,4,6.752235
