In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_selection import RFE
import warnings
warnings.filterwarnings("ignore")

# Load the competition's training and test tables.
df = pd.read_csv("/kaggle/input/cse-281-24-predict-the-item-price/train.csv")
df_test = pd.read_csv("/kaggle/input/cse-281-24-predict-the-item-price/test.csv")

# Re-express X8 as an offset from 2024 (presumably turning a year into an
# age -- confirm against the data dictionary).
# The transformation must be applied to BOTH frames: X8 is later multiplied
# into interaction columns and passed through a scaler fit on the training
# frame, so leaving the test frame untransformed would put its X8 on a
# different scale from the one the scaler was fit on.
df['X8'] = 2024 - df['X8']
df_test['X8'] = 2024 - df_test['X8']

# Collapse the alternative spellings found in X3 onto two canonical labels.
replacement_mapping_X3 = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
df['X3'] = df['X3'].replace(replacement_mapping_X3)
df_test['X3'] = df_test['X3'].replace(replacement_mapping_X3)

# Fill missing X2 values with the column mean of the same frame.
# The result is assigned back instead of calling fillna(inplace=True) on
# df['X2']: that form mutates an intermediate Series, is deprecated, and
# under pandas Copy-on-Write never reaches the frame at all (the warning
# would be hidden here because all warnings are silenced at import time).
# NOTE(review): the test frame is imputed with its own mean, as before;
# confirm whether the training mean should be used instead.
df['X2'] = df['X2'].fillna(df['X2'].mean())
df_test['X2'] = df_test['X2'].fillna(df_test['X2'].mean())

def _first_mode(values):
    """Return the first modal value of *values*."""
    return values.mode()[0]


# One-column tables (indexed by X11) holding the most frequent X9 value
# observed for each X11 group, built separately for each frame.
mode_X9 = df.pivot_table(index='X11', values='X9', aggfunc=_first_mode)
mode_X9_test = df_test.pivot_table(index='X11', values='X9', aggfunc=_first_mode)

# Fill each missing X9 with the modal X9 of the row's X11 group.
missing_values = df['X9'].isna()
train_groups = df.loc[missing_values, 'X11']
df.loc[missing_values, 'X9'] = train_groups.apply(
    lambda group: mode_X9.loc[group]
)

missing_values_test = df_test['X9'].isna()
test_groups = df_test.loc[missing_values_test, 'X11']
df_test.loc[missing_values_test, 'X9'] = test_groups.apply(
    lambda group: mode_X9_test.loc[group]
)

# Reduce X1 to its two-character prefix, then spell the three known
# prefixes out as readable category names. Both frames get the same
# treatment and are modified in place.
x1_category_names = {'FD': 'Food', 'DR': 'Drink', 'NC': 'Non Consumable'}
for frame in (df, df_test):
    frame['X1'] = frame['X1'].map(lambda code: code[:2])
    frame.replace({'X1': x1_category_names}, inplace=True)


# Integer-encode X11. The encoder is fit on the training frame only and
# then reused on the test frame so both share one label -> code mapping.
le = LabelEncoder()
df['X11'] = le.fit_transform(df['X11'])
df_test['X11'] = le.transform(df_test['X11'])

# One-hot encode the remaining categorical columns.
columns_to_encode = ['X1', 'X3', 'X5', 'X7', 'X9', 'X10']
df = pd.get_dummies(df, columns=columns_to_encode, prefix_sep='_')
df_test = pd.get_dummies(df_test, columns=columns_to_encode, prefix_sep='_')

# Drop the X5 "Others" indicator. DataFrame.drop returns a new frame, so the
# result has to be assigned back -- the bare call previously had no effect.
# The training frame is expected to contain the column (a KeyError is still
# raised if it does not); the test frame is tolerated either way.
df = df.drop(columns='X5_Others')
df_test = df_test.drop(columns='X5_Others', errors='ignore')

# The two frames were encoded independently, so a category level that never
# occurs in the test data would leave the test frame without that indicator
# column and break any later `df_test[features]` selection. Add such columns
# as all-zero indicators.
dummy_prefixes = tuple(f'{name}_' for name in columns_to_encode)
for train_col in df.columns:
    if train_col.startswith(dummy_prefixes) and train_col not in df_test.columns:
        df_test[train_col] = 0

# Start from every column whose correlation with the target Y has a
# magnitude of at least 0.2 (Y itself qualifies and is removed further down).
features = [
    column
    for column in df.columns
    if abs(df[column].corr(df['Y'])) >= 0.2
]

print(len(features))

# Build the pairwise product of every unordered pair of these columns on
# both frames; keep a product as a feature only when its correlation with Y
# has a magnitude of at least 0.5.
columns_to_test = ['X2', 'X4', 'X6', 'X8', 'X11']
for position, left in enumerate(columns_to_test):
    for right in columns_to_test[position + 1:]:
        product_name = f'{left}_{right}'
        df[product_name] = df[left] * df[right]
        df_test[product_name] = df_test[left] * df_test[right]
        if abs(df[product_name].corr(df['Y'])) >= 0.5:
            features.append(product_name)

# Manual adjustments to the selected set. The original also carried two
# disabled toggles: removing 'X7_OUT010' and appending 'X7_OUT046'.
for unwanted in ('Y', 'X9_Small', 'X7_OUT027', 'X7_OUT019'):
    features.remove(unwanted)
features.append('X10_Tier 1')

print(df['X7_OUT019'].corr(df['Y']))
print(len(features))
print(features)

# Min-max scale the numeric columns. The scaler learns its ranges from the
# training frame only and the same ranges are then applied to both frames.
features_to_scale = ['X2', 'X4', 'X6', 'X8', 'X11', 'X6_X11']
scaler = MinMaxScaler()
scaler.fit(df[features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])
df_test[features_to_scale] = scaler.transform(df_test[features_to_scale])

# Target vector and the train / test design matrices restricted to the
# selected features.
y = df['Y']
X = df[features]
x_test = df_test[features]

# Fit an RBF-kernel support vector regressor on the training matrix and
# predict the target for the test matrix.
svr_model = SVR(kernel='rbf', C=200, epsilon=1e-7, gamma='scale').fit(X, y)
Y_pred = svr_model.predict(x_test)

# Write the predictions as (row_id, Y), with row_id numbering the test rows
# from 0 in their existing order.
submission = pd.DataFrame({
    'row_id': range(len(Y_pred)),
    'Y': Y_pred,
})
submission.to_csv('submission.csv', index=False)
print("done")

8
-0.43256845594836846
6
['X6', 'X11', 'X7_OUT010', 'X9_Medium', 'X6_X11', 'X10_Tier 1']
done
