## Importing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression,ElasticNet,Lasso,BayesianRidge,Ridge,HuberRegressor,ARDRegression,TheilSenRegressor,GammaRegressor,QuantileRegressor,RANSACRegressor,PoissonRegressor,PassiveAggressiveRegressor,SGDRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor,HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
from sklearn.svm import SVR,LinearSVR,NuSVR,LinearSVR
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.neighbors import KNeighborsRegressor,RadiusNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error,root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler,PowerTransformer
from sklearn.model_selection import  GridSearchCV,train_test_split

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

import optuna

## Data

In [2]:
# Load the training set and drop the "Id" column.
# The path uses a forward slash so it resolves on Windows, Linux and macOS alike
# (a backslash is only a path separator on Windows).
df = pd.read_csv("diamond-price-prediciton-2024/train.csv").drop("Id", axis=1)

# Run-once guard for the outlier cell: that cell only filters while this value
# is 1 and increments it afterwards. Re-run this cell to re-arm it.
TRY_FIX_OUT = 1

# Column groups shared by the preprocessing cells below.
categorical_features = ['cut', 'color', 'clarity']
numeric_features = ['carat', 'depth', 'table', 'x', 'y', 'z']

# Keep only rows whose z and y values are both below 20.
# .copy() detaches the result from the loaded frame, so later column
# assignments (e.g. the categorical encoding) do not write into a slice and
# do not raise SettingWithCopyWarning.
df = df[(df["z"] < 20) & (df["y"] < 20)].copy()

## Dealing With Duplicate Data

In [None]:
# Each entry is a list of columns to IGNORE when looking for duplicated rows.
# A row is removed when it duplicates an earlier row under ANY enabled entry.
# Uncomment the experiment(s) to run; with every entry disabled no row is
# removed. (The previous form, `df[~( ... )]`, evaluated `~()` on an empty
# tuple and raised TypeError when all options were commented out.)
DUPLICATE_IGNORE_COLUMNS = [
    # [],                  # Remove all duplicates (rows identical in every column)
    # ["price"],           # Remove price duplicates (identical apart from price)
    # ["depth"],           # Remove depth duplicates (identical apart from depth)
    # ["price", "depth"],  # Remove price & depth duplicates
]

# Start from "nothing is a duplicate" and OR in every enabled rule.
duplicate_mask = pd.Series(False, index=df.index)
for ignored_columns in DUPLICATE_IGNORE_COLUMNS:
    duplicate_mask |= df.drop(columns=ignored_columns).duplicated()

df = df[~duplicate_mask].reset_index(drop=True)
df

## Dealing With Z

In [None]:
def func_z(d,x,y):
    """Return z computed from d, x and y as round(d * (x + y) / 200, 2).

    Works on scalars as well as on NumPy arrays / pandas Series, because it
    uses only element-wise arithmetic and np.round.
    """
    return np.round((d * ((y+x))/200),2)


# Test 1: Remove every row with z == 0 (and keep z < 20, y < 20)
if 1:
    keep_rows = (df["z"] != 0) & (df["z"] < 20) & (df["y"] < 20)
    df = df[keep_rows].reset_index(drop=True)
# Test 2: Remove all rows with x == 0 and recompute z from depth, x and y
else:
    # reset_index returns a new frame, so the column assignment below does not
    # write into a filtered slice (no SettingWithCopyWarning).
    df = df[df["x"] != 0].reset_index(drop=True)
    # NOTE(review): this overwrites z for EVERY remaining row, not only the
    # rows where z == 0 - confirm that this is the intended experiment.
    # The arithmetic is already vectorized, so np.vectorize is not needed
    # (it also fails on empty input).
    df["z"] = func_z(df["depth"], df["x"], df["y"])


## Fixing Outliers

In [None]:
from sklearn.neighbors import LocalOutlierFactor

MATCH = 2
if TRY_FIX_OUT == 1:
# Test 1 Using Z score
    match MATCH:
        case 1:
            df_z_score =pd.DataFrame(StandardScaler().fit_transform(df[numeric_features]))
            df=df.drop(df_z_score[(np.abs(df_z_score)>3).any(axis=1)].index).reset_index(drop=True)
    # Test 2 Using IQR
        case 2:
            Q1 = df[numeric_features].quantile(0.25)
            Q3 = df[numeric_features].quantile(0.75)
            IQR = Q3 - Q1
            df = df[~((df[numeric_features] < (Q1 - 1.5 * IQR)) |(df[numeric_features] > (Q3 + 1.5 * IQR))).any(axis=1)]
    # Test 3 Using LocalOutlierFactor
        case 3:
            ones = LocalOutlierFactor().fit_predict(df[numeric_features])
            df = df[ones==1]
TRY_FIX_OUT+=1
df

In [None]:
# One boxplot per column of df, laid out in a single row.
variables = df.columns
x_labels = df.columns
n_panels = len(variables)

# The number of subplots follows the number of columns instead of a hard-coded
# 10, so the cell keeps working when columns are added or removed.
# squeeze=False always returns a 2-D array of axes (even for a single column);
# ravel() flattens it to the 1-D sequence used below.
fig, ax = plt.subplots(1, n_panels, figsize=(22 * n_panels / 10, 6), squeeze=False)
ax = ax.ravel()

# Add padding between the subplots
plt.subplots_adjust(wspace=0.5)

# Palette; reused cyclically when there are more columns than colours.
colors = ['brown', 'g', 'y', 'b', 'r', 'purple', 'orange', 'pink', 'cyan', 'magenta']

# Draw the boxplot of each column in its own subplot
for i, variable in enumerate(variables):
    sns.boxplot(data=df[variable], ax=ax[i], color=colors[i % len(colors)])
    ax[i].set_xlabel(x_labels[i])

    # Remove x-tick labels
    ax[i].set_xticklabels([])

# Histogram of every numeric column (trailing ';' suppresses the repr output)
df.hist(figsize=(23, 8),bins=50);

## Dealing With Category

In [3]:
# Category levels in the order used for the ordinal codes: the first entry of
# each list is encoded as 0.0, the next as 1.0, and so on.
cut_cate = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_cate = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
clarity_cate = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

# Test 1: Ordinal encoding (winner)
if 1:
    oe = OrdinalEncoder(categories=[cut_cate,color_cate,clarity_cate])
    # Work on an independent copy so the assignment below never writes into a
    # slice of an earlier frame (avoids SettingWithCopyWarning).
    df = df.copy()
    df[categorical_features] = oe.fit_transform(df[categorical_features])
# Test 2: One-hot encoding (loser)
# NOTE(review): this branch does not define `oe`, which the evaluation cell
# uses to encode the test set - that cell must be adapted before using it.
else:
    ohe = OneHotEncoder()
    # A single fit_transform is enough; a separate fit() beforehand is redundant.
    one_hot_values = ohe.fit_transform(df[categorical_features]).toarray()
    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a frame
    # with a fresh 0..n-1 index would misalign rows (and introduce NaN) whenever
    # df has gaps in its index after filtering.
    cat_tranformed = pd.DataFrame(
        one_hot_values,
        columns=ohe.get_feature_names_out(),
        index=df.index,
    )
    df = pd.concat([cat_tranformed,df],axis=1).drop(categorical_features,axis=1)
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.06,4.0,1.0,1.0,61.8,57.0,4270,6.57,6.60,4.07
1,1.51,3.0,3.0,5.0,60.9,58.0,15164,7.38,7.42,4.51
2,0.32,4.0,4.0,3.0,61.3,56.0,828,4.43,4.41,2.71
3,0.53,4.0,3.0,3.0,61.2,56.0,1577,5.19,5.22,3.19
4,0.70,3.0,2.0,5.0,61.0,57.0,2596,5.76,5.72,3.50
...,...,...,...,...,...,...,...,...,...,...
43147,0.52,4.0,5.0,3.0,61.5,56.0,1760,5.16,5.18,3.18
43148,0.72,2.0,6.0,3.0,62.1,59.0,3016,5.70,5.73,3.55
43149,0.44,3.0,1.0,6.0,61.5,58.0,990,4.95,4.87,3.02
43150,0.31,3.0,5.0,4.0,60.2,58.0,734,4.38,4.43,2.65


## StandardScaler vs PowerTransformer

In [None]:
# Rescale the numeric feature columns in place; flip the literal below to
# switch between the two experiments.
# Test 1: StandardScaler
if 0:
    ss = StandardScaler()
    standardized_values = ss.fit_transform(df[numeric_features])
    df[numeric_features] = standardized_values
# Test 2: PowerTransformer
else:
    pt = PowerTransformer()
    power_transformed_values = pt.fit_transform(df[numeric_features])
    df[numeric_features] = power_transformed_values
df.head()

## Evaluate

In [4]:
from pathlib import Path

from sklearn.feature_selection import chi2

# Load the test set (forward-slash path: portable across operating systems) and
# encode its categorical columns with the encoder fitted on the training data.
test = pd.read_csv("diamond-price-prediciton-2024/test.csv")
test[categorical_features] = oe.transform(test[categorical_features])
# Feature columns of the test set, in the order they are fed to the model.
test_col = np.array(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y','z'])

X,y= df.drop(["price"],axis=1),df["price"]

# Expand the features with all polynomial terms up to degree 5 (no bias column).
pf = PolynomialFeatures(degree=5,include_bias=False)
data = pd.DataFrame(pf.fit_transform(X))
data_col = data.columns

# Score every polynomial feature against the target with the chi-squared test.
# NOTE(review): chi2 requires non-negative feature values and is documented for
# class labels; here it receives the price column - confirm this is intended.
st,p = chi2(data,y)

# Keep only the features whose p-value is exactly 0.
best = data_col[np.flatnonzero(p == 0)]
to_train = data[best]
to_predict = pd.DataFrame(pf.transform(test[test_col]))[best]

cbr=CatBoostRegressor(verbose=0)

cbr.fit(to_train,y)

predicted_cbr = cbr.predict(to_predict)

# Write the submission file; create the output folder first so to_csv does not
# fail on a fresh checkout where the folder does not exist yet.
output_dir = Path("my_prediction")
output_dir.mkdir(parents=True, exist_ok=True)
pd.DataFrame({"ID":test["Id"],"price":predicted_cbr}).to_csv(output_dir / "cbr.csv",index=False)

## Last Results

1. (MUST) Dealing With Category:
* Test 1: 
    * LGBM:  538.5322642945632
    * CBR:  521.9648672683152
    * ETR:  540.5759253000721
* Test 2:
    * LGBM: 551.491093357798
    * CBR:  538.8184560113942
    * ETR:  547.6814615685914

2. Dealing With Duplicate Data:
* Test 1.1.1:
    * LGBM: 537.9718146128236
    * CBR:  522.6869146661223
    * ETR:  543.2365214857966
* Test 1.1.2:
    * LGBM: 549.8999796567604
    * CBR:  537.6181753879687
    * ETR:  551.6193769467156
* Test 2.1.1:
    * LGBM: 539.0098268914629
    * CBR:  525.1542850769277
    * ETR:  545.6446806710444
* Test 3.1.1:
    * LGBM: 538.4863642745916
    * CBR:  524.9319455365396
    * ETR:  544.5617803404047
* Test 4.1.1:(Best)
    * LGBM: 538.4451116365063
    * CBR:  525.3189534427063
    * ETR:  544.8084409194425
 

3. Dealing With Z:
* Test 1.1.1: (Best)
    * LGBM: 535.1451336323013
    * CBR:  521.9355602866478
    * ETR:  537.6941427142666
* Test 2:
    * LGBM: 535.7702595472263
    * CBR:  522.2481363984887
    * ETR:  538.978381477964

4. Fixing Outliers:
* Test 1: 
    * LGBM: 503.38206966556817
    * CBR:  489.8893540568787
    * ETR:  509.47814894522855
* Test 2: (Best)
    * LGBM: 427.87447210479496
    * CBR:  416.86451968676846
    * ETR:  435.7719129067838
* Test 3:
    * LGBM: 
    * CBR:  
    * ETR:  

5. StandardScaler vs PowerTransformer:
* Test 1:
    * LGBM: 
    * CBR:  
    * ETR:  
* Test 2:
    * LGBM: 
    * CBR:  
    * ETR:  