<a href="https://colab.research.google.com/github/ArifaAsha/ArifaAsha.github.io/blob/master/MachineLearning/weighBridgeProject/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import string
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [39]:
import pandas as pd
file_path = '/content/weighBridgeDB_Dev.dataset.csv'
data = pd.read_csv(file_path)

In [40]:
data.dtypes

Unnamed: 0,0
_id,object
chassisNo,int64
modelName,object
aFrameLength,float64
frontLength,float64
wheelboxSize,float64
rearLength,float64
doorPosition,float64
overallChassisLength,float64
ballWeight,float64


## Find Categorical Columns

In [41]:
for col in ['tyreSize', 'rimSizeAndProfile', 'spareWheel', 'suspensionType', 'suspensionRating', 'toolBoxModel', 'batteryQty']:
    print("Column : {}".format(col))
    print(data[col].value_counts())
    print()

Column : tyreSize
tyreSize
235 / 75 R15    364
265 / 75 R16    235
245 / 75 R16    220
265 / 70 R17     37
275 / 70 R18     34
205 / 70 R15     24
265 / 65 R18      1
Name: count, dtype: int64

Column : rimSizeAndProfile
rimSizeAndProfile
R16 X 6 STUD     455
R15 X 6 STUD     388
R17 X 6 STUD      37
R 18 X 8 STUD     15
R18 X 6 STUD      13
R 18 X 6 STUD      2
R18 X 8 STUD       2
Name: count, dtype: int64

Column : spareWheel
spareWheel
(x1) spare & (x1) bracket                    880
(x2) Spares & (x2) Brackets*                  12
Sway Control System                            5
(x1) Spare & (x1) Bracket*                     3
Sway Control System*                           3
(x1) Spare & (x1) Bracket (Supply loose)*      1
12'' Electric (Drum)*                          1
(x1) Spare & (x2) Brackets*                    1
Name: count, dtype: int64

Column : suspensionType
suspensionType
Independent 2.7T - TuffRide (Coil)               150
1600kg (x2) - Roller Rocker                  

#Drop irrelavan columns

In [42]:
data = data.drop(columns=['_id', 'chassisNo', 'ImageData'])

## Handling Missing Data

In [43]:
for column in data.columns:
    count = len(data.loc[data[column] == ' '])
    print(f"{column}: {count}")


modelName: 0
aFrameLength: 0
frontLength: 0
wheelboxSize: 0
rearLength: 0
doorPosition: 0
overallChassisLength: 0
ballWeight: 0
tareWeight: 0
atm: 0
tyreSize: 0
rimSizeAndProfile: 0
spareWheel: 0
suspensionType: 0
suspensionRating: 0
toolBoxModel: 0
gasBottleQty: 0
batteryQty: 0


In [44]:
data.dtypes

Unnamed: 0,0
modelName,object
aFrameLength,float64
frontLength,float64
wheelboxSize,float64
rearLength,float64
doorPosition,float64
overallChassisLength,float64
ballWeight,float64
tareWeight,float64
atm,float64


# Split data into Dependent and Independant Variable

In [46]:
X = data.drop('ballWeight', axis=1).copy()
X.head()

Unnamed: 0,modelName,aFrameLength,frontLength,wheelboxSize,rearLength,doorPosition,overallChassisLength,tareWeight,atm,tyreSize,rimSizeAndProfile,spareWheel,suspensionType,suspensionRating,toolBoxModel,gasBottleQty,batteryQty
0,ARAMIS,1850.0,2618.0,1740.0,1761.0,1083.0,6119.0,2683.0,3500.0,265_/_75_R16,R16_X_6_STUD,(x1)_spare_&_(x1)_bracket,Independent_3.7T_-_TuffRide_(Coil)*,(x1)_spare_&_(x1)_bracket,CRU0051_+_CRU0011_-_Firewood_Box__+_Toolbox_-_...,2.0,Yes_(x2)*
1,EXTREME,1850.0,2575.0,1740.0,2001.0,1323.0,6316.0,2922.0,3500.0,265_/_70_R17,R17_X_6_STUD,(x1)_spare_&_(x1)_bracket,Independent_3.7T_-_TuffRide_(Coil)*,(x1)_spare_&_(x1)_bracket,CRU0051_+_CRU0011_-_Firewood_Box__+_Toolbox_-_...,2.0,Yes_(x2)*
2,ARAMIS,1450.0,2618.0,1740.0,1761.0,1083.0,6119.0,2275.0,3200.0,235_/_75_R15,R15_X_6_STUD,(x1)_spare_&_(x1)_bracket,Independent_3.7T_-_TuffRide_(Coil)*,(x1)_spare_&_(x1)_bracket,CRU0051_+_CRU0011_-_Firewood_Box__+_Toolbox_-_...,2.0,Yes_(x2)*
3,HURRICANE,1600.0,2075.0,870.0,1455.0,0.0,4400.0,1912.0,3000.0,275_/_70_R18,R18_X_6_STUD,Sway_Control_System,,Sway_Control_System,Front_Doorside_,2.0,Yes_(x2)
4,SERENITY,1850.0,2875.0,1740.0,1951.0,1273.0,6566.0,2731.0,3500.0,265_/_75_R16,R16_X_6_STUD,(x1)_spare_&_(x1)_bracket,Independent_3.7T_(Overlay)_-_TuffRide_(Coil)*,(x1)_spare_&_(x1)_bracket,"CRU011_Toolbox,_2_x_J/C_Holders,_\r\nGenny_sli...",2.0,Yes_(x2)*


In [47]:
y = data['ballWeight'].copy()
y.head()

Unnamed: 0,ballWeight
0,213.0
1,159.0
2,154.0
3,173.0
4,181.0


## Categorical to Numeric (Using Label Encoder)

In [54]:
# Columns to check for filtering
columns_to_check = ['aFrameLength', 'frontLength', 'wheelboxSize', 'rearLength', 'doorPosition',
                    'overallChassisLength']

# Filter out rows where any of the specified columns have a value of 0
X_encoded = X[~(X[columns_to_check] == 0).any(axis=1)].copy()

# Apply the same filtering to y
y_filtered = y.loc[X_encoded.index]

# Fill missing values in categorical columns with a placeholder
categorical_columns = ['modelName', 'tyreSize', 'rimSizeAndProfile', 'spareWheel',
                       'suspensionType', 'suspensionRating', 'toolBoxModel', 'batteryQty']
for col in categorical_columns:
    X_encoded[col] = X_encoded[col].fillna(' ')

# Encode the categorical columns using LabelEncoder
for col in categorical_columns:
    X_encoded[col + '_encod'] = LabelEncoder().fit_transform(X_encoded[col])

# Drop the original columns after encoding
X_encoded = X_encoded.drop(columns=categorical_columns)

# Split the filtered X_encoded and y_filtered into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_filtered, test_size=0.2, random_state=123)


In [62]:
# Instantiate the XGBRegressor: xg_reg
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', seed=123, n_estimators=10)

# Fit the regressor to the training set
xg_reg.fit(X_train, y_train)

# Predict the labels of the test set
preds = xg_reg.predict(X_test)

# compute the rmse: rmse
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 22.587620


In [61]:
min_value = y.min()
max_value = y.max()

target_range = max_value - min_value

print("Target Variable Range:")
print(f"Min Value: {min_value}")
print(f"Max Value: {max_value}")
print(f"Range: {target_range}")


Target Variable Range:
Min Value: 81.0
Max Value: 339.0
Range: 258.0
