### Your Goal
The goal of this competition is to predict the age of abalone from various physical measurements.

In [1]:
# Importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Load the labelled training set and the unlabelled competition test set
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Report (rows, columns) for each frame as a quick sanity check
print("train: ", train.shape)
print("test: ", test.shape)

train:  (90615, 10)
test:  (60411, 9)


In [3]:
# Preview the first rows of the training data (head() shows 5 rows by default)
train.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [4]:
# Preview the first rows of the test data; it has the same feature columns but no Rings target
test.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,90615,M,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005
1,90616,M,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275
2,90617,M,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405
3,90618,M,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235
4,90619,I,0.415,0.325,0.11,0.358,0.1575,0.067,0.105


In [5]:
# Summarize the training frame: column names, non-null counts, dtypes and memory usage
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90615 entries, 0 to 90614
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              90615 non-null  int64  
 1   Sex             90615 non-null  object 
 2   Length          90615 non-null  float64
 3   Diameter        90615 non-null  float64
 4   Height          90615 non-null  float64
 5   Whole weight    90615 non-null  float64
 6   Whole weight.1  90615 non-null  float64
 7   Whole weight.2  90615 non-null  float64
 8   Shell weight    90615 non-null  float64
 9   Rings           90615 non-null  int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 6.9+ MB


In [6]:
# Summarize the test frame: column names, non-null counts, dtypes and memory usage
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60411 entries, 0 to 60410
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              60411 non-null  int64  
 1   Sex             60411 non-null  object 
 2   Length          60411 non-null  float64
 3   Diameter        60411 non-null  float64
 4   Height          60411 non-null  float64
 5   Whole weight    60411 non-null  float64
 6   Whole weight.1  60411 non-null  float64
 7   Whole weight.2  60411 non-null  float64
 8   Shell weight    60411 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 4.1+ MB


In [7]:
# Count the distinct values in each training column; a very low count
# (such as Sex) points to a categorical column that needs encoding
train.nunique()

id                90615
Sex                   3
Length              157
Diameter            126
Height               90
Whole weight       3175
Whole weight.1     1799
Whole weight.2      979
Shell weight       1129
Rings                28
dtype: int64

In [8]:
# Encode the categorical Sex column as integers (M -> 0, F -> 1, I -> 2).
#
# The dtype guard makes this cell safe to re-run: calling .map() on a column
# that is already encoded finds no matching keys and would silently replace
# every value with NaN.

sex_codes = {'M': 0, 'F': 1, 'I': 2}

if not pd.api.types.is_numeric_dtype(train['Sex']):
    train['Sex'] = train['Sex'].map(sex_codes)
if not pd.api.types.is_numeric_dtype(test['Sex']):
    test['Sex'] = test['Sex'].map(sex_codes)

In [9]:
# Feature matrix: every training column except the row identifier and the target
X = train.drop(columns=['id', 'Rings']).values

# Target vector: the ring count to be predicted
y = train['Rings'].values

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [10]:
# Train an XGBoost regressor on the training split.
# The estimator is created with no arguments, so the library's default
# hyperparameters are used.

from xgboost.sklearn import XGBRegressor

regressor = XGBRegressor()

# fit() is the last expression of the cell, so the notebook also displays its return value
regressor.fit(X_train, y_train)

In [11]:
# Predict Rings for the held-out validation split.
# Note: X_test here is the 10% slice carved out of train.csv by train_test_split,
# not the competition test file loaded into `test`.
y_pred = regressor.predict(X_test)

In [12]:
# Show the validation predictions (NumPy abbreviates long arrays with "...")
print(y_pred)

[ 6.9006243  6.887447   7.7801805 ... 12.617158   9.926331  13.030232 ]


In [13]:
# Build the feature frame for the competition test set by dropping the non-feature columns.
# 'Rings' is listed as well (errors='ignore' skips labels that are absent) because a later
# cell writes the predictions into test['Rings']; without this, re-running the cell would
# hand that column to the model as an extra feature.
test1 = test.drop(columns=['id', 'Rings'], errors='ignore')

In [14]:
# Convert the whole test1 feature frame (all rows, all columns) into a NumPy array
unseen = test1.values

In [15]:
# Predict Rings for the competition test features using the fitted regressor
unseen_pred = regressor.predict(unseen)

In [16]:
# Attach the predictions to the test frame as a new "Rings" column (this mutates `test` in place)
test['Rings'] = unseen_pred

In [17]:
# Preview the test frame again; it now carries the encoded Sex values and the predicted Rings column
test.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,90615,0,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005,9.495631
1,90616,0,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275,9.698995
2,90617,0,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405,10.412237
3,90618,0,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235,10.004494
4,90619,2,0.415,0.325,0.11,0.358,0.1575,0.067,0.105,7.584878


In [18]:
# Score the validation predictions with root mean squared logarithmic error (RMSLE)

from sklearn.metrics import mean_squared_log_error
import math

# mean_squared_log_error can raise ValueError on negative predictions, and the
# regressor's outputs are not bounded below, so clamp them at zero before scoring.
# Non-negative predictions are left untouched.
y_pred_nonneg = np.clip(y_pred, 0, None)

# NOTE: despite its name, MSE holds the mean squared *logarithmic* error;
# the variable name is kept so any code that reads it keeps working.
MSE = mean_squared_log_error(y_test, y_pred_nonneg)

RMSLE = math.sqrt(MSE)
print("Root Mean Square Logarithmic Error:\n")
print(RMSLE)

Root Mean Square Logarithmic Error:

0.15052210110975292


In [19]:
# Submission export (currently disabled: both statements below are commented out,
# so running this cell writes nothing).

# Build the submission frame from the id column and the predicted Rings column
# submission = test[['id', 'Rings']]

# Write it to submission.csv without the DataFrame index
# submission.to_csv('submission.csv', index=False)