In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the abalone dataset with a single read.
# The file is treated as header-less: column names are supplied explicitly and
# `header=None` makes that explicit. Reading it without `names` (as the old
# `df` read did) makes pandas consume the first record as the header row,
# which drops one sample and yields meaningless column labels.
DATA_PATH = './dataset/abalone.data'
column_names = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
data = pd.read_csv(DATA_PATH, header=None, names=column_names)

# Keep `df` bound so any cell that still refers to it resolves. It is an
# independent copy, so later in-place edits to `data` do not alter it.
df = data.copy()


In [None]:
# Exploratory data analysis: a first look at the loaded frame.
# Summary statistics as computed by DataFrame.describe()
summary_stats = data.describe()
print(summary_stats)
# (disabled) missing-value check: df.isnull().any()
# Preview of the leading rows
preview_rows = data.head()
print(preview_rows)

In [None]:
# Encoder used to turn the string labels of "Sex" into integer codes
label_encoder = LabelEncoder()

# Replace the "Sex" labels with their integer codes (0 .. n_classes - 1).
# NOTE(review): this imposes an order on what looks like a nominal category,
# and "Sex" is not among the model features selected later - confirm intent.
data['Sex'] = label_encoder.fit_transform(data['Sex'])

# Rescale the codes into [0, 1] by dividing by the largest code
largest_code = data['Sex'].max()
data['Sex'] = data['Sex'].div(largest_code)

# Show the leading rows with the re-encoded column
print(data.head())

In [None]:
# Correlation heatmap of the numeric columns.
correlation_matrix = data.corr(numeric_only=True)

# Explicit figure/axes: gives the annotated grid more room than the default
# figure size and lets the plot carry its own title.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',        # two decimals keep the per-cell annotations compact
    cmap='coolwarm',
    # Pin the colour scale to the full correlation range and centre it on 0.
    # With a data-dependent scale the neutral colour of this diverging map
    # would sit at an arbitrary value instead of at "no correlation".
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
ax.set_title('Correlation matrix of the abalone columns')
plt.show()

In [None]:
# Scatter each measurement column against the ring count, one panel per column
measurement_columns = [
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
]
sns.pairplot(
    data,
    x_vars=measurement_columns,
    y_vars=['Rings'],
    kind='scatter',
)
plt.show()

In [None]:
# Assemble the regression inputs: the seven measurement columns form the
# feature matrix X, and the ring count is the target y.
feature_columns = [
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
]
target_column = 'Rings'
X = data[feature_columns]
y = data[target_column]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Instantiate the linear regression estimator, then fit it on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted ring counts for the held-out test rows
y_pred = model.predict(X=X_test)

In [None]:
# Score the test-set predictions against the true ring counts
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

In [None]:
# Predict the age of a new abalone (replace the measurements with your own).
#
# The model's target is the ring count (`Rings`), not the age. Per the abalone
# dataset documentation, age in years is rings + 1.5, so the prediction is
# converted before being reported as "years".
RINGS_TO_AGE_OFFSET = 1.5

# Measurements keyed by column name, so each value is visibly paired with its
# feature; the key order matches the column order the model was fitted on.
sample_measurements = {
    'Length': 0.5,
    'Diameter': 0.4,
    'Height': 0.1,
    'Whole weight': 0.6,
    'Shucked weight': 0.2,
    'Viscera weight': 0.1,
    'Shell weight': 0.2,
}
new_sample = pd.DataFrame([sample_measurements])

predicted_rings = model.predict(new_sample)
# Still an array, so `predicted_age[0]` keeps working
predicted_age = predicted_rings + RINGS_TO_AGE_OFFSET
print(f'Predicted Age for the new sample: {predicted_age[0]} years')