<a href="https://colab.research.google.com/github/Dondada101/AI-Coursework--2025/blob/main/lab_practice7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub
import os

# Download latest version
path = kagglehub.dataset_download("shivam2503/diamonds")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shivam2503/diamonds?dataset_version_number=1...


100%|██████████| 733k/733k [00:00<00:00, 66.1MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/shivam2503/diamonds/versions/1





In [20]:
import pandas as pd
import numpy as nm

data_dir="/root/.cache/kagglehub/datasets/shivam2503/diamonds/versions/1"
print(os.listdir(data_dir))
file_path=os.path.join(data_dir,'diamonds.csv')
df=pd.read_csv(file_path)
print(df)

carat_cut=df.groupby('cut')['carat'].describe()
range_df = df.groupby(['cut','color','clarity'])['carat'].agg(['min', 'max'])
price_range = df.groupby(['color','clarity'])['price'].agg(['min', 'max'])
price_range['range'] = price_range['max'] - price_range['min']
#print(carat_cut)
#print(range_df)
#print(price_range)

['diamonds.csv']
       Unnamed: 0  carat        cut color clarity  depth  table  price     x  \
0               1   0.23      Ideal     E     SI2   61.5   55.0    326  3.95   
1               2   0.21    Premium     E     SI1   59.8   61.0    326  3.89   
2               3   0.23       Good     E     VS1   56.9   65.0    327  4.05   
3               4   0.29    Premium     I     VS2   62.4   58.0    334  4.20   
4               5   0.31       Good     J     SI2   63.3   58.0    335  4.34   
...           ...    ...        ...   ...     ...    ...    ...    ...   ...   
53935       53936   0.72      Ideal     D     SI1   60.8   57.0   2757  5.75   
53936       53937   0.72       Good     D     SI1   63.1   55.0   2757  5.69   
53937       53938   0.70  Very Good     D     SI1   62.8   60.0   2757  5.66   
53938       53939   0.86    Premium     H     SI2   61.0   58.0   2757  6.15   
53939       53940   0.75      Ideal     D     SI2   62.2   55.0   2757  5.83   

          y     z  
0 

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Lasso, Ridge

diamonds_model = df.sample(n=12500, random_state=42)

X = diamonds_model.drop('price', axis=1)
y = diamonds_model['price']
categorical_cols = ['cut', 'color', 'clarity']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  # keep numerical columns unchanged
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = nm.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.3f}")


# Create Lasso pipeline
lasso_model = Pipeline(steps=[
    ('preprocessor', preprocessor),  # same preprocessing as before
    ('regressor', Lasso(alpha=0.3, max_iter=10000))  # alpha controls regularization strength
])

# Train the model
lasso_model.fit(X_train, y_train)

# Make predictions
y_pred_lasso = lasso_model.predict(X_test)

# Evaluate
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
rmse_lasso = nm.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f"Lasso Regression -> MAE: {mae_lasso:.2f}, RMSE: {rmse_lasso:.2f}, R²: {r2_lasso:.3f}")

#RIDGE

# Create Ridge pipeline
ridge_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))  # alpha controls regularization strength
])

# Train the model
ridge_model.fit(X_train, y_train)

# Make predictions
y_pred_ridge = ridge_model.predict(X_test)

# Evaluate
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
rmse_ridge = nm.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Ridge Regression -> MAE: {mae_ridge:.2f}, RMSE: {rmse_ridge:.2f}, R²: {r2_ridge:.3f}")


Mean Absolute Error: 742.11
Root Mean Squared Error: 1203.67
R² Score: 0.912
Lasso Regression -> MAE: 739.48, RMSE: 1194.58, R²: 0.913
Ridge Regression -> MAE: 740.52, RMSE: 1187.90, R²: 0.914


In [26]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Select only continuous/numeric columns
numeric_cols = df.select_dtypes(include=nm.number).columns.tolist()

# Compute correlation with price
correlations = df[numeric_cols].corr()['price'].sort_values(ascending=False)
print(correlations)

features_for_pca = ['carat','x', 'y', 'z']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features_for_pca])

# Apply PCA to reduce to 2 components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Convert to DataFrame for easier handling
X_pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X_pca_df, df['price'], test_size=0.3, random_state=42)

# Train Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict
y_pred = lr.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = nm.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.3f}")

price         1.000000
carat         0.921591
x             0.884435
y             0.865421
z             0.861249
table         0.127134
depth        -0.010647
Unnamed: 0   -0.306873
Name: price, dtype: float64
MAE: 1206.73
RMSE: 1695.69
R² Score: 0.816


In [29]:
from sklearn.linear_model import Lasso, Ridge

# Create Lasso pipeline
lasso_model = Pipeline(steps=[
    ('preprocessor', preprocessor),  # same preprocessing as before
    ('regressor', Lasso(alpha=0.1))  # alpha controls regularization strength
])

# Train the model
lasso_model.fit(X_train, y_train)

# Make predictions
y_pred_lasso = lasso_model.predict(X_test)

# Evaluate
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
rmse_lasso = nm.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f"Lasso Regression -> MAE: {mae_lasso:.2f}, RMSE: {rmse_lasso:.2f}, R²: {r2_lasso:.3f}")


ValueError: A given column is not a column of the dataframe