In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Read the diamonds data and discard the 'Unnamed: 0' column, which is
# not used as a feature.
file_path = 'diamonds.csv'
df = pd.read_csv(file_path).drop(columns=['Unnamed: 0'])

# 'price' is the regression target; every remaining column is a feature.
y = df['price']
X = df.drop(columns='price')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Encode the categorical feature columns ('cut', 'color', 'clarity') as
# integer codes so the linear models below can consume them.
#
# The encoders are fitted on the union of the training and testing rows so
# that every category present in either split has a code, and transform()
# cannot fail on a label it has not seen.
combined_data = pd.concat([X_train, X_test])

# One encoder per categorical column; the individual names are kept so the
# fitted encoders remain available (e.g. for inverse_transform) later on.
label_encoder_cut = LabelEncoder()
label_encoder_color = LabelEncoder()
label_encoder_clarity = LabelEncoder()

label_encoders = {
    'cut': label_encoder_cut,
    'color': label_encoder_color,
    'clarity': label_encoder_clarity,
}

# Fit each encoder on the combined data for its column.
for column, encoder in label_encoders.items():
    encoder.fit(combined_data[column])

# Build new frames with the encoded columns instead of assigning into the
# frames returned by train_test_split: those are derived from X, and
# in-place column assignment on them can raise pandas'
# SettingWithCopyWarning (depending on the pandas version). assign()
# returns a fresh DataFrame and keeps the original column order.
X_train = X_train.assign(**{
    column: encoder.transform(X_train[column])
    for column, encoder in label_encoders.items()
})
X_test = X_test.assign(**{
    column: encoder.transform(X_test[column])
    for column, encoder in label_encoders.items()
})

# Fit an ordinary least-squares regressor on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
linear_model = LinearRegression().fit(X_train, y_train)

# Predict prices for the held-out rows
y_pred_linear = linear_model.predict(X_test)

# Evaluate the predictions: R^2 score and mean squared error
r2_linear = r2_score(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)

# Fit a Ridge regressor with its default settings on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
ridge_model = Ridge().fit(X_train, y_train)

# Predict prices for the held-out rows
y_pred_ridge = ridge_model.predict(X_test)

# Evaluate the predictions: R^2 score and mean squared error
r2_ridge = r2_score(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

# Final expression of the cell: display both models' metrics together
mse_linear, r2_linear, mse_ridge, r2_ridge


(1772260.3205072281, 0.8863641648837631, 1772401.514974226, 0.8863551116137749)