# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# -----------------------------------------
# Load Dataset
# -----------------------------------------
# Read the raw data from the CSV in the current working directory.
df = pd.read_csv("gemstone.csv")

# Quick look at the first rows, the column dtypes, and summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())

# -----------------------------------------
# Basic EDA
# -----------------------------------------
# Per-column count of missing values.
print("\nMissing Values:\n", df.isnull().sum())

# Correlation heatmap (numeric only).
# The numeric columns are selected explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# if the frame contains string columns.
plt.figure(figsize=(8, 6))
sns.heatmap(
    df.select_dtypes(include="number").corr(),
    annot=True,
    cmap="coolwarm",
)
plt.title("Correlation Heatmap")
plt.show()

# Histogram of price
plt.hist(df["price"], bins=40)
plt.title("Price Distribution")
plt.xlabel("Price")
plt.ylabel("Count")
plt.show()

# -----------------------------------------
# Prepare Data (MANUAL, no pipeline)
# -----------------------------------------

# Target and features
# Target and features: "price" is the regression target, every other column
# is a candidate feature.
X = df.drop(columns=["price"])
y = df["price"]

# Split the feature columns by dtype. Columns that are neither object nor
# int64/float64 end up in neither list and are therefore left out of the
# model input.
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns

# -------------------------------
# One hot encode categorical data
# -------------------------------
# drop="first" removes one indicator column per feature so the encoded
# columns are not perfectly collinear.
# The encoder is left at its default sparse output and densified with
# .toarray(): the old `sparse=False` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, whereas this form works on both
# older and current releases.
encoder = OneHotEncoder(drop="first")
X_cat = encoder.fit_transform(X[cat_cols]).toarray()

# Numeric columns are passed through unchanged as a plain array.
X_num = X[num_cols].values

# Combine numeric + encoded categorical: numeric columns first, then the
# one-hot indicator columns.
X_final = np.hstack([X_num, X_cat])

# -----------------------------------------
# Train-Test Split
# -----------------------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
)

# -----------------------------------------
# Fit Linear Regression
# -----------------------------------------
# fit() returns the estimator itself, so construction and training can be
# chained into a single expression.
model = LinearRegression().fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = model.predict(X_test)

# -----------------------------------------
# Model Evaluation
# -----------------------------------------
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and
# removed in 1.6, while this form works on every release.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("\nRMSE:", rmse)
print("R2 Score:", r2_score(y_test, y_pred))
