In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('diamonds.csv')

In [3]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [4]:
print(df.isnull().sum())

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64


In [5]:
numerical_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
categorical_features = ['cut', 'color', 'clarity']

In [6]:
X = df[['carat', 'cut', 'color', 'clarity']]
y = df['price'] 

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['carat']),
        ('cat', OneHotEncoder(), ['cut', 'color', 'clarity'])
    ])

In [9]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=50, random_state=42))
])

In [10]:
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 298.19251412494356


In [12]:
numerical_column_count = len(df.select_dtypes(include=['float64', 'int64']).columns) - 1  # Exclude 'price'
categorical_column_count = len(df.select_dtypes(include=['object']).columns)

In [13]:
price_mean = df['price'].mean()
price_median = df['price'].median()
carat_mean = df['carat'].mean()
carat_std = df['carat'].std()
depth_min = df['depth'].min()

In [14]:
cut_cardinality = df['cut'].nunique()
color_cardinality = df['color'].nunique()
clarity_cardinality = df['clarity'].nunique()

In [15]:
print(f"Numerical Columns in X_train: {numerical_column_count}")
print(f"Categorical Columns in X_train: {categorical_column_count}")
print(f"Price Mean: {price_mean}, Price Median: {price_median}")
print(f"Carat Mean: {carat_mean}, Carat Standard Deviation: {carat_std}")
print(f"Minimum Depth: {depth_min}")
print(f"Cut Cardinality: {cut_cardinality}, Color Cardinality: {color_cardinality}, Clarity Cardinality: {clarity_cardinality}")

Numerical Columns in X_train: 6
Categorical Columns in X_train: 3
Price Mean: 3932.799721913237, Price Median: 2401.0
Carat Mean: 0.7979397478680014, Carat Standard Deviation: 0.4740112444054184
Minimum Depth: 43.0
Cut Cardinality: 5, Color Cardinality: 7, Clarity Cardinality: 8


In [16]:
total_data_points = len(df)
train_70_30 = int(total_data_points * 0.7)
test_75_25 = int(total_data_points * 0.25)
train_80_20 = int(total_data_points * 0.8)

In [17]:
print(f"Training Data Points (70-30 split): {train_70_30}")
print(f"Test Data Points (75-25 split): {test_75_25}")
print(f"Training Data Points (80-20 split): {train_80_20}")

Training Data Points (70-30 split): 37758
Test Data Points (75-25 split): 13485
Training Data Points (80-20 split): 43152


In [18]:
mean_carat = X_train['carat'].mean()
std_depth = df['depth'].std()

In [19]:
print(f"Mean of carat column after scaling is close to zero: {mean_carat}")
print(f"Standard deviation of depth column after scaling is close to one: {std_depth}")

Mean of carat column after scaling is close to zero: 0.7997629641400498
Standard deviation of depth column after scaling is close to one: 1.432621318833661
