In [None]:
import matplotlib.pyplot as plt     #handles gui aspect to show plotting
import numpy as np
import pandas as pd     #handles datasets
import seaborn as sns   #data visualization library built on top of Matplotlib
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split # sklearn imports that manage regresion components
from sklearn.preprocessing import StandardScaler
# Load the raw diamonds dataset from the current working directory.
df = pd.read_csv('diamonds.csv')

# QUESTION 1: clean the dataset (duplicates, unused columns, missing
# values, price outliers) and report its shape before and after.
print("QUESTION 1")
print("\n")
print("table before changes",df.shape)  # shape before any cleaning
print("check table for duplicates")
print("sum of duplicates",df.duplicated().sum())
print("\n")

# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# Remove columns x, y and z, which this analysis treats as irrelevant.
df = df.drop(columns=['x', 'y', 'z'])

print("check for any missing data in the dataset")
missing_by_column = df.isna().any()
print(missing_by_column)
# Only claim completeness when no column actually contains a missing value.
if missing_by_column.any():
    print("dataset contains missing values")
else:
    print("dataset returns null so dataset is complete")

# Detect price outliers with the inter-quartile-range (box plot) rule:
# anything further than 1.5 * IQR outside the quartiles is an outlier.
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
outliers = df[(df['price'] < lower) | (df['price'] > upper)]

print("\nTable for outliers")
print(outliers.head())

# Drop every detected outlier by its index label. A hard-coded list of row
# labels would remove only a handful of the detected rows and would raise
# KeyError whenever one of those labels is not present in the frame.
df = df.drop(index=outliers.index)

print("\ncheck the table after")
print(df.shape)  # shape after cleaning


# QUESTION 2: three histogram insights drawn from the cleaned data.
print("QUESTION 2")
# Every histogram uses bins=30 (values grouped into 30 intervals) and
# kde=True (a smooth density curve overlaid to make skew easier to see).

# One entry per insight: (histplot arguments, plot title, x-axis label).
histogram_specs = [
    # Insight 1: distribution of price
    ({'data': df['price']}, 'Distribution of Diamond Prices', 'Price'),
    # Insight 2: distribution of carat
    ({'data': df['carat']}, 'Distribution of Diamond Carat Sizes', 'Carat'),
    # Insight 3: distribution of price, split by cut quality
    ({'data': df, 'x': 'price', 'hue': 'cut'}, 'Price Distribution by Cut Quality', 'Price'),
]

for hist_kwargs, plot_title, x_label in histogram_specs:
    sns.histplot(bins=30, kde=True, **hist_kwargs)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel('Count')
    plt.show()

# Draw the random subset used for modelling. A fixed random_state makes the
# sample (and every metric computed from it) reproducible between runs, and
# capping n at len(df) keeps sampling from raising ValueError when fewer
# than 12500 rows remain after cleaning.
diamonds_model = df.sample(n=min(12500, len(df)), random_state=42)
print("random selection samplesize",diamonds_model.shape)


# QUESTION 3: Linear Regression Model
# Fit a linear regression that predicts price from all remaining columns
# of the modelling sample and report MSE, RMSE and R-squared on a hold-out
# test set.
print("\n")
print("QUESTION 3")
print("\n")

# Features are every column except the target; the target is price.
X = diamonds_model.drop('price', axis=1)
y = diamonds_model['price']

# One-hot encode the categorical columns so the model receives numbers.
# drop_first=True omits the first category of each variable, which is
# redundant given the others.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for testing. The fixed random_state keeps the
# split, and therefore the reported metrics, identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create the model and train it on the training rows only.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = model.predict(X_test)

# Measure accuracy on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Linear Regression Results:")
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)
print("\n")


# QUESTION 4
# Reduce the continuous features to two principal components, fit a linear
# regression on those components and report R-squared on a hold-out set.
print("QUESTION 4")
print("\n")

# Continuous variables used as PCA input; price is the target.
continuousV = ['carat', 'depth', 'table']
X = diamonds_model[continuousV]
y = diamonds_model['price']

# Split BEFORE fitting any transform so that the scaler and PCA never see
# the test rows (fitting them on the full sample leaks test information).
# The fixed random_state keeps the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# PCA picks directions of largest variance, so features must be put on a
# common scale first; otherwise the feature with the largest raw variance
# dominates the components regardless of how informative it is.
scaler = StandardScaler()
pca = PCA(n_components=2)

# Fit scaler and PCA on the training rows, then apply them to the test rows.
X_train = pca.fit_transform(scaler.fit_transform(X_train_raw))
X_test = pca.transform(scaler.transform(X_test_raw))

# Projection of the whole sample, kept under its original name for any
# later code that reads X_pca.
X_pca = pca.transform(scaler.transform(X))

# Train the regression on the two principal components.
pca_model = LinearRegression()
pca_model.fit(X_train, y_train)

y_pred = pca_model.predict(X_test)
r2_pca = r2_score(y_test, y_pred)

print("PCA + Linear Regression Results:")
print("R-squared:", r2_pca)
print("Explained Variance Ratio:", pca.explained_variance_ratio_)
# pca.explained_variance_ratio_ shows the share of the total variance of
# the (standardized) training features captured by each component.
