In [2]:
# Darragh Tate

# The diamond data set comprises of 53940 records, with each record containing the selling price, cut, colour, 
# clarity, depth percentage (the depth / width), table percentage (the flat polished top), height, width and depth.

# Data Source: "https://vincentarelbundock.github.io/Rdatasets/doc/ggplot2/diamonds.csv"
# Data Documentation: "https://vincentarelbundock.github.io/Rdatasets/doc/ggplot2/diamonds.html"


# Data Collection and Cleaning

! pip install pandas
! pip install sklearn
! pip install numpy

import pandas as pd

# Used to set random states, for consistency
random_state = 23


df = pd.read_csv('diamonds.csv')

df = df[["carat","cut","color","clarity","depth","table","price","x","y","z"]]
# .astype is pandas categorisation function.
df[['cut', 'color', 'clarity']] = df[['cut', 'color', 'clarity']].astype('category')
# .cat.codes is pandas function to take the categorical information and replace the string variables with the numerics
df['cut'] = df['cut'].cat.codes
df['color'] = df['color'].cat.codes
df['clarity'] = df['clarity'].cat.codes

# x is the dataframe conaining the independant variables
X_values = df[["carat","cut","color","clarity","depth","table","x","y","z"]]
# y is the dataframe containing the dependant variables
y_values = df['price']

# Legacy code - Used to split the price bands into categories.
categories = pd.qcut(y_values, 10)
y_categorical = pd.DataFrame(y_values, columns={'price'})
y_categorical.loc[(y_categorical['price']<500), "categories"] = "< 500"
y_categorical.loc[(y_categorical['price']>=500) & (y_categorical['price']<1000), 'categories'] = "500 - 1000"
y_categorical.loc[(y_categorical['price']>=1000) & (y_categorical['price']<1500), 'categories'] = "1000 - 1500"
y_categorical.loc[(y_categorical['price']>=1500) & (y_categorical['price']<2000), 'categories'] = "1500 - 2000"
y_categorical.loc[(y_categorical['price']>=2000) & (y_categorical['price']<2500), 'categories'] = "2000 - 2500"
y_categorical.loc[(y_categorical['price']>=2500) & (y_categorical['price']<3000), 'categories'] = "2500 - 3000"
y_categorical.loc[(y_categorical['price']>=3000) & (y_categorical['price']<3500), 'categories'] = "3000 - 3500"
y_categorical.loc[(y_categorical['price']>=3500), "categories"] = "3500+"



In [3]:
# Linear Regression / Polynomial Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Linear regression conceptually plots the data on a graph and tries to draw a line between the different data points.
# By plotting additional data points on to the trained line, the algorithm can determine an an approximate value.
# In simpler terms, if this graph had the dependant variable (say "height") on the y-axis and the independent variable (say "age")
# on the x-axis, we could see the projected heights of someone of any age by seeing where the line lies along the y-axis in
# relation to the age on the x-axis.
# Polynomial regression allows for this line to bend, and as such is is much more useful.
# It is very easy to overfit this model, however, as drawing a line straight through every point isn't generalised, it's tied to
# the training data. 
# Also, as the algorithm takes every point into consideration, outlying variables can really throw off the line and the projected variables.

linear_model = LinearRegression(copy_X = True, fit_intercept = True, n_jobs = None, normalize = False)

# Fits the model with the training data
linear_model.fit(X_values, y_values)

# Prediction for the y-values by testing the X values with the algorithm.
y_pred= linear_model.predict(X_values)

# The correlation between the actual y-values and the ones predicted by the model
score = r2_score(y_values, y_pred)
print(f"R squared (No polynomials): {score:.5f}\n")

# This fits the model with a 4-th degree polynomial. This resulted in the highest r^2 score, however it may well be overfit.
degree = 4
poly_reg_model = PolynomialFeatures(degree = degree)

# Transforms the X_values into their scaled version as defined by poly_reg_model
X_poly = poly_reg_model.fit_transform(X_values)
polynomial_model = LinearRegression()

# Fits the new X_poly values to the new model
polynomial_model.fit(X_poly, y_values)
y_poly_prediction = polynomial_model.predict(X_poly)
poly_score = r2_score(y_values, y_poly_prediction)

print(f'R squared with polynomial degree of {degree}: {poly_score:.5f}')

R squared (No polynomials): 0.88507

R squared with polynomial degree of 4: 0.95211


In [7]:
# K-Means Clustering
# Score: 0.9451085392000287, with 5 neighbours
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# KNN is a clustering algorithm that uses distance to classify data points.
# Essentially, the labelled data is plotted on a graph, and when the model is given unlabelled data, it is classified as belonging
# to whatever class member is closer (as per Euclidean distance). The K value is used to determine how many points are used
# for comparison. Foe example, if K = 1, then only the nearest labelled data point is used for comparison, but if K = 10, then
# the 10 nearest points are used, and the unlabelled data is classified as whatever the majority of the K closest points are.
# For example, if K = 10, 7 are class A and 3 are class B, then the data would be classified as being of class A.
# It works better for categorical data, as that has more defined separation between classes, unlike continuous data, which
# operates on a sliding scale and as such is harder to separate into clearly defined classes.
# Knn Regressors work by finding the weighted averages of the K nearest neighbours, in this case of price.


X_train, X_test, y_train, y_test = train_test_split(X_values, y_values, test_size = .33, random_state = random_state)

# To save time in testing different numbers of neighbours, this iterates through it comparing differing values,
# The best model is saved and displayed at the end.
best_score = 0
best_num_neighbours = 0
for i in range(1, 21):
    # K is i, the number the loop is iterating with
    knn_model = KNeighborsRegressor(n_neighbors=i)
    # Fit the training data to the model
    knn_model.fit(X_train, y_train.values.ravel())
    model_score = knn_model.score(X_test, y_test)
    print(f'Model score with {i} neighbours: {model_score}')
    if (model_score > best_score):
        print(f'New best model found')
        best_score = model_score
        best_num_neighbours = i

print(f'Best Model found: {best_num_neighbours} neighbours which gives an r^2 value of {best_score}')

Model score with 1 neighbours: 0.9245091644918888
New best model found
Model score with 2 neighbours: 0.9392135331132148
New best model found
Model score with 3 neighbours: 0.9426962202475522
New best model found
Model score with 4 neighbours: 0.9447424162405536
New best model found
Model score with 5 neighbours: 0.9451085392000287
New best model found
Model score with 6 neighbours: 0.9450532346230047
Model score with 7 neighbours: 0.9448318982497693
Model score with 8 neighbours: 0.9445590631890071
Model score with 9 neighbours: 0.9439180670086385
Model score with 10 neighbours: 0.9434160956921591
Model score with 11 neighbours: 0.9426248259983606
Model score with 12 neighbours: 0.9422821707821275
Model score with 13 neighbours: 0.9419353371012229
Model score with 14 neighbours: 0.9416948168679979
Model score with 15 neighbours: 0.941127821554051
Model score with 16 neighbours: 0.940497980151835
Model score with 17 neighbours: 0.9398692445353455
Model score with 18 neighbours: 0.93916

In [None]:
# Neural Network
# Score: 0.943772889411137

# Neural Networks are collections of Perceptrons, which are binary classifiers that can take in multiple sources of data 
# and "activate" (return a 1) if the activation function requirements are satisified by the input data.
# The MLP models in scikit-learn are Multilayer Perceptrions, which facilitate non-binary outputs, allowing for more complex categorisation and regression.
# Each node recieves data, and then is activated or not activated. This value is given a weight and passed on to the next node, and the process repeats.
# After passing through multiple nodes, the final output can be either a classification or a calculated value (regression).
# This uses the MLPRegressor, as we are predicting values, not splitting into classes.

# Source: "Hands-on Machine Learning with Scikit-Learn, Keras & Tensorflow", Aurelien Geron, O'Reilly 2019, ISBN 978-1-492-03264-9, p279-289, Retrieved 10/04/'21

# I believe the biggest issue with the Neural Network was model speed - using Colab with 12GB of RAM, it could take more than an
# hour to get a model, which made testing hard (I had to use smaller sections of the dataset.)
# It can also be very memory intensive - if the hidden layer sizes is too big, it can run out of memory (all 12 gb) and crash.
# Lastly, it is a black box (can't inspect the decision making process), so if there is an error in the logic or dataset it's not easy to detect.

import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X_values, y_values, test_size = .33)

# Scales the data to lie within the range of (x - u) / s, with x = sample, u = mean of x, s =standard deviation of x
# Source: 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html', retrieved 12/04/'21
# Compresses range so that outlying values don't have as drastic an effect on the algorithm
scaling = StandardScaler().fit(X_train)

# Fits the values to the scaling
X_train = scaling.transform(X_train)
X_test = scaling.transform(X_test)

# Number of neurons in the hidden layers. This configuration was mostly decided by trial and error.
hidden_layer_sizes = (10,20,40,80)

# Maximum number of iterations available. Will stop early if results converge.
max_iter = 10000

# Rectified Linear Unit - activation function. Outputs 0 if negative input, or the input if positive.
# Source: 'https://machinelearningmastery.com/rectified-linear-activation-function-for-deep-learning-neural-networks', retrieved 12/04/'21
activation = 'relu'

# Adaptive Moment Estimation. Stochastic Optimisation as proposed by Diederik Kingma & Jimmy Ba.
# Source: "https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning", retrieved 12/04/'21
solver = 'adam'

nn_reg = MLPRegressor(max_iter=max_iter, hidden_layer_sizes = hidden_layer_sizes, activation=activation, solver=solver, random_state=random_state)

nn_reg.fit(X_train, y_train.values.ravel())

y_pred = nn_reg.predict(X_test)
print(r2_score(y_test, y_pred))

0.943772889411137


In [None]:
# Support Vector Machine 
# Score: 0.9420355915848466

# SVMs work by conceptually plotting the data on a scatteplot, then trying to draw lines that separate the data.
# Data is analysed and placed on this conceptual plot
# The idea is that margins are drawn along the plot which clearly separates the data, resulting in defined categorisation.
# The support vector is the instance that lies along the margins of a class, i.e. the instance that is closest in definition to a memeber of another class. It is almost an outlier.
# If data isn't linearly separable, then we must use soft margin classification (which allows outlying instances to be misclassified for the sake of model accuracy)
# over hard margin classification (which only works if a straight line can be drawn between each class)

# Source: "Hands-on Machine Learning with Scikit-Learn, Keras & Tensorflow", Aurelien Geron, O'Reilly 2019, ISBN 978-1-492-03264-9, p153-158, Retrieved 10/04/'21

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.svm import LinearSVC, SVC
from sklearn.svm import SVR

X_train, X_test, y_train, y_test = train_test_split(X_values, y_values, test_size = .33, random_state = random_state)

# Scales the data to lie within the range of (x - u) / s, with x = sample, u = mean of x, s =standard deviation of x
# Source: 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html', retrieved 12/04/'21
# Compresses range so that outlying values don't have as drastic an effect on the algorithm
scaling = StandardScaler().fit(X_train)
X_train = scaling.transform(X_train)
X_test = scaling.transform(X_test)

#Maximum number of possible iterations. Will stop earl if data converges.
max_iter = 500000

# C represents tolerance, or width of the margins generated by the SVM. High values results in narrower margins, which can result in better training score at the risk of overfitting
C = 1000 # 1000 & rbf = 0.9420355915848466
#kernel = 'poly' # 0.8659278159023867
#kernel = 'linear' # 0.8649820116013944

# Radial Basis Function kernel. Honestly, I don't understand the maths behind this, asides it being linked to the squared euclidean distance between 2 data points.
# It gave me the best score when training the model.
kernel = 'rbf'

# Degree is only used if kernel = 'poly', which was used druing development, so this is legacy code. 
# It's presence in the svm_model declaration makes no difference.
degree = 1
svm_model = SVR(max_iter = max_iter, C=C, kernel=kernel, degree = degree)
svm_model.fit(X_train, y_train)
print(svm_model.score(X_test, y_test))




0.9420355915848466


In [None]:
# Ensemble - Random Forest Regressor
# Score: 0.9803378454248586

# A random forest is an enhanced version of a decision tree.
# It uses many decision trees on subsets of the data to classify them.
# It uses averages of these trees to build a model, and as such it is able to deal with outlying data more easily, 
# as it will be averaged out due to regression to the mean.
# The use of averages also minimises the risk of overfitting.

# This provided the highest score that I was able to get for this dataset, and I believe that was due to the strengths
# regarding the resistance to outlyers. As the diamonds dataset is about prices, and prices are set by people, it is
# prone to bias (for example, certain diamond colours could appeal to certain people, resulting in a lack of price consistency)
# Also, prices may change over time, and time isn't part of the dataset.
# These factors may well result in some over or undervalued diamonds, which will be outlyers. The fact that random forests cancel
# out the effects of outlyers by utilising means results in a more accurate and predictable model for less uniform datasets.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X_values, y_values, test_size = .33, random_state = random_state)

# Scales everything to lie between the values of 0 & 1. This reduces the damage caused by outliers on the model
scaling = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
ensemble_model = RandomForestRegressor()
ensemble_model.fit(X_train, y_train)
y_pred = ensemble_model.predict(X_test)
print(ensemble_model.score(X_test, y_test))

0.9803378454248586
