# Predicting Diamond Prices
The team has given you a CSV containing 53944 records of diamond pricing information based on carat, cut, color, clarity, depth, and the x, y, z measurements of the diamonds. With your knowledge of machine learning, you will predict the price of a diamond.

# Predicting the price of diamonds

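Logistic regression is used to predict prices of diamonds based on carat, cut, color, clarity, depth, table, and the x, y, z measurements. Because logistic regression is a classifier, each distinct price value is treated as its own class.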

## Process:


1. Prepare the data

2. Split the data into training and testing sets

3. Model and fit the data into a logistic regression

4. Predict the testing labels 

5. Calculate the  metrics



In [1]:
# Imports
from pathlib import Path

import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential


---

## Prepare the data to be used on a neural network model

### Step 1: Read the 'Diamonds_price_data.csv' file into a DataFrame and display the first 5 rows of data


In [2]:
# Load the diamond pricing records from the CSV in the working directory
diamond_data_df = pd.read_csv(filepath_or_buffer="Diamonds_price_data.csv")

# Preview the first five rows to confirm the file parsed as expected
diamond_data_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# Inspect the dtype of every column; columns reported as `object` hold
# string values rather than numbers

diamond_data_df.dtypes

Unnamed: 0      int64
carat         float64
cut            object
color          object
clarity        object
depth         float64
table         float64
price           int64
x             float64
y             float64
z             float64
dtype: object

### Step 2: Drop the “Unnamed: 0” column because it is not relevant to the classification model.

In [4]:
# Drop the 'Unnamed: 0' column from the DataFrame.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
diamond_data_df = diamond_data_df.drop(columns=["Unnamed: 0"], errors="ignore")

# Review the DataFrame
diamond_data_df


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
2432,1.11,Premium,I,I1,61.8,59.0,3183,6.65,6.58,4.09
2433,0.85,Ideal,H,SI1,60.8,57.0,3183,6.13,6.18,3.74
2434,0.76,Ideal,G,VS2,61.5,55.0,3183,5.92,5.88,3.63
2435,0.70,Premium,D,VS1,60.9,60.0,3183,5.75,5.71,3.49


### Use Label Encoder to encode string data into numeric data

In [5]:
# Replace the three string-valued columns with integer codes, overwriting
# each column on the DataFrame
labelencoder = LabelEncoder()
for column_name in ("cut", "color", "clarity"):
    # The single shared encoder is refit on each column in turn, so after the
    # loop it only retains the mapping learned for the last column
    diamond_data_df[column_name] = labelencoder.fit_transform(diamond_data_df[column_name])

# Count how many rows share each distinct price
diamond_data_df["price"].value_counts()

561     91
554     34
558     32
2822    23
2777    21
        ..
2884     1
2864     1
3151     1
2820     1
3184     1
Name: price, Length: 446, dtype: int64

In [6]:
# Display the DataFrame to confirm that cut, color and clarity now hold integer codes
diamond_data_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.20,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
2432,1.11,3,5,0,61.8,59.0,3183,6.65,6.58,4.09
2433,0.85,2,4,2,60.8,57.0,3183,6.13,6.18,3.74
2434,0.76,2,3,5,61.5,55.0,3183,5.92,5.88,3.63
2435,0.70,3,0,4,60.9,60.0,3183,5.75,5.71,3.49


In [7]:
# The value being predicted is the price column
target = diamond_data_df["price"]

# Every remaining column is used as a feature; drop() returns a new frame,
# leaving diamond_data_df untouched
features = diamond_data_df.drop(columns=["price"])


In [8]:
# Display the feature matrix; it holds every column of diamond_data_df except price
features

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,2,1,3,61.5,55.0,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,4.20,4.23,2.63
4,0.31,1,6,3,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
2432,1.11,3,5,0,61.8,59.0,6.65,6.58,4.09
2433,0.85,2,4,2,60.8,57.0,6.13,6.18,3.74
2434,0.76,2,3,5,61.5,55.0,5.92,5.88,3.63
2435,0.70,3,0,4,60.9,60.0,5.75,5.71,3.49


In [9]:
# Check the balance of the target: count how many rows carry each distinct price
target.value_counts()

561     91
554     34
558     32
2822    23
2777    21
        ..
2884     1
2864     1
3151     1
2820     1
3184     1
Name: price, Length: 446, dtype: int64

### Split the data into training and testing


In [10]:
# Partition the features and target into training and testing subsets.
# random_state=1 makes the shuffle reproducible; the split proportion is left
# at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=1,
)

## Create a Logistic Regression Model with the Original Data

###  Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [11]:
# Configure the classifier. max_iter is set far above the library default,
# presumably so the lbfgs solver has room to converge on this data.
# NOTE(review): price is numeric, so this model treats every distinct price
# as a separate class - confirm that classification is really intended here.
logistic_regression_model = LogisticRegression(
    solver="lbfgs",
    max_iter=30000,
    random_state=1,
)

# Train on the training split; the fitted estimator is echoed as the cell output
logistic_regression_model.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=30000, random_state=1)

### Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [12]:
# Predict labels for both splits with the fitted model so that each split
# can be scored separately
testing_predictions = logistic_regression_model.predict(X_test)
training_predictions = logistic_regression_model.predict(X_train)

In [13]:
# Re-create the train/test partition. This repeats the earlier split call
# with the same inputs and random_state=1, so it reproduces the same subsets
# the model was fitted and evaluated on.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=1,
)


### Evaluate the model’s performance

In [14]:
# Build the per-class precision / recall / F1 summary for the training split.
# zero_division=0 reports 0.00 for classes the model never predicted (the same
# value the default setting yields) without emitting an
# UndefinedMetricWarning for each of those classes.
training_report = classification_report(y_train, training_predictions, zero_division=0)

# The report is a pre-formatted multi-line string, so print it to keep the layout
print(training_report)


              precision    recall  f1-score   support

         326       0.00      0.00      0.00         1
         327       0.00      0.00      0.00         1
         336       0.00      0.00      0.00         1
         337       0.00      0.00      0.00         1
         338       0.00      0.00      0.00         1
         339       0.00      0.00      0.00         1
         340       0.00      0.00      0.00         1
         342       0.00      0.00      0.00         1
         344       0.00      0.00      0.00         1
         345       0.50      0.50      0.50         2
         351       0.50      0.33      0.40         3
         352       0.00      0.00      0.00         1
         353       1.00      0.50      0.67         2
         354       0.00      0.00      0.00         1
         355       0.00      0.00      0.00         1
         357       0.00      0.00      0.00         3
         402       1.00      0.12      0.22         8
         403       0.00    