In [1]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
import matplotlib.pyplot as plt

import warnings
# Suppress every warning category for the rest of the session
warnings.simplefilter("ignore")

# Show all columns, but cap printed dataframes at 10 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 10

ModuleNotFoundError: No module named 'plotly'

In [None]:
# Read the per-shape CSV files into dataframes

def _load_shape_frame(path):
    """Read one shape's CSV file and return it without its 'Shape' column."""
    # drop() returns a new frame, so no in-place mutation is needed
    return pd.read_csv(path).drop(columns=['Shape'])

df_cushion = _load_shape_frame('./data/data_cushion.csv')
df_emerald = _load_shape_frame('./data/data_emerald.csv')
df_heart = _load_shape_frame('./data/data_heart.csv')
df_oval = _load_shape_frame('./data/data_oval.csv')
df_radiant = _load_shape_frame('./data/data_radiant.csv')
df_round = _load_shape_frame('./data/data_round.csv')

# Combine every loaded shape; previously only three of the six frames
# were concatenated here.
frames = [df_cushion, df_emerald, df_heart, df_oval, df_radiant, df_round]
df = pd.concat(frames)

df


In [3]:
# Build the combined dataframe from all six shape frames BEFORE converting
# Price. Converting first and then re-assigning `df` from a new concat would
# discard the converted column and leave Price as strings.
frames = [df_cushion, df_emerald, df_heart, df_oval, df_radiant, df_round]
df = pd.concat(frames)

# Convert Price to float from string (strip thousands separators first).
# The builtin float is used because the np.float alias was removed in NumPy 1.24.
df['Price'] = df['Price'].str.replace(',', '').astype(float)

# Helper that extracts the digit characters of a string
def getNumbers(str):
    """Return a list with every ASCII digit character of ``str``, in order.

    Each list element is a single-character string; an input without
    digits yields an empty list.
    """
    import re

    digit_pattern = re.compile(r'[0-9]')
    return digit_pattern.findall(str)

# Reduce Messurements to a digits-only string (all separators removed)
df['Messurements'] = df['Messurements'].apply(lambda x: ''.join(getNumbers(x)))

# Derive new attributes from the Messurements attribute.
# The builtin float replaces np.float, which was removed in NumPy 1.24.
# NOTE(review): the fixed slices assume the first two dimensions contribute
# exactly three digits each (e.g. 5.51) - confirm against the raw data.
df['length'] = df['Messurements'].str[:3].astype(float) / 100
df['width'] = df['Messurements'].str[3:6].astype(float) / 100
df['depth'] = df['Messurements'].str[6:].astype(float) / 100

df

In [4]:
def EDA(df):
    """Print an exploratory summary of ``df`` to standard output.

    The report contains, in order: shape, column labels, ``info()`` output,
    per-column missing-value counts, ``describe(include='all')`` statistics,
    per-column outlier counts (1.5 * IQR rule, numeric columns only),
    per-column memory usage, and the number of duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to summarise. It is not modified.

    Returns
    -------
    None
    """
    rule = '\n------------------------------------------------------------------------------------\n'

    def heading(text):
        # ANSI escape codes switch bold on and off around the heading
        print('\033[1m' + text + '\033[0m')

    heading('Shape of the data :')
    print(df.shape, rule)

    heading('All columns from the dataframe :')
    print(df.columns, rule)

    heading('Datatpes and Missing values:')
    # info() writes its report itself and returns None, so it is called
    # directly; printing its return value would add a stray "None".
    df.info()
    print(rule)

    heading('Missing value count:')
    print(df.isnull().sum(), rule)

    heading('Summary statistics for the data')
    print(df.describe(include='all'), rule)

    heading('Outliers in the data :')
    # Quantiles and ordering comparisons are only defined for numeric data;
    # running them over string columns raises TypeError on recent pandas.
    numeric = df.select_dtypes(include='number')
    if numeric.shape[1] == 0:
        print('No numeric columns', rule)
    else:
        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        outliers = (numeric < (q1 - 1.5 * iqr)) | (numeric > (q3 + 1.5 * iqr))
        print(outliers.sum(), rule)

    heading('Memory used by the data :')
    print(df.memory_usage(), rule)

    heading('Number of duplicate values :')
    print(df.duplicated().sum())
    
# Print the exploratory report for the current dataframe
EDA(df)

In [5]:
# Drop the columns that are not needed for prediction
df = df.drop(columns=['Id', 'Messurements', 'Data Url'])
df.columns


In [6]:
# Columns used as model inputs
features = df.loc[:, [
    'Clarity', 'Weight', 'Colour', 'Polish', 'Cut',
    'Symmetry', 'Fluorescence', 'length', 'width', 'depth',
]]

# Price is the value the model is trained to predict
labels = df.loc[:, 'Price']
features

In [7]:
from sklearn.model_selection import train_test_split

# One-hot encode the features, then hold out 20% of the rows
encoded_features = pd.get_dummies(features)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    encoded_features, labels, test_size=0.2, random_state=0
)

# Split the held-out rows evenly into a validation set and a test set
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=0
)

X_train.head()

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalisation, fitted on the training data only
scaler = MinMaxScaler()
scaler.fit(X_train)

# Keep the column labels under their own name: re-binding `features` (the
# input dataframe of the split cell) to an Index would break a re-run of that cell.
feature_columns = X_train.columns

# Scale the training, validation and test sets into new frames rather than
# assigning into the frames returned by train_test_split.
X_train = pd.DataFrame(
    scaler.transform(X_train[feature_columns]),
    columns=feature_columns,
    index=X_train.index,
)
X_val = pd.DataFrame(
    scaler.transform(X_val[feature_columns]),
    columns=feature_columns,
    index=X_val.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test[feature_columns]),
    columns=feature_columns,
    index=X_test.index,
)

X_train.head()


In [9]:
def correlation_heat_map(data, title, zmin=-1, zmax=1, height=600, width=800):
    """
    Build a plotly heatmap of the lower triangle of a correlation matrix.

    data: dataframe whose pairwise column correlations are plotted.
    title: title shown above the heatmap.
    zmin: lower bound of the color scale.
    zmax: upper bound of the color scale.
    height: height of the figure.
    width: width of the figure.

    Returns the plotly Figure; it is not shown by this function.
    """
    corr = data.corr()

    # Blank out the diagonal and the upper triangle so each pair appears once
    upper = np.triu(np.ones(corr.shape, dtype=bool))
    lower_only = corr.where(~upper)
    axis_labels = lower_only.columns.values

    heatmap = go.Heatmap(
        z=lower_only,
        x=axis_labels,
        y=axis_labels,
        zmin=zmin,
        zmax=zmax,
        xgap=1,  # gap between cells along x
        ygap=1,  # gap between cells along y
        colorscale='Greens',
    )

    figure_layout = go.Layout(
        title_text=title,
        title_x=0.5,
        width=width,
        height=height,
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        yaxis_autorange='reversed',
    )

    return go.Figure(data=[heatmap], layout=figure_layout)

In [10]:
# Put the target next to the training features so it appears in the correlation map
Xy_train = pd.concat(objs=[X_train, y_train], axis='columns')

correlation_heat_map(data=Xy_train, title='Correlation Map', height=1500, width=1500)

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest (same seed as the data splits) so that the fitted model,
# its scores and its feature importances are reproducible between runs.
rfr = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=0)
rfr_model = rfr.fit(X_train, y_train)

In [12]:
plt.figure(figsize=(18, 45))

# Order the features from least to most important
feature_importance = rfr.feature_importances_
indices = np.argsort(feature_importance)

bar_positions = range(len(indices))
bar_labels = X_train.columns[indices].tolist()

plt.yticks(bar_positions, bar_labels)
plt.barh(bar_positions, feature_importance[indices], color='orange', align='center')
plt.show()

In [13]:
# A regressor's score() is the coefficient of determination (R^2), not a
# classification accuracy, so the output is labelled accordingly.
print('R^2 for Train:', rfr.score(X_train, y_train))
print('R^2 for Test:', rfr.score(X_test, y_test))

In [14]:
# Predict the target for the test set
y_pred = rfr_model.predict(X_test)

# Side-by-side table of true and predicted values, indexed like y_test
pred_res = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
pred_res

In [15]:
from sklearn import metrics

# Absolute and squared error on the test set
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', round(mae, 2))
print('Mean Squared Error:', round(mse, 2))

# Per-row absolute percentage error (MAPE components)
errors = np.abs(y_pred - y_test)
mape = (errors / y_test) * 100

# "Accuracy" here is defined as 100 minus the mean absolute percentage error
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')