# Importing the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Kaggle boilerplate: this notebook runs in the kaggle/python Docker image
# (https://github.com/kaggle/docker-python), which ships with common analytics libraries.
# Input data lives in the read-only "../input/" directory; the loop below lists every file in it.

import os

# Walk the input directory tree and print the full path of each file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Up to 20GB written to the current directory (/kaggle/working/) is preserved as output
# when a version is created with "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not saved outside the current session.

In [None]:
# Read the cubic zirconia CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/cubic-zirconia/cubic_zirconia.csv")
df.head(n=5)

In [None]:
# Drop the index column that was written into the CSV.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

## Cleaning and Preprocessing the Dataset

In [None]:
# Report the number of missing values per column together with the total row count.
missing_per_column = df.isna().sum()
row_count = len(df)
print(f"{missing_per_column}\n \nlength of the dataset:- {row_count} rows.")

In [None]:
## Filling in missing values
# Impute missing depth values with the median of the column.
depth_median = df["depth"].median()
df["depth"] = df["depth"].fillna(depth_median)

In [None]:
# Column groups reused by the cleaning, plotting and encoding cells.
# Note that the numeric group includes the target column "price".
numerical_cols = [
    "carat",
    "depth",
    "table",
    "x",
    "y",
    "z",
    "price",
]
cate_cols = [
    "cut",
    "color",
    "clarity",
]

In [None]:
## Checking for outliers
# One box plot per numeric column; whiskers extend to 1.5 * IQR.
for col in numerical_cols:
    sns.boxplot(df[col], whis=1.5)
    plt.show()

 There are outliers in all numerical features

In [None]:
def outlier_removal(col):
    """Return the 1.5 * IQR outlier fences for a numeric column.

    Despite its name, this function does not remove anything: it only computes
    the bounds that the caller uses to cap extreme values.

    Parameters
    ----------
    col : array-like of numbers
        Values to analyse. NaN entries are ignored when computing quartiles.

    Returns
    -------
    tuple of float
        ``(lower_range, upper_range)`` where ``lower_range = Q1 - 1.5 * IQR``
        and ``upper_range = Q3 + 1.5 * IQR``.
    """
    # Percentiles do not need pre-sorted input, so the discarded sorted(col) call
    # was dropped. nanpercentile keeps one NaN from turning both fences into NaN,
    # which would silently disable the capping.
    Q1, Q3 = np.nanpercentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range

In [None]:
# Cap (winsorize) every numeric column at its 1.5 * IQR fences instead of dropping rows.
# numerical_cols includes the target "price", so the target is capped as well.
for col in numerical_cols:
    lower_fence, upper_fence = outlier_removal(df[col])
    capped_high = np.where(df[col] > upper_fence, upper_fence, df[col])
    df[col] = np.where(capped_high < lower_fence, lower_fence, capped_high)

In [None]:
# Redraw the box plots to check the numeric columns after capping.
for col in numerical_cols:
    sns.boxplot(df[col], whis=1.5)
    plt.show()

In [None]:
# Preview the first rows after the outlier capping step.
df.head()

## EDA

### Univariate Analysis

In [None]:
# Draw all numeric distributions on one shared 4x2 grid. Creating the figure inside
# the loop produced a separate, mostly empty figure per column.
fig, axes = plt.subplots(4, 2, figsize=(12, 10))
axes = axes.ravel()
for i, col in enumerate(numerical_cols):
    # histplot replaces the deprecated distplot; stat="density" with kde=True
    # keeps the density-scaled histogram plus KDE curve.
    sns.histplot(df[col], kde=True, stat="density", color="midnightblue", ax=axes[i])
    axes[i].set_title(col)
# The grid has more slots than there are numeric columns; hide the unused axes.
for unused_ax in axes[len(numerical_cols):]:
    unused_ax.set_visible(False)
fig.tight_layout()
plt.show()

In [None]:
# Correlate only the numeric columns: cut/color/clarity still hold strings at this
# point, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
plt.figure(figsize=(12,6))
sns.heatmap(df[numerical_cols].corr(),cmap = sns.cubehelix_palette(as_cmap=True))
plt.title("Correlation between numeric features")
plt.show()

**Carat and the target variable price are both right-skewed; the other features are more or less normally distributed.**

### Multivariate/bi-variate analysis

In [None]:
# Preview the data before the bivariate plots.
df.head()

In [None]:
# Pairwise relationships between the columns of df.
sns.pairplot(df)

In [None]:
# Correlate only the numeric columns: cut/color/clarity still hold strings at this
# point, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
plt.figure(figsize=(12,6))
sns.heatmap(df[numerical_cols].corr(),cmap=sns.cubehelix_palette(as_cmap=True))
plt.title("Correlation between numeric features")
plt.show()

Price has a very high correlation with carat, length, width and height of the diamond. \
It means Price is highly determined by these 4 features of a Diamond.

In [None]:
# Preview the data before plotting the categorical columns.
df.head()

In [None]:
# Frequency of each cut grade, with bars ordered from most to least common.
plt.figure(figsize=(12, 6))
cut_order = df["cut"].value_counts().index
splot = sns.countplot(data=df, x="cut", order=cut_order)
splot.set_xlabel("CUT", fontsize=15)
splot.set_ylabel("COUNT", fontsize=15)
plt.show()

In [None]:
# Frequency of each color grade, with bars ordered from most to least common.
plt.figure(figsize = (12,6))
splot = sns.countplot(x = "color",data=df,order = df['color'].value_counts().index)
splot.set_xlabel("COLOR",fontsize=15)
splot.set_ylabel("COUNT",fontsize=15)
# Render explicitly, like the other plot cells, so the cell does not end by
# echoing the Text object returned by set_ylabel.
plt.show()

**Colors G, E, F, and H occur most often.**

In [None]:
# Mean price per color, sorted from most to least expensive, with a fresh 0..n-1 index.
df_color_price = (
    df.groupby("color")["price"]
    .mean()
    .reset_index()
    .sort_values("price", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Bar chart of the mean price for each color, in the descending order computed above.
plt.figure(figsize=(12, 6))
sns.barplot(data=df_color_price, x="color", y="price", color="lime")
plt.show()

**Colors J and I have the highest average price, which may explain why they occur less often in the count plot above.**

In [None]:
# Frequency of each clarity grade, with bars ordered from most to least common.
plt.figure(figsize=(12, 6))
clarity_order = df["clarity"].value_counts().index
splot = sns.countplot(data=df, x="clarity", order=clarity_order)
splot.set_xlabel("clarity", fontsize=15)
splot.set_ylabel("COUNT", fontsize=15)

plt.show()

In [None]:
# Preview the data before the categorical columns are encoded.
df.head()

### Feature Selection

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance; it is refit on each categorical column in the following cell.
le = LabelEncoder()

In [None]:
# Replace each categorical column with integer codes, refitting the shared encoder per column.
# NOTE(review): LabelEncoder numbers classes in sorted label order, which may not match
# the quality ordering of cut/color/clarity - confirm this is acceptable for the linear model.
for col in cate_cols:
    encoded_values = le.fit_transform(df[col])
    df[col] = encoded_values

In [None]:
# Check the categorical columns after label encoding.
df.head()

## Model Creation

### Creating a Linear Model

In [None]:
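# Optional experiment: uncomment to drop the features that the closing notes of this
# notebook identify as removable without hurting model performance.
#df = df.drop(["y",'z',"depth","table"],axis=1)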

In [None]:
# Separate the target (price) from the feature matrix (every other column).
y = df["price"]
X = df.drop(columns="price")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a linear regression baseline on the training split.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the linear model on the held-out test split.
y_preds = reg.predict(X_test)
# For regressors, .score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the message labels it as R^2.
score = reg.score(X_test,y_test)
print(f"R^2 score of our Linear model is :- {score}")

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with trees limited to depth 15; the fixed seed keeps results reproducible.
forest = RandomForestRegressor(random_state=42, max_depth=15)
forest.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the random forest on the held-out test split.
# .score() returns R^2 for regressors. The message now names the random forest;
# it previously said "Linear model", copied from the linear-regression cell.
score = forest.score(X_test,y_test)
print(f"R^2 score of our Random Forest model is :- {score}")

**The random forest model reaches an R² score of about 0.987 (98.7%) on the test set.**

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF measures collinearity among the predictors, so the target column is excluded;
# including "price" would inflate every feature's VIF.
# NOTE(review): no intercept column is added to the design matrix - confirm whether a
# constant should be included for the VIF values to be interpreted in the usual way.
vif_features = df.drop(columns="price")
vif_matrix = vif_features.values.astype(float)

# VIF dataframe: one row per feature
vif_data = pd.DataFrame()
vif_data["feature"] = vif_features.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(vif_matrix, i)
                   for i in range(vif_matrix.shape[1])]

print(vif_data)


This data is prone to multicollinearity, and there are features whose absence won't affect the **accuracy** of the model, such as `["y",'z',"table","depth"]`.\
Additionally, I used a Random Forest in order to avoid overfitting.