In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import mode


In [None]:
# Location of the raw diamonds dataset, read relative to the notebook's working directory.
DATA_PATH = "diamonds.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# Preview the last rows of the loaded dataset.
df.tail()

In [None]:
# (number of rows, number of columns) of the dataset.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics of the dataset's columns.
df.describe()

In [None]:
# Partition the column names: the measurements listed here are treated as
# numerical, and every other column of df is treated as categorical.
numerical = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
categorical = df.columns[~df.columns.isin(numerical)]

In [None]:
# Display the list of numerical column names.
numerical

In [None]:
# Display the remaining (non-numerical) column labels.
categorical

In [None]:
# Plot the kernel density estimate of every numerical column on a 3x3 grid.
# sns.distplot is deprecated and scheduled for removal from seaborn; kdeplot is
# its replacement for the KDE-only curve that distplot(hist=False) produced.
plt.figure(figsize = (30, 25))

for i, col in enumerate(numerical):
    ax = plt.subplot(3, 3, i+1)
    # pass the series by keyword so it is read as the x variable in every seaborn version
    sns.kdeplot(x = df[col], ax = ax)

In [None]:
# Grid of pairwise plots for the columns of df.
sns.pairplot(df)

In [None]:
# Box plot of every numerical column on a 3x3 grid, to spot outliers.
plt.figure(figsize = (20, 15))

for i , col in enumerate(numerical):
    ax = plt.subplot(3, 3, i+1)
    # The first positional argument of sns.boxplot changed meaning between
    # seaborn releases (x -> data); passing x= keeps the plot horizontal and
    # labelled with the column name regardless of the installed version.
    sns.boxplot(x = df[col], ax = ax)

In [None]:
# Scatter plot of price (y axis) against each numerical column (x axis), one panel per column.
plt.figure(figsize = (20, 15))

for panel, col in enumerate(numerical, start = 1):
    plt.subplot(3, 3, panel)
    sns.scatterplot(data = df, x = col, y = 'price')

In [None]:
# count plot for all categorical columns
plt.figure(figsize = (20, 5))

# one panel per categorical column instead of assuming there are exactly three
n_panels = len(categorical)
for i , col in enumerate(categorical):
    ax = plt.subplot(1, n_panels, i+1)
    # pass the series as x=: newer seaborn releases treat the first positional
    # argument as `data`, not as the variable to count
    sns.countplot(x = df[col], ax = ax)

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(10,7))
# Correlation is only defined for numeric columns; select them explicitly so
# the string-valued columns do not make DataFrame.corr() raise on recent pandas.
corr_matrix = df.select_dtypes(include = 'number').corr()
# The matrix itself is heatmap's `data` argument; passing data= a second time
# raises "got multiple values for argument 'data'".
sns.heatmap(corr_matrix, annot = True, cmap = 'RdBu_r')

In [None]:
# The target is the price column; every remaining column of df is a feature.
y = df['price']
X = df.drop(columns = ['price'])

In [None]:
# Display the feature frame and the target series together as a tuple.
X,y

In [None]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 4,
)

# Report the shape of each split.
for split_label, split_X, split_y in (
    ('Train Set: ', X_train, y_train),
    ('Test Set: ', X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first rows of the test features.
X_test.head()

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label-encode the categorical features.
#
# One encoder is kept per column and is fitted on the training split only; the
# test split is then transformed with that same fitted encoder. Refitting a
# single encoder column after column (and again on the test split) would keep
# only the last mapping and could assign different integer codes to the same
# category in train and test.
#
# NOTE: run this cell once per split; re-running it on already-encoded frames
# would refit the encoders on integer codes instead of the original labels.
CATEGORICAL_FEATURES = ['cut', 'color', 'clarity']

# Work on explicit copies so the column assignments below write to independent
# frames rather than to objects derived from X.
X_train = X_train.copy()
X_test = X_test.copy()

label_encoders = {}
for col in CATEGORICAL_FEATURES:
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col])
    # transform (not fit_transform): reuse the mapping learned from the training
    # data; a category unseen during training raises ValueError here instead of
    # being silently given an inconsistent code
    X_test[col] = encoder.transform(X_test[col])
    label_encoders[col] = encoder

# `le` is the name the later pickling cell writes to models/label_encoder.pkl;
# bind it to the full {column name: fitted encoder} mapping so every encoder is
# persisted, not just one of them.
le = label_encoders

In [None]:
# Check the first two rows of the training features after label encoding.
X_train.head(2)

In [None]:
# Check the first two rows of the test features after label encoding.
X_test.head(2)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# the result is wrapped back into a DataFrame that keeps the original row index
# and column names.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_rescaled = pd.DataFrame(
    data = scaler.transform(X_train),
    index = X_train.index,
    columns = X_train.columns,
)
X_train_rescaled.head()

In [None]:
# Apply the scaler fitted on the training split to the test split (no refitting),
# keeping the test frame's row index and column names.
X_test_scaled_values = scaler.transform(X_test)
X_test_rescaled = pd.DataFrame(
    data = X_test_scaled_values,
    index = X_test.index,
    columns = X_test.columns,
)
X_test_rescaled.head()

### Experiment Tracking

In [None]:
pip install mlflow

In [None]:
import mlflow

In [None]:
# Store run metadata in a local SQLite database and group all runs of this
# notebook under a single experiment.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
MLFLOW_EXPERIMENT_NAME = "Diamond Price Prediction"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

In [None]:
import os
from pickle import dump

# Persist the fitted preprocessing objects so they can be attached to each
# MLflow run as artifacts.
# open(..., 'wb') does not create missing directories, so make sure the target
# folder exists first.
os.makedirs('models', exist_ok = True)

# Context managers guarantee the files are flushed and closed before
# mlflow.log_artifact reads them later.
with open('models/label_encoder.pkl', 'wb') as encoder_file:
    dump(le, encoder_file)
with open('models/standard_scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)


In [None]:
from sklearn import metrics


### Experiment-1 Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression


In [None]:
# Experiment 1: linear regression, tracked as a single MLflow run.
with mlflow.start_run():
    # run metadata
    mlflow.set_tags({"dev": "Abdul Azad", "algo": "Linear Regression"})
    mlflow.log_param("data-path", "diamonds.csv")

    # fit on the scaled training features, predict on the scaled test features
    linear_regressor = LinearRegression().fit(X_train_rescaled, y_train)
    y_test_pred = linear_regressor.predict(X_test_rescaled)

    # evaluation on the held-out split (`acc` holds the R2 score)
    acc = metrics.r2_score(y_test, y_test_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    mlflow.log_metrics({"R2 Score": acc, "RMSE": rmse})

    # store the model together with the preprocessing objects it depends on
    mlflow.sklearn.log_model(linear_regressor, artifact_path = "models")
    for artifact_file in ("models/standard_scaler.pkl", "models/label_encoder.pkl"):
        mlflow.log_artifact(artifact_file)

### Experiment-2 KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Experiment 2: k-nearest-neighbours regression, tracked as a single MLflow run.
with mlflow.start_run():
    # run metadata and hyper-parameters
    mlflow.set_tags({"dev": "Abdul Azad", "algo": "KNN"})
    k = 6
    mlflow.log_params({"data-path": "diamonds.csv", "n_neighbors": k})

    # fit on the scaled training features, predict on the scaled test features
    knn_regressor = KNeighborsRegressor(n_neighbors = k).fit(X_train_rescaled, y_train)
    y_test_pred = knn_regressor.predict(X_test_rescaled)

    # evaluation on the held-out split (`acc` holds the R2 score)
    acc = metrics.r2_score(y_test, y_test_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    mlflow.log_metrics({"R2 Score": acc, "RMSE": rmse})

    # store the model together with the preprocessing objects it depends on
    mlflow.sklearn.log_model(knn_regressor, artifact_path = "models")
    for artifact_file in ("models/standard_scaler.pkl", "models/label_encoder.pkl"):
        mlflow.log_artifact(artifact_file)

### Experiment-3 Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Experiment 3: decision-tree regression, tracked as a single MLflow run.
with mlflow.start_run():
    mlflow.set_tag("dev", "Abdul Azad")
    mlflow.set_tag("algo", "Decision Tree Regression")
    # log the data for each run using log_param, log_metric
    mlflow.log_param("data-path", "diamonds.csv")
    d = None  # no depth limit
    # Fix the estimator's seed (same value as the train/test split) so that
    # re-running the notebook reproduces the logged metrics.
    seed = 4
    dt_regressor = DecisionTreeRegressor(max_depth = d, random_state = seed)
    dt_regressor.fit(X_train_rescaled, y_train)
    y_test_pred = dt_regressor.predict(X_test_rescaled)
    # evaluation on the held-out split (`acc` holds the R2 score)
    acc = metrics.r2_score(y_test, y_test_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    mlflow.log_param("max_depth", d)
    mlflow.log_param("random_state", seed)
    mlflow.log_metric("R2 Score", acc)
    mlflow.log_metric("RMSE", rmse)
    # store the model together with the preprocessing objects it depends on
    mlflow.sklearn.log_model(dt_regressor, artifact_path = "models")
    mlflow.log_artifact("models/standard_scaler.pkl")
    mlflow.log_artifact("models/label_encoder.pkl")

### Experiment-4 Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Experiment 4: random-forest regression, tracked as a single MLflow run.
with mlflow.start_run():
    mlflow.set_tag("dev", "Abdul Azad")
    mlflow.set_tag("algo", "Random Forest")
    # log the data for each run using log_param, log_metric
    mlflow.log_param("data-path", "diamonds.csv")
    e = 101  # number of trees in the forest
    # Fix the estimator's seed (same value as the train/test split): without it
    # every run builds a different forest and the logged metrics cannot be
    # reproduced.
    seed = 4
    rf_regressor = RandomForestRegressor(n_estimators = e, random_state = seed)
    rf_regressor.fit(X_train_rescaled, y_train)
    y_test_pred = rf_regressor.predict(X_test_rescaled)
    # evaluation on the held-out split (`acc` holds the R2 score)
    acc = metrics.r2_score(y_test, y_test_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    mlflow.log_param("n_estimators", e)
    mlflow.log_param("random_state", seed)
    mlflow.log_metric("R2 Score", acc)
    mlflow.log_metric("RMSE", rmse)
    # store the model together with the preprocessing objects it depends on
    mlflow.sklearn.log_model(rf_regressor, artifact_path = "models")
    mlflow.log_artifact("models/standard_scaler.pkl")
    mlflow.log_artifact("models/label_encoder.pkl")

### Experiment-5 Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Experiment 5: Gaussian Naive Bayes, tracked as a single MLflow run.
# NOTE(review): GaussianNB is a classifier, so every distinct price value in
# y_train is treated as its own class - confirm this experiment is intended to
# sit alongside the regressors above.
with mlflow.start_run():
    mlflow.set_tag("dev", "Abdul Azad")
    mlflow.set_tag("algo", "GaussianNB")
    # same dataset file as the other experiments
    mlflow.log_param("data-path", "diamonds.csv")
    nb_classifier = GaussianNB()
    nb_classifier.fit(X_train_rescaled, y_train)
    y_test_pred = nb_classifier.predict(X_test_rescaled)
    # Exact-match accuracy is logged under its own name; it is not an R2 score.
    accuracy = metrics.accuracy_score(y_test, y_test_pred)
    # The predicted labels are price values, so the same R2 / RMSE metrics as
    # the other experiments can be computed and compared in the MLflow UI.
    acc = metrics.r2_score(y_test, y_test_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("R2 Score", acc)
    mlflow.log_metric("RMSE", rmse)
    # store the model together with the preprocessing objects it depends on
    mlflow.sklearn.log_model(nb_classifier, artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")
    mlflow.log_artifact("models/label_encoder.pkl")