# Machine Learning

## Best Model Selection  

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Importing Classification Models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Importing Classification Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
# Ignoring Warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Loading the Dataset
df = sns.load_dataset("diamonds")
print(df.head())

   carat      cut color clarity  depth  table  price     x     y     z
0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75


In [4]:
# Getting basic info of our Dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [5]:
# Let's check for missing values in our dataset
print(df.isnull().sum().sort_values(ascending=False))

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64


In [6]:
# Create a LabelEncoder object
le = LabelEncoder()

# Encode all categorical columns in the DataFrame
for col in df.select_dtypes(include=['object', 'category']).columns:
    df[col] = le.fit_transform(df[col])

In [7]:
# Getting the name of columns
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

## Train Test Split

In [8]:
# Selecting Features and Target Variable
X = df.drop("clarity", axis=1)
y = df["clarity"]
# Splitting the Dataset using Train-Test-Split by 80/20 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Create a dictionary of models to evaluate
models = { 
    'XGB Classifier': XGBClassifier(),  # XGBoost classifier
    'Logistic Regression': LogisticRegression(),  # Logistic Regression
    'Random Forest Classifier': RandomForestClassifier(),  # Random Forest
    'Support Vector Classifier': SVC(),  # Support Vector Machine
    'K Neighbors Classifier': KNeighborsClassifier(),  # KNN Classifier
    'Gradient Boosting Classifier': GradientBoostingClassifier()  # Gradient Boosting
}

model_scores = []  # List to store model names and their accuracy

# Loop through each model
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model on training data
    y_pred = model.predict(X_test)  # Predict on test data
    acc = accuracy_score(y_test, y_pred)  # Calculate accuracy
    model_scores.append((name, acc))  # Append model name and accuracy to list

# Sort the models by accuracy in ascending order
model_scores_sorted = sorted(model_scores, key=lambda x: x[1])

# Create a DataFrame for results
results_df = pd.DataFrame(model_scores_sorted, columns=['Model', 'Accuracy'])

# Print the results table
print(results_df.to_string(index=False))  # Display as a table

# Get the best model
best_model = results_df.iloc[-1]  # Last row contains the best model
print(f"\nBest Model: {best_model['Model']} with Accuracy: {best_model['Accuracy']:.4f}")

                       Model  Accuracy
         Logistic Regression  0.239525
   Support Vector Classifier  0.271042
      K Neighbors Classifier  0.420838
Gradient Boosting Classifier  0.521320
              XGB Classifier  0.657953
    Random Forest Classifier  0.688265

Best Model: Random Forest Classifier with Accuracy: 0.6883
