# Step 1: Import python libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Lift the display caps so DataFrames are shown with every row and column
# (None removes the limit). The exact lowercase option keys are used so the
# lookup is a direct key match rather than a case-insensitive pattern search.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Step 2 : Import Dataset using pandas

In [None]:
# Load the dataset from the CSV file into a DataFrame
df = pd.read_csv("Dentistry Dataset.csv")
# Preview the first five rows
df.head(5)

In [None]:
# Report the dimensions of the DataFrame
n_rows, n_cols = df.shape
print(f"Total number of rows : {n_rows}")
print(f"Total number of columns : {n_cols}")

In [None]:
# Inspect the dtype that pandas assigned to each column of df
df.dtypes

# Step 3 : Data Preprocessing

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# One-hot encode the Gender column; the new columns are named "Gender_<value>"
gender_dummies = pd.get_dummies(df["Gender"], prefix="Gender")

# Append the dummy columns to the original frame, then discard the
# "Sample ID" and "Sl No" columns, in one chained expression
df_numeric_only = (
    pd.concat([df, gender_dummies], axis=1)
    .drop(columns=["Sample ID", "Sl No"])
)

In [None]:
# Preview the first rows of the processed frame
df_numeric_only.head()

In [None]:
from sklearn.model_selection import train_test_split

# Split independent & dependent variables, i.e. X and y.
# The frame also carries the one-hot "Gender_*" columns that were derived from
# the target; leaving them in X would leak the label into the features, so
# they are dropped together with the target itself.
target_derived_cols = [col for col in df_numeric_only.columns if str(col).startswith("Gender_")]
X = df_numeric_only.drop(columns=["Gender", *target_derived_cols])    # independent features: 2-D DataFrame
y = df_numeric_only["Gender"]

# Hold out 20% of the rows for testing. The fixed seed makes the split
# reproducible; stratifying on y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Normalize the X variable
from sklearn.preprocessing import Normalizer

# Feature frame for normalisation: drop the target and the one-hot "Gender_*"
# columns derived from it, so only the remaining features contribute to the norms
target_cols = [
    col for col in df_numeric_only.columns
    if str(col) == "Gender" or str(col).startswith("Gender_")
]
x = df_numeric_only.drop(columns=target_cols)

# Normalizer rescales each ROW (sample) to unit norm; it does not scale the
# columns, so this is not a per-feature standardisation
normalizer = Normalizer()
x_normalized = normalizer.fit_transform(x)
print(x_normalized)

# Summary Statistics of Numerical features

In [None]:
# Descriptive statistics of the processed frame via DataFrame.describe()
df_numeric_only.describe()

# Step 4 : Exploratory Data Analysis

# Correlation matrix

In [None]:
# Pairwise Pearson correlation between the numeric columns
corr_matrix = df_numeric_only.corr(method="pearson", numeric_only=True)
corr_matrix

# Heat map

In [None]:
# Draw the annotated correlation heatmap on an explicitly created Axes
fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(
    df_numeric_only.corr(numeric_only=True),
    annot=True,
    cmap="coolwarm",
    ax=ax,
)
ax.set_title("Correlation Heatmap", fontsize=16)
# Tilt the x tick labels so long column names do not overlap
plt.setp(ax.get_xticklabels(), rotation=80)
fig.tight_layout()
plt.show()

As you can see from this correlation heatmap, there are some features that are highly correlated with other features.

# Gender

In [None]:
fig, ax = plt.subplots(figsize=(6, 5))
# Draw the count plot on the Axes created above instead of relying on the
# implicit "current axes"
sns.countplot(df_numeric_only, x="Gender", palette='Set2', ax=ax)
ax.set_title("Count of Gender")
ax.set_ylabel("Count", labelpad=20, fontsize=10)
# The tick labels already name the categories, so the x-axis label is cleared
# once here rather than on every pass of the bar-label loop
ax.set_xlabel('')
plt.setp(ax.get_xticklabels(), rotation=45)
# Annotate every bar with its count
for container in ax.containers:
    ax.bar_label(container)

As you can see here, there are equal numbers of males and females.

# Pie chart representation to determine the percentage of outliers in each feature

In [None]:
# import the necessary python library
import ipywidgets as widgets
from IPython.display import display, clear_output

# Columns offered in the dropdown: every float64 / int64 column of the frame
numeric_column_names = (
    df_numeric_only
    .select_dtypes(include=["float64", "int64"])
    .columns
    .tolist()
)

# Dropdown widget for choosing a single column
column_selector = widgets.Dropdown(
    options=numeric_column_names,
    description='Column',
    disabled=False,
)

# Function to plot pie chart for the selected column
def plot_outliers(selected_column):
    """Draw a pie chart of the outlier share in one column of ``df_numeric_only``.

    A value is flagged as an outlier when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile. Before drawing, the cell
    output is cleared and the dropdown widget is displayed again.

    Parameters
    ----------
    selected_column
        Label of the column in ``df_numeric_only`` to analyse.
    """
    values = df_numeric_only[selected_column]

    # Fences built from the inter-quartile range
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    outlier_mask = (values < lower_fence) | (values > upper_fence)
    outlier_count = outlier_mask.sum()
    # Everything not flagged above (missing values included) counts as a non-outlier
    inlier_count = len(values) - outlier_count

    # Clear the previous output, then show the dropdown widget again
    clear_output(wait=True)
    display(column_selector)

    # Plot the new pie chart
    fig, ax = plt.subplots()
    ax.pie(
        [outlier_count, inlier_count],
        explode=(0.1, 0),
        labels=["Outliers", "Non-Outliers"],
        colors=["#ff9999", "#66b3ff"],
        autopct='%1.1f%%',
        startangle=90,
    )
    ax.axis("equal")
    plt.title("Percentage of outliers in {}".format(selected_column))
    plt.show()

# Function to handle the interaction
def on_column_select(change):
    """Redraw the outlier pie chart when the dropdown's ``value`` changes.

    ``change`` is the notification dict delivered by the widget. Notifications
    of any other type, or for any other trait name, are ignored.
    """
    is_value_change = change['type'] == 'change' and change['name'] == 'value'
    if not is_value_change:
        return
    plot_outliers(change['new'])

# Attach the handler to the dropdown widget. names='value' limits the
# notifications to changes of the selected value, so the handler is not
# invoked for changes to the widget's other traits.
column_selector.observe(on_column_select, names='value')

# Display the dropdown widget
display(column_selector)

# Initial plot
plot_outliers(column_selector.value)

# Boxplot of numerical features

In [None]:
numerical_cols = df_numeric_only.select_dtypes(include=["float64","int64"])

# Size the grid from the number of columns (two plots per row) instead of
# hard-coding it, so additional columns cannot run past the available axes
n_plot_cols = 2
n_features = len(numerical_cols.columns)
n_plot_rows = max(1, -(-n_features // n_plot_cols))    # ceiling division
fig, axes = plt.subplots(
    n_plot_rows, n_plot_cols,
    figsize=(12, 2.5 * n_plot_rows),
    squeeze=False,    # always a 2-D array, even for a single row
)
axes = axes.flatten()

# One boxplot per numeric feature
for box_ax, col in zip(axes, numerical_cols.columns):
    sns.boxplot(x=numerical_cols[col], ax=box_ax, color="violet")
    box_ax.set_title(col)

# Hide any remaining empty subplots (no reliance on a leaked loop variable)
for unused_ax in axes[n_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Histogram plot for numerical features

In [None]:
numerical_cols = df_numeric_only.select_dtypes(include=["float64","int64"])

# Two plots per row; the row count follows the number of columns rather than
# a hard-coded grid, so no plot position can fall outside the grid
n_plot_cols = 2
n_plot_rows = max(1, -(-len(numerical_cols.columns) // n_plot_cols))    # ceiling division

plt.figure(figsize=(15,15))
# Density histogram with a KDE curve for every numeric feature
for position, col in enumerate(numerical_cols.columns, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, position)
    sns.histplot(numerical_cols, x=col, kde=True, stat='density', color='violet')
    plt.xlabel(col, fontsize=20)
    plt.ylabel('Density', fontsize=14)
plt.tight_layout()
plt.show()

From this histogram plot we can see the data distribution and skewness of each feature.

In [None]:
# Scatter of "right canine width intraoral" against "right canine index casts"
scatter_ax = sns.scatterplot(
    data=df_numeric_only,
    x="right canine width intraoral",
    y="right canine index casts",
    color='violet',
)
scatter_ax.set_title("Relationship between right canine width intraoral vs right canine index casts")

The figure shows a positive correlation between the width of the right canine tooth measured intraorally and the index of the right canine tooth measured on casts. This means that as the width of the right canine tooth measured intraorally increases, the index of the right canine tooth measured on casts also tends to increase. This correlation is not perfect, but it is clear.

In [None]:
# Scatter of "right canine width intraoral" against "left canine width intraoral"
scatter_ax = sns.scatterplot(
    data=df_numeric_only,
    x="right canine width intraoral",
    y="left canine width intraoral",
    color='violet',
)
scatter_ax.set_title("Relationship between right canine width intraoral vs left canine width intraoral")

The figure above shows a positive correlation between the two features: as one increases, the other also increases.

In [None]:
# Scatter of "left canine width intraoral" against "left canine width casts"
scatter_ax = sns.scatterplot(
    data=df_numeric_only,
    x="left canine width intraoral",
    y="left canine width casts",
    color='r',
)
scatter_ax.set_title("Relationship between Left canine width intraoral vs Left canine width casts")

It shows a positive correlation between left canine width casts and left canine width intraoral; the two are roughly directly proportional to each other, so as one increases the other also increases.

In [None]:
# Scatter of "right canine index casts" against "left canine index casts"
scatter_ax = sns.scatterplot(
    data=df_numeric_only,
    x="right canine index casts",
    y="left canine index casts",
    color='r',
)
scatter_ax.set_title("Relationship between right canine index casts vs Left canine index casts")

The scatter plot shows a positive correlation between the right and left canine index casts. This means that as the right canine index cast increases, the left canine index cast also tends to increase. The strong positive correlation suggests that the measurements of the right and left canine index casts are highly related.

# Step 5 : Model Building

# Train Test split

In [None]:
from sklearn.model_selection import train_test_split

# Split independent & dependent variables, i.e. X and y.
# The frame also carries the one-hot "Gender_*" columns that were derived from
# the target; leaving them in X would leak the label into the features, so
# they are dropped together with the target itself.
target_derived_cols = [col for col in df_numeric_only.columns if str(col).startswith("Gender_")]
X = df_numeric_only.drop(columns=["Gender", *target_derived_cols])    # independent features: 2-D DataFrame
y = df_numeric_only["Gender"]

# Hold out 20% of the rows for testing. The fixed seed makes the split
# reproducible; stratifying on y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# max_iter is raised above the default so the solver has room to converge on
# the unscaled features; warnings are silenced globally in this notebook, so
# a fit that stopped at the iteration limit would otherwise go unnoticed
log_reg = LogisticRegression(max_iter=1000)

# Train the model on the training data
log_reg.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = log_reg.predict(X_test)

# Mean accuracy on the training and testing partitions
train_score = log_reg.score(X_train,y_train)
test_score = log_reg.score(X_test,y_test)

# Evaluate the model
print("Train score : {}".format(train_score))
print("Test score : {}".format(test_score))
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

In [None]:
# The predictions are class labels, so show how many test samples were
# assigned to each class; a histogram with a KDE curve is not meaningful for
# categorical values
pred_ax = sns.countplot(x=y_pred)
pred_ax.set(title="Predicted class counts (Logistic Regression)", xlabel="Predicted Gender", ylabel="Count");

# Decision Tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# A fixed seed makes the fitted tree, and therefore the reported scores,
# reproducible from run to run
dec_tree = DecisionTreeClassifier(random_state=42)

# Train the model on the training data
dec_tree.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = dec_tree.predict(X_test)

# Mean accuracy on the training and testing partitions
train_score = dec_tree.score(X_train,y_train)
test_score = dec_tree.score(X_test,y_test)

# Evaluate the model
print("Train score : {}".format(train_score))
print("Test score : {}".format(test_score))
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

In [None]:
# Configure an entropy-based tree with limited depth and a minimum split size.
# NOTE(review): this rebinds `dec_tree` to an unfitted estimator, and nothing
# in the visible notebook fits or evaluates it - confirm whether that is intended.
dec_tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    min_samples_split=5,
)

# Random Forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 100 trees with a fixed seed, fitted on the training partition
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = rfc.predict(X_test)

# Mean accuracy on each partition, plus the accuracy of the predictions above
train_score = rfc.score(X_train, y_train)
test_score = rfc.score(X_test, y_test)
accuracy = accuracy_score(y_test, y_pred)

# Report all metrics together
print("Train score : {}".format(train_score))
print("Test score : {}".format(test_score))
print('Accuracy:', accuracy)
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

In [None]:
# Configure a larger, more constrained forest (500 trees, depth and leaf limits).
# NOTE(review): this rebinds `rfc` to an unfitted estimator, and nothing in the
# visible notebook fits or evaluates it - confirm whether that is intended.
rfc = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)

# XGBOOST Classifier

In [None]:
! pip install xgboost

In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Convert categorical labels to integer labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create a DMatrix object for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train_encoded)
dtest = xgb.DMatrix(X_test, label=y_test_encoded)

# Define the XGBoost classifier parameters
params = {
    'objective': 'multi:softprob',
    # Taken from the fitted encoder instead of being hard-coded, so the model
    # cannot emit a class index that inverse_transform has no label for
    'num_class': len(label_encoder.classes_),
    'max_depth': 6,
    'learning_rate': 0.1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'seed': 42
}

# Train the XGBoost model
bst = xgb.train(params, dtrain, num_boost_round=1000)

# Evaluate the model on the training data
train_eval = bst.eval(dtrain)
print("Train eval:", train_eval)

# Evaluate the model on the testing data
test_eval = bst.eval(dtest)
print("Test eval:", test_eval)

# Make predictions on the testing data: one probability per class for each
# row, reduced to the index of the most probable class
y_pred_prob = bst.predict(dtest)
y_pred = y_pred_prob.argmax(axis=1)

# Convert y_pred back to original labels
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_labels)
print('Accuracy:', accuracy)
print('Classification Report:')
print(classification_report(y_test, y_pred_labels))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_labels))


In [None]:
# Plot the decoded class labels rather than the integer argmax codes, and
# count them per class; a KDE curve is not meaningful for categorical values
pred_ax = sns.countplot(x=y_pred_labels)
pred_ax.set(title="Predicted class counts (XGBoost)", xlabel="Predicted Gender", ylabel="Count");

# Conclusion: All the models achieve high accuracy scores, so the data is fit well by every model tried (Logistic Regression, Decision Tree classifier, Random Forest classifier, XGBoost classifier).