In [None]:
'''
ABOUT THE DATASET

It is a collection of basic health biosignals
There are 55692 entries (rows) with 27 columns.
The following are the columns:-
ID : index
gender
age : recorded in 5-year intervals
height(cm)
weight(kg)
waist(cm) : Waist circumference length
eyesight(left)
eyesight(right)
hearing(left)
hearing(right)
systolic : Blood pressure
relaxation : Blood pressure
fasting blood sugar
Cholesterol : total
triglyceride
HDL : cholesterol type
LDL : cholesterol type
hemoglobin
Urine protein
serum creatinine
AST : glutamic oxaloacetic transaminase type
ALT : glutamic pyruvic transaminase type
Gtp : γ-GTP
oral : Oral Examination status
dental caries
tartar : tartar status
smoking : 0 or 1

'''

#First, we shall analyse the data

#importing all necessary libraries for performing analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Location of the raw CSV. NOTE(review): this is an absolute path on the
# original author's machine; point it at your local copy of smoking.csv.
DATA_PATH = "/Users/devanshigupta/Documents/PYTHON CODE/ML IITK/find_smokers_by_vital_signs_dataset/smoking.csv"

data = pd.read_csv(DATA_PATH)

# First look at the data set.
# head() and shape are wrapped in print(): a bare expression is only echoed
# by the notebook when it is the last statement of a cell, so without print()
# the headings below would be followed by no output at all.
print("ORIGINAL DATA SET")
print(data.head())
print("SHAPE OF DATASET")
print(data.shape)
print("EXAMINE TYPE OF EACH COLUMN")
# info() writes its summary to stdout itself, so no print() is needed here.
data.info()

In [None]:
#Lets work on visualisations next

import matplotlib.pyplot as plt
import seaborn as sns
# Two-colour palette shared by every plot below (one colour per smoking class).
custom_palette = sns.color_palette("husl", 2)


# Visualize the distribution of the target variable.
# The column is named through x=/data=: seaborn 0.12+ accepts only `data`
# positionally, so a bare positional Series is no longer treated as `x`.
plt.figure(figsize=(8, 6))
sns.countplot(x='smoking', data=data, palette=custom_palette)
plt.title('Distribution of Smokers and Non-Smokers')
plt.xlabel('Smoking Status')
plt.ylabel('Count')
plt.xticks([0, 1], ['Non-Smoker', 'Smoker'])
plt.show()

# Visualize the relationship between each body measurement and smoking status.
# Each entry is (column name, label used in the title and on the y axis).
boxplot_columns = (
    ('age', 'Age'),
    ('height(cm)', 'height(cm)'),
    ('weight(kg)', 'weight(kg)'),
)
for column, label in boxplot_columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='smoking', y=column, data=data, palette=custom_palette)
    plt.title(f'{label} vs Smoking Status')
    plt.xlabel('Smoking Status')
    plt.ylabel(label)
    plt.xticks([0, 1], ['Non-Smoker', 'Smoker'])
    plt.show()

# Visualize the correlation matrix.
# Only numeric columns are correlated: the frame may still contain text
# columns at this point (they are label-encoded in a later cell), and
# DataFrame.corr() raises on non-numeric data in pandas 2.x.
plt.figure(figsize=(14, 10))
correlation_matrix = data.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Visualize the relationship between each categorical feature and smoking
# status. Each entry is (column, title, x-axis label, tick labels).
# NOTE(review): the tick labels assume each column has exactly two categories
# whose sorted order matches the label order (e.g. F < M, N < Y) -- confirm
# against the CSV, in particular for 'oral'.
categorical_plots = (
    ('gender', 'Gender vs Smoking Status', 'Gender', ['Female', 'Male']),
    ('oral', 'Oral Health vs Smoking Status', 'Oral Health Status', ['Good', 'Bad']),
    ('tartar', 'Tartar vs Smoking Status', 'Tartar', ['No Tartar', 'Tartar']),
)
for column, title, axis_label, tick_labels in categorical_plots:
    # Pin the bar order to the sorted category values. Otherwise seaborn uses
    # order of appearance for text columns, so the labels below could end up
    # on the wrong bar depending on which value occurs first in the file.
    category_order = sorted(data[column].dropna().unique())

    plt.figure(figsize=(8, 6))
    sns.countplot(x=column, hue='smoking', data=data,
                  order=category_order, palette=custom_palette)
    plt.title(title)
    plt.xlabel(axis_label)
    plt.ylabel('Count')
    # Only relabel when the number of categories matches the number of labels;
    # otherwise keep the raw category values rather than mislabel the bars.
    if len(category_order) == len(tick_labels):
        plt.xticks(range(len(tick_labels)), tick_labels)
    plt.show()

In [None]:
#DEVELOPING ML MODEL
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Check for missing values
missing_values = data.isnull().sum()
print(missing_values)

# Encode categorical variables (in place on `data`, as before).
# One encoder per column is kept in `label_encoders` so each column's
# class -> integer mapping (encoder.classes_) stays available afterwards.
# `label_encoder` is left bound to the last encoder for backward compatibility.
label_encoders = {}
for column in ('gender', 'oral', 'tartar'):
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# Separate features and target
X = data.drop(columns=['ID', 'smoking'])
y = data['smoking']

# Split the data into training and testing sets BEFORE scaling, so that the
# scaler below never sees the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical variables.
# The scaler is fitted on the training rows only and then applied to the test
# rows; fitting it on the full dataset would leak the test set's mean and
# standard deviation into training. `X` itself is left unscaled.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
# Work on explicit copies so the column assignments below do not trigger
# pandas' chained-assignment warning on the frames returned by the split.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Train a Random Forest Classifier
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)

# Predict on the test set using Random Forest
y_pred_rf = random_forest_model.predict(X_test)

# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

print("\nRandom Forest Classifier")
print(f"Accuracy: {accuracy_rf}")
print("Classification Report:")
print(report_rf)


# Train a Decision Tree Classifier
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, y_train)

# Predict on the test set using Decision Tree
y_pred_tree = decision_tree_model.predict(X_test)

# Evaluate the Decision Tree model
accuracy_tree = accuracy_score(y_test, y_pred_tree)
report_tree = classification_report(y_test, y_pred_tree)

print("Decision Tree Classifier")
print(f"Accuracy: {accuracy_tree}")
print("Classification Report:")
print(report_tree)

