In [1]:

# Core
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Data preparation imports (from 03 Data Preparation)
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OrdinalEncoder

# Supervised learning imports (from 04 Supervised Learning)
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Plotting in notebook
%matplotlib inline
sns.set(style="whitegrid", rc={"figure.figsize": (7,4)})

In [2]:
df = pd.read_csv("smoking-v1.csv")
df.head()

Unnamed: 0,gender,age,marital_status,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays,type
0,Male,38.0,Divorced,No Qualification,British,White,"2,600 to 5,200",The North,No,,,
1,Female,42.0,Single,No Qualification,British,White,"Under 2,600",The North,Yes,12.0,12.0,Packets
2,Male,40.0,Married,Degree,English,White,"28,600 to 36,400",The North,No,,,
3,Female,,Married,Degree,English,White,"10,400 to 15,600",The North,No,,,
4,Female,39.0,Married,GCSE/O Level,British,White,"2,600 to 5,200",The North,No,,,


In [3]:
df["marital_status"].isna().mean()

np.float64(0.0)

In [4]:
def encode_or_drop_marital_status(df):
    if "marital_status" not in df.columns:
        print("Column 'marital_status' not found — skipping.")
        return df

    # Fill missing values
    df["marital_status"] = df["marital_status"].fillna(df["marital_status"].mode()[0])

    # One-hot encode
    dummies = pd.get_dummies(df["marital_status"], prefix="marital").astype(int)

    # Drop the original
    df = df.drop(columns=["marital_status"])

    # Concatenate **side by side** with axis=1
    df = pd.concat([df, dummies], axis=1)
    return df

In [6]:
encode_or_drop_marital_status(df)

Unnamed: 0,gender,age,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays,type,marital_Divorced,marital_Married,marital_Separated,marital_Single,marital_Widowed
0,Male,38.0,No Qualification,British,White,"2,600 to 5,200",The North,No,,,,1,0,0,0,0
1,Female,42.0,No Qualification,British,White,"Under 2,600",The North,Yes,12.0,12.0,Packets,0,0,0,1,0
2,Male,40.0,Degree,English,White,"28,600 to 36,400",The North,No,,,,0,1,0,0,0
3,Female,,Degree,English,White,"10,400 to 15,600",The North,No,,,,0,1,0,0,0
4,Female,39.0,GCSE/O Level,British,White,"2,600 to 5,200",The North,No,,,,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,Male,22.0,No Qualification,Scottish,White,"2,600 to 5,200",Scotland,No,,,,0,0,0,1,0
1687,Female,49.0,Other/Sub Degree,English,White,"2,600 to 5,200",Scotland,Yes,20.0,20.0,Hand-Rolled,1,0,0,0,0
1688,Male,45.0,Other/Sub Degree,Scottish,White,"5,200 to 10,400",Scotland,No,,,,0,1,0,0,0
1689,Female,51.0,No Qualification,English,White,"2,600 to 5,200",Scotland,Yes,20.0,20.0,Packets,0,1,0,0,0


In [10]:
def gender_encode(df):
    df["gender"] = df["gender"].astype(str).str.lower().map({
        "male": 1,
        "m": 1,
        "female": 0,
        "f": 0
    })

    df["gender"].mode()[0]
    return df

In [11]:
gender_encode(df)

Unnamed: 0,gender,age,marital_status,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays,type
0,1,38.0,Divorced,No Qualification,British,White,"2,600 to 5,200",The North,No,,,
1,0,42.0,Single,No Qualification,British,White,"Under 2,600",The North,Yes,12.0,12.0,Packets
2,1,40.0,Married,Degree,English,White,"28,600 to 36,400",The North,No,,,
3,0,,Married,Degree,English,White,"10,400 to 15,600",The North,No,,,
4,0,39.0,Married,GCSE/O Level,British,White,"2,600 to 5,200",The North,No,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1686,1,22.0,Single,No Qualification,Scottish,White,"2,600 to 5,200",Scotland,No,,,
1687,0,49.0,Divorced,Other/Sub Degree,English,White,"2,600 to 5,200",Scotland,Yes,20.0,20.0,Hand-Rolled
1688,1,45.0,Married,Other/Sub Degree,Scottish,White,"5,200 to 10,400",Scotland,No,,,
1689,0,51.0,Married,No Qualification,English,White,"2,600 to 5,200",Scotland,Yes,20.0,20.0,Packets


In [13]:
df.isna().sum()

gender                      0
age                        98
marital_status              0
highest_qualification       0
nationality                 0
ethnicity                   0
gross_income               75
region                      0
smoke                       0
amt_weekends             1270
amt_weekdays             1270
type                     1270
dtype: int64

In [14]:
def drop_amt_weekends(df):
    df = df.drop("amt_weekends", axis=1)
    return df

In [15]:
def drop_amt_weekdays(df):
    df = df.drop("amt_weekdays", axis=1)
    return df

In [16]:
def drop_type(df):
    df = df.drop("type", axis=1)
    return df

In [17]:
drop_amt_weekdays(df)
drop_amt_weekends(df)
drop_type(df)

Unnamed: 0,gender,age,marital_status,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays
0,1,38.0,Divorced,No Qualification,British,White,"2,600 to 5,200",The North,No,,
1,0,42.0,Single,No Qualification,British,White,"Under 2,600",The North,Yes,12.0,12.0
2,1,40.0,Married,Degree,English,White,"28,600 to 36,400",The North,No,,
3,0,,Married,Degree,English,White,"10,400 to 15,600",The North,No,,
4,0,39.0,Married,GCSE/O Level,British,White,"2,600 to 5,200",The North,No,,
...,...,...,...,...,...,...,...,...,...,...,...
1686,1,22.0,Single,No Qualification,Scottish,White,"2,600 to 5,200",Scotland,No,,
1687,0,49.0,Divorced,Other/Sub Degree,English,White,"2,600 to 5,200",Scotland,Yes,20.0,20.0
1688,1,45.0,Married,Other/Sub Degree,Scottish,White,"5,200 to 10,400",Scotland,No,,
1689,0,51.0,Married,No Qualification,English,White,"2,600 to 5,200",Scotland,Yes,20.0,20.0


In [20]:
df["ethnicity"].unique().sum()

'WhiteMixedBlackRefusedAsianChineseUnknown'