In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
    confusion_matrix,
    classification_report,
)
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    LassoCV,
    RidgeCV,
    LogisticRegression,
)
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [2]:
df = pd.read_csv("smoking-v1.csv")

In [3]:
df.head()

Unnamed: 0,gender,age,marital_status,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays,type
0,Male,38.0,Divorced,No Qualification,British,White,"2,600 to 5,200",The North,No,,,
1,Female,42.0,Single,No Qualification,British,White,"Under 2,600",The North,Yes,12.0,12.0,Packets
2,Male,40.0,Married,Degree,English,White,"28,600 to 36,400",The North,No,,,
3,Female,,Married,Degree,English,White,"10,400 to 15,600",The North,No,,,
4,Female,39.0,Married,GCSE/O Level,British,White,"2,600 to 5,200",The North,No,,,


In [4]:
df["gender"].unique()

array(['Male', 'Female'], dtype=object)

In [5]:
df["gender"] = df["gender"].map({"Male": 0, "Female": 1})

In [6]:
df.head()

Unnamed: 0,gender,age,marital_status,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays,type
0,0,38.0,Divorced,No Qualification,British,White,"2,600 to 5,200",The North,No,,,
1,1,42.0,Single,No Qualification,British,White,"Under 2,600",The North,Yes,12.0,12.0,Packets
2,0,40.0,Married,Degree,English,White,"28,600 to 36,400",The North,No,,,
3,1,,Married,Degree,English,White,"10,400 to 15,600",The North,No,,,
4,1,39.0,Married,GCSE/O Level,British,White,"2,600 to 5,200",The North,No,,,


In [7]:
df["marital_status"].unique()

array(['Divorced', 'Single', 'Married', 'Widowed', 'Separated'],
      dtype=object)

In [8]:
df.isnull().sum()

gender                      0
age                        98
marital_status              0
highest_qualification       0
nationality                 0
ethnicity                   0
gross_income               75
region                      0
smoke                       0
amt_weekends             1270
amt_weekdays             1270
type                     1270
dtype: int64

In [9]:
df = pd.concat([df, pd.get_dummies(df["marital_status"])], axis=1)
df.drop(columns=["marital_status"], axis=1, inplace=True)

In [10]:
df.head()

Unnamed: 0,gender,age,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays,type,Divorced,Married,Separated,Single,Widowed
0,0,38.0,No Qualification,British,White,"2,600 to 5,200",The North,No,,,,True,False,False,False,False
1,1,42.0,No Qualification,British,White,"Under 2,600",The North,Yes,12.0,12.0,Packets,False,False,False,True,False
2,0,40.0,Degree,English,White,"28,600 to 36,400",The North,No,,,,False,True,False,False,False
3,1,,Degree,English,White,"10,400 to 15,600",The North,No,,,,False,True,False,False,False
4,1,39.0,GCSE/O Level,British,White,"2,600 to 5,200",The North,No,,,,False,True,False,False,False


In [11]:
df["highest_qualification"].unique()

array(['No Qualification', 'Degree', 'GCSE/O Level', 'GCSE/CSE',
       'Other/Sub Degree', 'Higher/Sub Degree', 'ONC/BTEC', 'A Levels'],
      dtype=object)

In [19]:
educational_order = [
    "No Qualification",
    "GCSE/CSE",
    "GCSE/O Level",
    "A Levels",
    "ONC/BTEC",
    "Other/Sub Degree",
    "Higher/Sub Degree",
    "Degree"
]

from sklearn.preprocessing import OrdinalEncoder
import numpy as np

encoder = OrdinalEncoder(categories=[educational_order], handle_unknown="use_encoded_value", unknown_value=np.nan)
df[["highest_qualification"]] = encoder.fit_transform(df[["highest_qualification"]])

print(df[['highest_qualification', 'highest_qualification_encoded']].head(10))

   highest_qualification  highest_qualification_encoded
0                    0.0                            0.0
1                    0.0                            0.0
2                    7.0                            7.0
3                    7.0                            7.0
4                    2.0                            2.0
5                    2.0                            2.0
6                    7.0                            7.0
7                    7.0                            7.0
8                    1.0                            1.0
9                    0.0                            0.0


In [20]:
df.head()

Unnamed: 0,gender,age,highest_qualification,nationality,ethnicity,gross_income,region,smoke,amt_weekends,amt_weekdays,type,Divorced,Married,Separated,Single,Widowed,highest_qualification_encoded
0,0,38.0,0.0,British,White,"2,600 to 5,200",The North,No,,,,True,False,False,False,False,0.0
1,1,42.0,0.0,British,White,"Under 2,600",The North,Yes,12.0,12.0,Packets,False,False,False,True,False,0.0
2,0,40.0,7.0,English,White,"28,600 to 36,400",The North,No,,,,False,True,False,False,False,7.0
3,1,,7.0,English,White,"10,400 to 15,600",The North,No,,,,False,True,False,False,False,7.0
4,1,39.0,2.0,British,White,"2,600 to 5,200",The North,No,,,,False,True,False,False,False,2.0


In [21]:
df["nationality"].unique()

array(['British', 'English', 'Scottish', 'Other', 'Welsh', 'Irish',
       'Refused', 'Unknown'], dtype=object)

In [22]:
df["ethnicity"].unique()

array(['White', 'Mixed', 'Black', 'Refused', 'Asian', 'Chinese',
       'Unknown'], dtype=object)

In [23]:
df = pd.concat([df, pd.get_dummies(df["ethnicity"])], axis=1)
df.drop(columns=["ethnicity"], axis=1, inplace=True)

In [24]:
df

Unnamed: 0,gender,age,highest_qualification,nationality,gross_income,region,smoke,amt_weekends,amt_weekdays,type,...,Single,Widowed,highest_qualification_encoded,Asian,Black,Chinese,Mixed,Refused,Unknown,White
0,0,38.0,0.0,British,"2,600 to 5,200",The North,No,,,,...,False,False,0.0,False,False,False,False,False,False,True
1,1,42.0,0.0,British,"Under 2,600",The North,Yes,12.0,12.0,Packets,...,True,False,0.0,False,False,False,False,False,False,True
2,0,40.0,7.0,English,"28,600 to 36,400",The North,No,,,,...,False,False,7.0,False,False,False,False,False,False,True
3,1,,7.0,English,"10,400 to 15,600",The North,No,,,,...,False,False,7.0,False,False,False,False,False,False,True
4,1,39.0,2.0,British,"2,600 to 5,200",The North,No,,,,...,False,False,2.0,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,0,22.0,0.0,Scottish,"2,600 to 5,200",Scotland,No,,,,...,True,False,0.0,False,False,False,False,False,False,True
1687,1,49.0,5.0,English,"2,600 to 5,200",Scotland,Yes,20.0,20.0,Hand-Rolled,...,False,False,5.0,False,False,False,False,False,False,True
1688,0,45.0,5.0,Scottish,"5,200 to 10,400",Scotland,No,,,,...,False,False,5.0,False,False,False,False,False,False,True
1689,1,51.0,0.0,English,"2,600 to 5,200",Scotland,Yes,20.0,20.0,Packets,...,False,False,0.0,False,False,False,False,False,False,True


In [25]:
df["region"].unique()

array(['The North', 'Midlands & East Anglia', 'London', 'South East',
       'South West', 'Wales', 'Scotland'], dtype=object)

In [26]:
df = pd.concat([df, pd.get_dummies(df["region"])], axis=1)
df.drop(columns=["region"], axis=1, inplace=True)

In [27]:
df

Unnamed: 0,gender,age,highest_qualification,nationality,gross_income,smoke,amt_weekends,amt_weekdays,type,Divorced,...,Refused,Unknown,White,London,Midlands & East Anglia,Scotland,South East,South West,The North,Wales
0,0,38.0,0.0,British,"2,600 to 5,200",No,,,,True,...,False,False,True,False,False,False,False,False,True,False
1,1,42.0,0.0,British,"Under 2,600",Yes,12.0,12.0,Packets,False,...,False,False,True,False,False,False,False,False,True,False
2,0,40.0,7.0,English,"28,600 to 36,400",No,,,,False,...,False,False,True,False,False,False,False,False,True,False
3,1,,7.0,English,"10,400 to 15,600",No,,,,False,...,False,False,True,False,False,False,False,False,True,False
4,1,39.0,2.0,British,"2,600 to 5,200",No,,,,False,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,0,22.0,0.0,Scottish,"2,600 to 5,200",No,,,,False,...,False,False,True,False,False,True,False,False,False,False
1687,1,49.0,5.0,English,"2,600 to 5,200",Yes,20.0,20.0,Hand-Rolled,True,...,False,False,True,False,False,True,False,False,False,False
1688,0,45.0,5.0,Scottish,"5,200 to 10,400",No,,,,False,...,False,False,True,False,False,True,False,False,False,False
1689,1,51.0,0.0,English,"2,600 to 5,200",Yes,20.0,20.0,Packets,False,...,False,False,True,False,False,True,False,False,False,False


In [28]:
df["nationality"].unique()

array(['British', 'English', 'Scottish', 'Other', 'Welsh', 'Irish',
       'Refused', 'Unknown'], dtype=object)

In [29]:
df = pd.concat([df, pd.get_dummies(df["nationality"])], axis=1)
df.drop(columns=["nationality"], axis=1, inplace=True)

In [30]:
df

Unnamed: 0,gender,age,highest_qualification,gross_income,smoke,amt_weekends,amt_weekdays,type,Divorced,Married,...,The North,Wales,British,English,Irish,Other,Refused,Scottish,Unknown,Welsh
0,0,38.0,0.0,"2,600 to 5,200",No,,,,True,False,...,True,False,True,False,False,False,False,False,False,False
1,1,42.0,0.0,"Under 2,600",Yes,12.0,12.0,Packets,False,False,...,True,False,True,False,False,False,False,False,False,False
2,0,40.0,7.0,"28,600 to 36,400",No,,,,False,True,...,True,False,False,True,False,False,False,False,False,False
3,1,,7.0,"10,400 to 15,600",No,,,,False,True,...,True,False,False,True,False,False,False,False,False,False
4,1,39.0,2.0,"2,600 to 5,200",No,,,,False,True,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,0,22.0,0.0,"2,600 to 5,200",No,,,,False,False,...,False,False,False,False,False,False,False,True,False,False
1687,1,49.0,5.0,"2,600 to 5,200",Yes,20.0,20.0,Hand-Rolled,True,False,...,False,False,False,True,False,False,False,False,False,False
1688,0,45.0,5.0,"5,200 to 10,400",No,,,,False,True,...,False,False,False,False,False,False,False,True,False,False
1689,1,51.0,0.0,"2,600 to 5,200",Yes,20.0,20.0,Packets,False,True,...,False,False,False,True,False,False,False,False,False,False


In [31]:
df["smoke"].unique()

array(['No', 'Yes'], dtype=object)

In [32]:
df["smoke"] = df["smoke"].map({"No": 0, "Yes": 1})

In [33]:
df

Unnamed: 0,gender,age,highest_qualification,gross_income,smoke,amt_weekends,amt_weekdays,type,Divorced,Married,...,The North,Wales,British,English,Irish,Other,Refused,Scottish,Unknown,Welsh
0,0,38.0,0.0,"2,600 to 5,200",0,,,,True,False,...,True,False,True,False,False,False,False,False,False,False
1,1,42.0,0.0,"Under 2,600",1,12.0,12.0,Packets,False,False,...,True,False,True,False,False,False,False,False,False,False
2,0,40.0,7.0,"28,600 to 36,400",0,,,,False,True,...,True,False,False,True,False,False,False,False,False,False
3,1,,7.0,"10,400 to 15,600",0,,,,False,True,...,True,False,False,True,False,False,False,False,False,False
4,1,39.0,2.0,"2,600 to 5,200",0,,,,False,True,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,0,22.0,0.0,"2,600 to 5,200",0,,,,False,False,...,False,False,False,False,False,False,False,True,False,False
1687,1,49.0,5.0,"2,600 to 5,200",1,20.0,20.0,Hand-Rolled,True,False,...,False,False,False,True,False,False,False,False,False,False
1688,0,45.0,5.0,"5,200 to 10,400",0,,,,False,True,...,False,False,False,False,False,False,False,True,False,False
1689,1,51.0,0.0,"2,600 to 5,200",1,20.0,20.0,Packets,False,True,...,False,False,False,True,False,False,False,False,False,False


In [34]:
df.isnull().sum()

gender                              0
age                                98
highest_qualification               0
gross_income                       75
smoke                               0
amt_weekends                     1270
amt_weekdays                     1270
type                             1270
Divorced                            0
Married                             0
Separated                           0
Single                              0
Widowed                             0
highest_qualification_encoded       0
Asian                               0
Black                               0
Chinese                             0
Mixed                               0
Refused                             0
Unknown                             0
White                               0
London                              0
Midlands & East Anglia              0
Scotland                            0
South East                          0
South West                          0
The North   

In [35]:
df.drop(columns=["amt_weekends", "amt_weekdays", "type"], axis=1, inplace=True)

In [36]:
df

Unnamed: 0,gender,age,highest_qualification,gross_income,smoke,Divorced,Married,Separated,Single,Widowed,...,The North,Wales,British,English,Irish,Other,Refused,Scottish,Unknown,Welsh
0,0,38.0,0.0,"2,600 to 5,200",0,True,False,False,False,False,...,True,False,True,False,False,False,False,False,False,False
1,1,42.0,0.0,"Under 2,600",1,False,False,False,True,False,...,True,False,True,False,False,False,False,False,False,False
2,0,40.0,7.0,"28,600 to 36,400",0,False,True,False,False,False,...,True,False,False,True,False,False,False,False,False,False
3,1,,7.0,"10,400 to 15,600",0,False,True,False,False,False,...,True,False,False,True,False,False,False,False,False,False
4,1,39.0,2.0,"2,600 to 5,200",0,False,True,False,False,False,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,0,22.0,0.0,"2,600 to 5,200",0,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
1687,1,49.0,5.0,"2,600 to 5,200",1,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1688,0,45.0,5.0,"5,200 to 10,400",0,False,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1689,1,51.0,0.0,"2,600 to 5,200",1,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [37]:
df["age"] = df["age"].fillna(df["age"].median())

In [38]:
df

Unnamed: 0,gender,age,highest_qualification,gross_income,smoke,Divorced,Married,Separated,Single,Widowed,...,The North,Wales,British,English,Irish,Other,Refused,Scottish,Unknown,Welsh
0,0,38.0,0.0,"2,600 to 5,200",0,True,False,False,False,False,...,True,False,True,False,False,False,False,False,False,False
1,1,42.0,0.0,"Under 2,600",1,False,False,False,True,False,...,True,False,True,False,False,False,False,False,False,False
2,0,40.0,7.0,"28,600 to 36,400",0,False,True,False,False,False,...,True,False,False,True,False,False,False,False,False,False
3,1,48.0,7.0,"10,400 to 15,600",0,False,True,False,False,False,...,True,False,False,True,False,False,False,False,False,False
4,1,39.0,2.0,"2,600 to 5,200",0,False,True,False,False,False,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,0,22.0,0.0,"2,600 to 5,200",0,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
1687,1,49.0,5.0,"2,600 to 5,200",1,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1688,0,45.0,5.0,"5,200 to 10,400",0,False,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1689,1,51.0,0.0,"2,600 to 5,200",1,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [39]:
df.isnull().sum()

gender                            0
age                               0
highest_qualification             0
gross_income                     75
smoke                             0
Divorced                          0
Married                           0
Separated                         0
Single                            0
Widowed                           0
highest_qualification_encoded     0
Asian                             0
Black                             0
Chinese                           0
Mixed                             0
Refused                           0
Unknown                           0
White                             0
London                            0
Midlands & East Anglia            0
Scotland                          0
South East                        0
South West                        0
The North                         0
Wales                             0
British                           0
English                           0
Irish                       

In [40]:
df["gross_income"].unique()

array(['2,600 to 5,200', 'Under 2,600', '28,600 to 36,400',
       '10,400 to 15,600', '15,600 to 20,800', 'Above 36,400',
       '5,200 to 10,400', 'Refused', '20,800 to 28,600', 'Unknown', nan],
      dtype=object)

In [41]:
income_order = [
    "Refused",
    "Under 2,600",
    "2,600 to 5,200",
    "5,200 to 10,400",
    "10,400 to 15,600",
    "15,600 to 20,800",
    "20,800 to 28,600",
    "28,600 to 36,400",
    "Above 36,400",
    "Unknown"
]

encoder = OrdinalEncoder(categories=[income_order], handle_unknown="use_encoded_value", unknown_value=np.nan)
df["gross_income"] = encoder.fit_transform(df[["gross_income"]])

In [42]:
df["gross_income"]

0       2.0
1       1.0
2       7.0
3       4.0
4       2.0
       ... 
1686    2.0
1687    2.0
1688    3.0
1689    2.0
1690    4.0
Name: gross_income, Length: 1691, dtype: float64

In [43]:
df.isnull().sum()

gender                            0
age                               0
highest_qualification             0
gross_income                     75
smoke                             0
Divorced                          0
Married                           0
Separated                         0
Single                            0
Widowed                           0
highest_qualification_encoded     0
Asian                             0
Black                             0
Chinese                           0
Mixed                             0
Refused                           0
Unknown                           0
White                             0
London                            0
Midlands & East Anglia            0
Scotland                          0
South East                        0
South West                        0
The North                         0
Wales                             0
British                           0
English                           0
Irish                       

In [44]:
df["gross_income"] = df["gross_income"].fillna(df["gross_income"].mode()[0])

In [45]:
df.isnull().sum()

gender                           0
age                              0
highest_qualification            0
gross_income                     0
smoke                            0
Divorced                         0
Married                          0
Separated                        0
Single                           0
Widowed                          0
highest_qualification_encoded    0
Asian                            0
Black                            0
Chinese                          0
Mixed                            0
Refused                          0
Unknown                          0
White                            0
London                           0
Midlands & East Anglia           0
Scotland                         0
South East                       0
South West                       0
The North                        0
Wales                            0
British                          0
English                          0
Irish                            0
Other               

In [46]:
df

Unnamed: 0,gender,age,highest_qualification,gross_income,smoke,Divorced,Married,Separated,Single,Widowed,...,The North,Wales,British,English,Irish,Other,Refused,Scottish,Unknown,Welsh
0,0,38.0,0.0,2.0,0,True,False,False,False,False,...,True,False,True,False,False,False,False,False,False,False
1,1,42.0,0.0,1.0,1,False,False,False,True,False,...,True,False,True,False,False,False,False,False,False,False
2,0,40.0,7.0,7.0,0,False,True,False,False,False,...,True,False,False,True,False,False,False,False,False,False
3,1,48.0,7.0,4.0,0,False,True,False,False,False,...,True,False,False,True,False,False,False,False,False,False
4,1,39.0,2.0,2.0,0,False,True,False,False,False,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,0,22.0,0.0,2.0,0,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
1687,1,49.0,5.0,2.0,1,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1688,0,45.0,5.0,3.0,0,False,True,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1689,1,51.0,0.0,2.0,1,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [47]:
x,y = df.drop(columns=["smoke"]), df["smoke"]

In [48]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [49]:
model = LogisticRegression().fit(x_train, y_train)
pred = model.predict(x_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.72      0.93      0.81       242
           1       0.31      0.08      0.13        97

    accuracy                           0.68       339
   macro avg       0.51      0.50      0.47       339
weighted avg       0.60      0.68      0.61       339



In [51]:
model = DecisionTreeClassifier().fit(x_train, y_train)
pred = model.predict(x_test)

In [52]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.72      0.74      0.73       242
           1       0.31      0.29      0.30        97

    accuracy                           0.61       339
   macro avg       0.51      0.51      0.51       339
weighted avg       0.60      0.61      0.61       339



In [53]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier().fit(x_train, y_train)
pred = model.predict(x_test)

In [54]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.74      0.88      0.81       242
           1       0.44      0.24      0.31        97

    accuracy                           0.70       339
   macro avg       0.59      0.56      0.56       339
weighted avg       0.66      0.70      0.66       339

