# Regression VS Classification

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(style="ticks", color_codes=True)
import plotly.express as px

# Locate and load the data file
df = pd.read_csv("./EP_datasets/master.csv")
print(f"#rows={len(df)} #columns={len(df.columns)}")

# Print some info and plots to get a feeling about the dataset
print(df.dtypes)

#rows=27820 #columns=12
country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
HDI for year          float64
 gdp_for_year ($)      object
gdp_per_capita ($)      int64
generation             object
dtype: object


In [2]:
df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [3]:
# Check if we have any '?' in df values
print(df.columns[df.isin(["?"]).any()])
# Check if we have any NaN in df values
print(df.columns[df.isnull().any()])

print(df.dtypes)

Index([], dtype='object')
Index(['HDI for year'], dtype='object')
country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
country-year           object
HDI for year          float64
 gdp_for_year ($)      object
gdp_per_capita ($)      int64
generation             object
dtype: object


In [4]:
# Check for duplicates, this adds a new column to the dataset
df["is_duplicate"] = df.duplicated()

# Note that when using f-strings, the internal quote character must be different, such as 'is_duplicate' above
print(f"#total= {len(df)}")
print(f"#duplicated= {len(df[df['is_duplicate']==True])}")

#total= 27820
#duplicated= 0


In [5]:
# Do we have NaN in our dataset?
df.isnull().any()

country               False
year                  False
sex                   False
age                   False
suicides_no           False
population            False
suicides/100k pop     False
country-year          False
HDI for year           True
 gdp_for_year ($)     False
gdp_per_capita ($)    False
generation            False
is_duplicate          False
dtype: bool

In [6]:
df["age"] = df["age"].str.replace("5-14 years", "14")
df["age"] = df["age"].str.replace("15-24 years", "24")
df["age"] = df["age"].str.replace("25-34 years", "34")
df["age"] = df["age"].str.replace("35-54 years", "54")
df["age"] = df["age"].str.replace("55-74 years", "74")
df["age"] = df["age"].str.replace("75+ years", "100", regex=False)
# df['age'] = pd.to_numeric(df['age'], errors='coerce')
df = df.drop(columns=["HDI for year", "is_duplicate", "country-year"])
df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,24,21,312900,6.71,2156624900,796,Generation X
1,Albania,1987,male,54,16,308000,5.19,2156624900,796,Silent
2,Albania,1987,female,24,14,289700,4.83,2156624900,796,Generation X
3,Albania,1987,male,100,1,21800,4.59,2156624900,796,G.I. Generation
4,Albania,1987,male,34,9,274300,3.28,2156624900,796,Boomers


In [7]:
print(df.dtypes)

country                object
year                    int64
sex                    object
age                    object
suicides_no             int64
population              int64
suicides/100k pop     float64
 gdp_for_year ($)      object
gdp_per_capita ($)      int64
generation             object
dtype: object


In [8]:
df_lr = df[["age", "sex", "generation", "suicides/100k pop"]].copy()

In [9]:
print(f"#rows={len(df_lr)} #columns={len(df_lr.columns)}")
df_lr.head()

#rows=27820 #columns=4


Unnamed: 0,age,sex,generation,suicides/100k pop
0,24,male,Generation X,6.71
1,54,male,Silent,5.19
2,24,female,Generation X,4.83
3,100,male,G.I. Generation,4.59
4,34,male,Boomers,3.28


In [10]:
def encode_onehot(_df):
    ___df = _df.copy()
    # Convert all features of type object to one-hot encoded with pandas dummies
    for f in list(_df.columns.values):
        if _df[f].dtype == object:
            __df = (
                pd.get_dummies(_df[f], prefix="", prefix_sep="")
                .groupby(level=0, axis=1)
                .max()
                .add_prefix(f + " - ")
            )
            _df = pd.concat([_df, __df], axis=1)
            _df = _df.drop([f], axis=1)

    return _df


# Convert the variable recurrence to numerical
df_lr = encode_onehot(df_lr)

# Sanity check
df_lr.head()

  pd.get_dummies(_df[f], prefix="", prefix_sep="")
  pd.get_dummies(_df[f], prefix="", prefix_sep="")
  pd.get_dummies(_df[f], prefix="", prefix_sep="")


Unnamed: 0,suicides/100k pop,age - 100,age - 14,age - 24,age - 34,age - 54,age - 74,sex - female,sex - male,generation - Boomers,generation - G.I. Generation,generation - Generation X,generation - Generation Z,generation - Millenials,generation - Silent
0,6.71,False,False,True,False,False,False,False,True,False,False,True,False,False,False
1,5.19,False,False,False,False,True,False,False,True,False,False,False,False,False,True
2,4.83,False,False,True,False,False,False,True,False,False,False,True,False,False,False
3,4.59,True,False,False,False,False,False,False,True,False,True,False,False,False,False
4,3.28,False,False,False,True,False,False,False,True,True,False,False,False,False,False


## 1)

The model predicts 8.8 suicides per 100k population for males age 20 and generation X. The MAE is 11.4. This model uses 14 regression coefficients

In [11]:
def mae(_y, _y_pred):
    return (len(_y) ** -1) * np.sum(np.abs(_y_pred - _y))

In [12]:
X = df_lr.loc[:, df_lr.columns != "suicides/100k pop"].values
y = df_lr.loc[:, df_lr.columns == "suicides/100k pop"].values.ravel()

In [13]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(X, y)

y_pred = reg.predict(X)

mae_val = mae(y, y_pred)
print(mae_val)

10.199631560028756


In [14]:
X_test = np.array([[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]])

In [15]:
y_pred_new = reg.predict(X_test)

mae_val = mae(y, y_pred_new)
print(mae_val)

11.438165887850468


In [16]:
y_pred_new

array([8.8125])

The model predicts 13 suicides per 100k population for males age 20 and generation X. The MAE is 13.4. This model uses 3 regression coefficients.

In [17]:
df_lr2 = df[["age", "sex", "generation", "suicides/100k pop"]].copy()

df_lr2["sex"] = df_lr2["sex"].str.replace("female", "1")
df_lr2["sex"] = df_lr2["sex"].str.replace("male", "0")
df_lr2["generation"] = df_lr2["generation"].str.replace("Boomers", "1")
df_lr2["generation"] = df_lr2["generation"].str.replace(
    "G.I. Generation", "2", regex=False
)
df_lr2["generation"] = df_lr2["generation"].str.replace("Generation X", "3")
df_lr2["generation"] = df_lr2["generation"].str.replace("Generation Z", "4")
df_lr2["generation"] = df_lr2["generation"].str.replace("Millenials", "5")
df_lr2["generation"] = df_lr2["generation"].str.replace("Silent", "6")
df_lr2["age"] = pd.to_numeric(df_lr2["age"], errors="coerce")
df_lr2["generation"] = pd.to_numeric(df_lr2["generation"], errors="coerce")
df_lr2["sex"] = pd.to_numeric(df_lr2["sex"], errors="coerce")
print(f"#rows={len(df_lr2)} #columns={len(df_lr2.columns)}")
df_lr2.head()

#rows=27820 #columns=4


Unnamed: 0,age,sex,generation,suicides/100k pop
0,24,0,3,6.71
1,54,0,6,5.19
2,24,1,3,4.83
3,100,0,2,4.59
4,34,0,1,3.28


In [18]:
print(df_lr2.dtypes)

age                    int64
sex                    int64
generation             int64
suicides/100k pop    float64
dtype: object


In [19]:
X2 = df_lr2.loc[:, df_lr2.columns != "suicides/100k pop"].values
y2 = df_lr2.loc[:, df_lr2.columns == "suicides/100k pop"].values.ravel()

print(X2)

[[24  0  3]
 [54  0  6]
 [24  1  3]
 ...
 [14  0  4]
 [14  1  4]
 [74  1  1]]


In [20]:
reg2 = LinearRegression().fit(X2, y2)

y_pred2 = reg2.predict(X2)

mae_val = mae(y2, y_pred2)
print(mae_val)

10.339322925911894


In [21]:
X_test = np.array([[24, 0, 3]])

y_pred_new = reg2.predict(X_test)

mae_val = mae(y2, y_pred_new)
print(mae_val)

13.400359795833294


In [22]:
y_pred_new

array([14.83749794])

In [23]:
y2

array([6.71, 5.19, 4.83, ..., 2.17, 1.67, 1.46])

There is a slight difference in the performance of the two models. The one-hot encoded model appeared to have a smaller error and a lower prediction. 

In [24]:
X_test_3 = np.array([[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]])
X_test_4 = np.array([[34, 0, 5]])

y_pred_3 = reg.predict(X_test_3)
y_pred_4 = reg2.predict(X_test_4)

mae_val = mae(y, y_pred_3)
print(f"prediction1: {y_pred_3}")
print(f"MAE Value: {mae_val}")
mae_val = mae(y, y_pred_4)
print(f"prediction2: {y_pred_4}")
print(f"MAE Value: {mae_val}")

prediction1: [7.78125]
MAE Value: 11.277740833932425
prediction2: [15.66798147]
MAE Value: 13.781314978648458


One advantage of regression is that the input data features don't necessarily have to be a cluster to be useful. It's possible to find meaningful trends in data that's more spread out. 

One advantage of numerical encoding is that it is easier to work with since you don't need to keep track of which column represents which very specific feature. For example, with numerical values you know which column age is and can change the eaiser than one-hot encoding age ranges and then looking for which column represents the appropriate range.

I would suggest the use of a classifier since it's better at providing a real value output and it will be easier to determine the most important independent features that contribute the most to a higher suicide rate. With this information, the customer will have more valuable information to combat suicide rates.