In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

# Linear Regression
https://realpython.com/linear-regression-in-python/
https://towardsdatascience.com/let-us-understand-the-correlation-matrix-and-covariance-matrix-d42e6b643c22

In [2]:
df = pd.read_csv('data/tobacco_data.csv')

In [3]:
df.head()

Unnamed: 0,Country,Year,Tobac_Use_M,Tobac_Use_F,Tax_2015,Happiness_Score,Afford_2015,Ban_Score_Dir_Ads,Ban_Score_Indr_Ads,Ban_Score_add_indir_ads,Warn_Score,Ban_Score_places
0,Albania,2015,51.2,7.6,65.195,4.959,3.92,8,8,3,50,8
1,Argentina,2015,29.5,18.4,75.045,6.574,1.31,7,10,5,50,8
2,Armenia,2015,52.3,1.5,34.165,4.35,3.945,5,2,0,50,3
3,Australia,2015,16.7,13.1,58.515,7.284,2.285,6,2,0,83,6
4,Austria,2015,35.5,34.8,74.835,7.2,1.225,7,8,5,65,2


In [4]:
df.drop(['Country', 'Year'] , axis=1, inplace=True)

In [5]:
df = df.replace('^', 0)
df.head()

Unnamed: 0,Tobac_Use_M,Tobac_Use_F,Tax_2015,Happiness_Score,Afford_2015,Ban_Score_Dir_Ads,Ban_Score_Indr_Ads,Ban_Score_add_indir_ads,Warn_Score,Ban_Score_places
0,51.2,7.6,65.195,4.959,3.92,8,8,3,50,8
1,29.5,18.4,75.045,6.574,1.31,7,10,5,50,8
2,52.3,1.5,34.165,4.35,3.945,5,2,0,50,3
3,16.7,13.1,58.515,7.284,2.285,6,2,0,83,6
4,35.5,34.8,74.835,7.2,1.225,7,8,5,65,2


In [6]:
y_f = np.array(df['Tobac_Use_F'])
y_m = np.array(df['Tobac_Use_M'])

corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Tobac_Use_M,Tobac_Use_F,Tax_2015,Happiness_Score,Afford_2015,Ban_Score_Dir_Ads,Ban_Score_Indr_Ads,Ban_Score_add_indir_ads,Ban_Score_places
Tobac_Use_M,1.0,0.0847056,0.0967064,-0.327438,-0.0985019,-0.0689886,-0.203158,-0.187177,0.0125115
Tobac_Use_F,0.0847056,1.0,0.656236,0.420648,-0.42563,-0.00518897,-0.0535025,-0.0825792,0.00868903
Tax_2015,0.0967064,0.656236,1.0,0.441964,-0.487305,0.0459263,-0.00311081,-0.10922,0.16135
Happiness_Score,-0.327438,0.420648,0.441964,1.0,-0.626922,-0.0272592,-0.00420253,-0.0360306,-0.156774
Afford_2015,-0.0985019,-0.42563,-0.487305,-0.626922,1.0,-0.155646,-0.000443755,0.0193821,0.0304187
Ban_Score_Dir_Ads,-0.0689886,-0.00518897,0.0459263,-0.0272592,-0.155646,1.0,0.611921,0.557217,0.207777
Ban_Score_Indr_Ads,-0.203158,-0.0535025,-0.00311081,-0.00420253,-0.000443755,0.611921,1.0,0.801185,0.230062
Ban_Score_add_indir_ads,-0.187177,-0.0825792,-0.10922,-0.0360306,0.0193821,0.557217,0.801185,1.0,0.164753
Ban_Score_places,0.0125115,0.00868903,0.16135,-0.156774,0.0304187,0.207777,0.230062,0.164753,1.0


In [7]:
lr_df = df.drop(['Tobac_Use_M', 'Tobac_Use_F'] , axis=1)

In [8]:
x = lr_df.to_numpy()

In [9]:
model_m = LinearRegression().fit(x, y_m)
r_sq_m = model_m.score(x, y_m) 
r_sq_m

0.3348470070477083

In [10]:
print('slope:', model_m.coef_)

slope: [ 0.1203863  -8.21750376 -1.34606392 -0.25634906 -0.50224731 -0.21917918
 -0.01711739 -0.28092335]


In [11]:
model_f = LinearRegression().fit(x, y_f)
r_sq_f = model_f.score(x, y_f) 
r_sq_f

0.4625619100103973

http://www.fairlynerdy.com/what-is-r-squared/
Any R squared value greater than zero means that the regression analysis did better than just using a horizontal line through the mean value.  In the rare cases you get a negative r squared value, you should probably rethink your regression analysis, especially if you are forcing an intercept.

In [12]:
print('slope:', model_f.coef_)
print('intercept:', model_f.intercept_)

slope: [ 0.2770912   0.933964   -0.18411967 -0.13775466 -0.30140579  0.3357009
  0.01268157 -0.2118413 ]
intercept: -6.314735516844612


# Polynomial Regression

In [13]:
transformer = PolynomialFeatures(degree=2, include_bias=False)

In [14]:
transformer.fit(x)

PolynomialFeatures(degree=2, include_bias=False, interaction_only=False,
                   order='C')

In [15]:
x = transformer.transform(x)
x_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(x)

In [16]:
# Without polynomial features for males
model_m = LinearRegression().fit(x, y_m) 
r_sq = model_m.score(x, y_m)
r_sq

0.6948586176905879

In [17]:
print('coefficients:', model_m.coef_)
print('intercept:', model_m.intercept_)

coefficients: [-1.90291671e+00  3.30145305e+01 -6.79743122e+00 -3.05624681e+00
  2.04908431e+01 -2.94434132e+01  5.64191825e-01  4.75215227e+00
  2.99990560e-03  1.38216959e-01  5.19161756e-02  1.50958890e-01
 -4.81758081e-02  7.17135209e-02 -7.98164118e-03  4.33116760e-02
 -3.08817024e+00  1.37571705e-01 -1.40677284e+00 -4.30505687e-01
  1.37487919e+00 -3.67759142e-02 -1.09779708e+00  2.37916886e-01
 -6.28508108e-03 -6.64623382e-01  9.93808934e-01  1.90293188e-02
 -3.44688207e-01  6.81721313e-01 -1.35936539e+00  1.50793829e+00
 -6.28113594e-02  4.06307772e-02 -1.44529578e-01 -1.03234769e-01
  6.59629562e-04 -3.94586759e-01 -2.39205665e-01  4.17753165e-02
  5.10049228e-01  2.72313942e-03  7.44968362e-03  2.60719134e-02]
intercept: 4.419790182554763


In [18]:
# With polynomial features for males
model_m = LinearRegression().fit(x_, y_m)
r_sq = model_m.score(x_, y_m)
r_sq

1.0

In [19]:
# Without polynomial features for females
model_f = LinearRegression().fit(x, y_f)
f_r_sq = model_f.score(x, y_f)
f_r_sq

0.7288679337735072

In [20]:
print('slope:', model_f.coef_)
print('intercept:', model_f.intercept_)

slope: [-1.19689789e-01  1.91722871e+01  5.59246057e+00 -2.01304054e-01
 -3.44283432e+00  2.66092649e+00 -2.20731741e-01  5.70518383e+00
  4.79473607e-03 -1.64189335e-02 -4.67702824e-02  3.38838880e-02
  2.42901872e-02 -5.44815500e-02  4.49062820e-04 -5.09406662e-03
 -1.47848506e+00 -9.31441674e-01  6.30791966e-01  2.98165702e-01
 -5.81306893e-01  1.35268337e-02 -5.82864018e-01  5.71706902e-03
 -1.08625027e-01 -1.32944216e-02 -6.68282703e-02  3.72590479e-02
 -2.01518411e-01 -4.10047583e-01  3.21240113e-01  5.14885772e-02
 -4.97785890e-02 -5.88868030e-02 -1.96927244e-01  7.04996225e-01
 -8.31405682e-03 -2.49316444e-01 -4.50594148e-01  2.04373943e-02
  2.59586871e-01  2.11566850e-03 -1.77706857e-03 -1.51922246e-02]
intercept: -52.73201714204382


In [21]:
# Wit polynomial features for females
model_f = LinearRegression().fit(x_, y_f)
f_r_sq = model_f.score(x_, y_f)
f_r_sq

1.0

### SVM

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn import metrics

In [23]:
X_train, X_test, y_train, y_test = train_test_split(x_, y_m, test_size=0.2, random_state=1)

In [None]:
svr = SVR(kernel='linear', C=1.0, epsilon=0.2) #Default hyperparameters
svr.fit(X_train,y_train)
y_pred=svr.predict(X_test)
print('Accuracy Score:', svr.score(X_test, y_pred, sample_weight=None))