In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# importing linear regression function
import sklearn.linear_model as lm

# functions to calculate r-squared, MAE, and MSE (take the square root of MSE for RMSE)
from sklearn.metrics import r2_score , mean_absolute_error, mean_squared_error

%matplotlib inline

Load Data

In [26]:
from sklearn import datasets

# Load the iris dataset and separate the feature matrix from the class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Show which distinct label values are present in the target vector.
print('Class labels:', np.unique(y))


Class labels: [0 1 2]


Normalize Data

The features may be measured in different units or on different scales, so let’s standardize them before building the model.

In [27]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X and replace X with its standardized version in one step.
# `sc` stays bound to the fitted scaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set statistics influence the preprocessing — consider
# fitting on the training portion only.
sc = StandardScaler()
X = sc.fit_transform(X)

Split Data

Split the data into training and test sets. Whenever a function involves randomness, it’s advisable to set a seed so that the results are reproducible.

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

Training Logistic Regression Model and Evaluating

In [29]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Logistic regression with an l1 penalty (C=10) using the liblinear solver.
lr = LogisticRegression(C=10, penalty='l1', solver='liblinear')
lr.fit(X_train, y_train)

# Predict once per split and reuse the predictions for every metric below.
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Evaluation on the training split.
print("Train - Accuracy :", metrics.accuracy_score(y_train, y_train_pred))
print("Train - Confusion matrix :",metrics.confusion_matrix(y_train, y_train_pred))
print("Train - classification report :", metrics.classification_report(y_train, y_train_pred))

# Evaluation on the held-out test split.
print("Test - Accuracy :", metrics.accuracy_score(y_test, y_test_pred))
print("Test - Confusion matrix :",metrics.confusion_matrix(y_test, y_test_pred))
print("Test - classification report :", metrics.classification_report(y_test, y_test_pred))

Train - Accuracy : 0.9809523809523809
Train - Confusion matrix : [[34  0  0]
 [ 0 30  2]
 [ 0  0 39]]
Train - classification report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      0.94      0.97        32
           2       0.95      1.00      0.97        39

    accuracy                           0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105

Test - Accuracy : 0.9777777777777777
Test - Confusion matrix : [[16  0  0]
 [ 0 17  1]
 [ 0  0 11]]
Test - classification report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.94      0.97        18
           2       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98     

Generalized Linear Models

The generalized linear model (GLM) was an effort by John Nelder and Robert Wedderburn to unify various commonly used statistical models, such as linear, logistic, and Poisson regression.

In [30]:
# Fit the same one-predictor regression two ways -- scikit-learn's
# LinearRegression and a statsmodels Gaussian GLM -- and print both results
# so the estimates can be compared.
df = pd.read_csv('Data/Grade_Set_1.csv')
print('####### Linear Regression Model ########')
# Create linear regression object
lr = lm.LinearRegression()
# Convert the Series to a NumPy array *before* adding the column axis:
# multi-dimensional indexing on a pandas Series (series[:, np.newaxis]) is
# deprecated and raises on newer pandas versions.
x = df.Hours_Studied.values[:, np.newaxis]  # independent variable, shape (n, 1)
y = df.Test_Grade.values                    # dependent variable
# Train the model on the full data set
lr.fit(x, y)
print("Intercept: ", lr.intercept_)
print("Coefficient: ", lr.coef_)
print('\n####### Generalized Linear Model ########')
import statsmodels.api as sm
# To be able to run GLM, we'll have to add the intercept constant to x variable;
# prepend=False appends the constant column after the predictor column.
x = sm.add_constant(x, prepend=False)
# Instantiate a gaussian family model with the default link function.
model = sm.GLM(y, x, family = sm.families.Gaussian())
# From here on `model` refers to the fitted results object.
model = model.fit()
print(model.summary())

####### Linear Regression Model ########
Intercept:  49.67777777777776
Coefficient:  [5.01666667]

####### Generalized Linear Model ########


  x= df.Hours_Studied[:, np.newaxis] # independent variable


                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                    9
Model:                            GLM   Df Residuals:                        7
Model Family:                Gaussian   Df Model:                            1
Link Function:               identity   Scale:                          5.3627
Method:                          IRLS   Log-Likelihood:                -19.197
Date:                Tue, 19 Jul 2022   Deviance:                       37.539
Time:                        10:48:01   Pearson chi2:                     37.5
No. Iterations:                     3   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             5.0167      0.299     16.780      0.0