<a href="https://colab.research.google.com/github/FloridaTechYoung/CSE4224/blob/main/python_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Problem 1

In [None]:
import numpy as np
import time

class NumericalIntegrator:
    """Estimate a definite integral of a built-in integrand with a chosen rule.

    Integrand codes (``used_equation``):
        1                    -> 3*x**2 - 2*x + 4
        21, 22, 23, 25, 210  -> sin(x)**n for n = 1, 2, 3, 5, 10
        3                    -> exp(x)**x
        4                    -> 1 / x**2

    Rule codes (``used_rule``):
        1 left-hand, 2 right-hand, 3 midpoint, 4 trapezoidal,
        5 Simpson, 6 Monte Carlo.

    Args:
        point1: Lower integration limit.
        point2: Upper integration limit.
        used_equation: Integrand code from the table above.
        used_rule: Rule code from the table above (checked in ``run_rule``).
        split_num: Number of equal sub-intervals used by rules 1-5.
        num_samples: Number of random samples used by the Monte Carlo rule.

    Raises:
        ValueError: If ``used_equation`` is not a known code, or if
            ``split_num`` / ``num_samples`` is not a positive whole number.
    """

    # used_equation code -> exponent n of sin(x)**n
    _SIN_POWERS = {21: 1, 22: 2, 23: 3, 25: 5, 210: 10}

    def __init__(self, point1, point2, used_equation, used_rule,
                 split_num=10000, num_samples=10000):
        if int(split_num) != split_num or split_num < 1:
            raise ValueError(f"split_num must be a positive integer, got {split_num!r}")
        if int(num_samples) != num_samples or num_samples < 1:
            raise ValueError(f"num_samples must be a positive integer, got {num_samples!r}")

        self.point1 = point1
        self.point2 = point2
        self.split_num = int(split_num)
        self.num_samples = int(num_samples)
        # Kept for backward compatibility; no rule in this class reads it.
        self.range = int(self.split_num * np.ceil(self.point2 - self.point1))
        self.used_rule = used_rule
        self.area = 0.0
        self.equation = self._select_equation(used_equation)

    def _select_equation(self, used_equation):
        """Return the integrand callable for ``used_equation`` or raise ValueError."""
        if used_equation in self._SIN_POWERS:
            power = self._SIN_POWERS[used_equation]
            return lambda x, n=power: self.equation2(x, n)
        plain = {1: self.equation1, 3: self.equation3, 4: self.equation4}
        if used_equation in plain:
            return plain[used_equation]
        # Fail at construction instead of leaving `self.equation` undefined.
        raise ValueError(f"Unknown used_equation: {used_equation!r}")

    def equation1(self, x):
        """Return 3*x**2 - 2*x + 4."""
        return 3 * (x**2) - 2 * x + 4

    def equation2(self, x, n):
        """Return sin(x) raised to the power n."""
        return np.sin(x)**n

    def equation3(self, x):
        """Return exp(x) raised to the power x."""
        return np.exp(x)**x

    def equation4(self, x):
        """Return 1 / x**2."""
        return 1 / (x**2)

    def run_rule(self):
        """Run the rule selected by ``self.used_rule`` and return its estimate.

        Raises:
            ValueError: If ``self.used_rule`` is not one of 1-6.
        """
        rules = {
            1: self.left_hand_rule,
            2: self.right_hand_rule,
            3: self.midpoint_rule,
            4: self.trapezoidal_rule,
            5: self.simpsons_rule,
            6: self.monte_carlo_rule,
        }
        try:
            rule = rules[self.used_rule]
        except KeyError:
            raise ValueError(f"Unknown used_rule: {self.used_rule!r}") from None
        return rule()

    def _riemann_sum(self, offset):
        """Sum f(x) * step, sampling each sub-interval at ``offset`` of its width.

        ``offset`` is 0 for the left edge, 1 for the right edge and 0.5 for
        the midpoint. The estimate is stored in ``self.area`` and returned.
        """
        self.area = 0.0
        step = (self.point2 - self.point1) / self.split_num
        for i in range(self.split_num):
            x = self.point1 + (i + offset) * step
            self.area += self.equation(x) * step
        return self.area

    def left_hand_rule(self):
        """Riemann sum sampling the left edge of every sub-interval."""
        return self._riemann_sum(0)

    def right_hand_rule(self):
        """Riemann sum sampling the right edge of every sub-interval."""
        return self._riemann_sum(1)

    def midpoint_rule(self):
        """Riemann sum sampling the midpoint of every sub-interval."""
        return self._riemann_sum(0.5)

    def trapezoidal_rule(self):
        """Sum the trapezoid area over every sub-interval."""
        self.area = 0.0
        step = (self.point2 - self.point1) / self.split_num
        for i in range(self.split_num):
            x1 = self.point1 + i * step
            x2 = x1 + step
            self.area += (self.equation(x1) + self.equation(x2)) * step / 2
        return self.area

    def simpsons_rule(self):
        """Composite Simpson estimate with end/odd/even weights 1/4/2.

        Raises:
            ValueError: If ``self.split_num`` is odd; the 1-4-2-...-4-1
                weighting is only valid for an even number of sub-intervals.
        """
        if self.split_num % 2:
            raise ValueError("Simpson's rule requires an even split_num")
        self.area = 0.0
        step = (self.point2 - self.point1) / self.split_num
        for i in range(self.split_num + 1):
            x = self.point1 + i * step
            if i == 0 or i == self.split_num:
                self.area += self.equation(x)
            elif i % 2 == 0:
                self.area += 2 * self.equation(x)
            else:
                self.area += 4 * self.equation(x)
        self.area *= step / 3
        return self.area

    def monte_carlo_rule(self):
        """Estimate as interval width times the mean of f at uniform random points.

        Draws from NumPy's global random state, so results vary between calls
        unless the caller seeds it.
        """
        samples = np.random.uniform(self.point1, self.point2, self.num_samples)
        function_values = self.equation(samples)
        self.area = (self.point2 - self.point1) * np.mean(function_values)
        return self.area

    def measure_runtime(self):
        """Run the selected rule and return ``(result, elapsed_seconds)``."""
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        result = self.run_rule()
        runtime = time.perf_counter() - start_time
        return result, runtime


In [None]:
# Equation 1 on [-1, 1], one integrator per rule code (1-6).
# Constructor arguments: (point1, point2, used_equation, used_rule)
experiment1_1 = NumericalIntegrator(-1, 1, used_equation=1, used_rule=1)
experiment1_2 = NumericalIntegrator(-1, 1, used_equation=1, used_rule=2)
experiment1_3 = NumericalIntegrator(-1, 1, used_equation=1, used_rule=3)
experiment1_4 = NumericalIntegrator(-1, 1, used_equation=1, used_rule=4)
experiment1_5 = NumericalIntegrator(-1, 1, used_equation=1, used_rule=5)
experiment1_6 = NumericalIntegrator(-1, 1, used_equation=1, used_rule=6)

# Time each experiment in order and report its estimate.
for label, experiment in (
    ("experiment1_1", experiment1_1),
    ("experiment1_2", experiment1_2),
    ("experiment1_3", experiment1_3),
    ("experiment1_4", experiment1_4),
    ("experiment1_5", experiment1_5),
    ("experiment1_6", experiment1_6),
):
    result, runtime = experiment.measure_runtime()
    print(label)
    print(f"Result: {result}")
    print(f"Runtime: {runtime:.6f} seconds")


experiment1_1
Result: 10.000400039999997
Runtime: 0.011487 seconds
experiment1_2
Result: 9.999600039999994
Runtime: 0.018580 seconds
experiment1_3
Result: 9.999999980000034
Runtime: 0.026139 seconds
experiment1_4
Result: 10.000000039999955
Runtime: 0.039112 seconds
experiment1_5
Result: 10.00000000000002
Runtime: 0.027593 seconds
experiment1_6
Result: 10.007784304590402
Runtime: 0.002007 seconds


In [None]:
# sin(x) (equation code 21) on [0, pi], one integrator per rule code (1-6).
# Constructor arguments: (point1, point2, used_equation, used_rule)
pi = np.pi
experiment2_1_1 = NumericalIntegrator(0, pi, used_equation=21, used_rule=1)
experiment2_1_2 = NumericalIntegrator(0, pi, used_equation=21, used_rule=2)
experiment2_1_3 = NumericalIntegrator(0, pi, used_equation=21, used_rule=3)
experiment2_1_4 = NumericalIntegrator(0, pi, used_equation=21, used_rule=4)
experiment2_1_5 = NumericalIntegrator(0, pi, used_equation=21, used_rule=5)
experiment2_1_6 = NumericalIntegrator(0, pi, used_equation=21, used_rule=6)

# Time each experiment in order and report its estimate.
for label, experiment in (
    ("experiment2_1_1", experiment2_1_1),
    ("experiment2_1_2", experiment2_1_2),
    ("experiment2_1_3", experiment2_1_3),
    ("experiment2_1_4", experiment2_1_4),
    ("experiment2_1_5", experiment2_1_5),
    ("experiment2_1_6", experiment2_1_6),
):
    result, runtime = experiment.measure_runtime()
    print(label)
    print(f"Result: {result}")
    print(f"Runtime: {runtime:.6f} seconds")

experiment2_1_1
Result: 1.999999983550664
Runtime: 0.037315 seconds
experiment2_1_2
Result: 1.999999983550664
Runtime: 0.035042 seconds
experiment2_1_3
Result: 2.0000000082246694
Runtime: 0.026228 seconds
experiment2_1_4
Result: 1.9999999835506606
Runtime: 0.052730 seconds
experiment2_1_5
Result: 1.9999999999999925
Runtime: 0.030218 seconds
experiment2_1_6
Result: 1.9902350203685988
Runtime: 0.000685 seconds


In [None]:
# sin(x)**2 (equation code 22) on [0, pi], one integrator per rule code (1-6).
# Constructor arguments: (point1, point2, used_equation, used_rule)
pi = np.pi
experiment2_2_1 = NumericalIntegrator(0, pi, used_equation=22, used_rule=1)
experiment2_2_2 = NumericalIntegrator(0, pi, used_equation=22, used_rule=2)
experiment2_2_3 = NumericalIntegrator(0, pi, used_equation=22, used_rule=3)
experiment2_2_4 = NumericalIntegrator(0, pi, used_equation=22, used_rule=4)
experiment2_2_5 = NumericalIntegrator(0, pi, used_equation=22, used_rule=5)
experiment2_2_6 = NumericalIntegrator(0, pi, used_equation=22, used_rule=6)

# Time each experiment in order and report its estimate.
for label, experiment in (
    ("experiment2_2_1", experiment2_2_1),
    ("experiment2_2_2", experiment2_2_2),
    ("experiment2_2_3", experiment2_2_3),
    ("experiment2_2_4", experiment2_2_4),
    ("experiment2_2_5", experiment2_2_5),
    ("experiment2_2_6", experiment2_2_6),
):
    result, runtime = experiment.measure_runtime()
    print(label)
    print(f"Result: {result}")
    print(f"Runtime: {runtime:.6f} seconds")

experiment2_2_1
Result: 1.5707963267948983
Runtime: 0.065635 seconds
experiment2_2_2
Result: 1.5707963267948983
Runtime: 0.014889 seconds
experiment2_2_3
Result: 1.5707963267948988
Runtime: 0.014636 seconds
experiment2_2_4
Result: 1.5707963267948872
Runtime: 0.030044 seconds
experiment2_2_5
Result: 1.5707963267948972
Runtime: 0.014497 seconds
experiment2_2_6
Result: 1.5627217789385994
Runtime: 0.002998 seconds


In [None]:
# sin(x)**3 (equation code 23) on [0, pi], one integrator per rule code (1-6).
# Constructor arguments: (point1, point2, used_equation, used_rule)
pi = np.pi
experiment2_3_1 = NumericalIntegrator(0, pi, used_equation=23, used_rule=1)
experiment2_3_2 = NumericalIntegrator(0, pi, used_equation=23, used_rule=2)
experiment2_3_3 = NumericalIntegrator(0, pi, used_equation=23, used_rule=3)
experiment2_3_4 = NumericalIntegrator(0, pi, used_equation=23, used_rule=4)
experiment2_3_5 = NumericalIntegrator(0, pi, used_equation=23, used_rule=5)
experiment2_3_6 = NumericalIntegrator(0, pi, used_equation=23, used_rule=6)

# Time each experiment in order and report its estimate.
for label, experiment in (
    ("experiment2_3_1", experiment2_3_1),
    ("experiment2_3_2", experiment2_3_2),
    ("experiment2_3_3", experiment2_3_3),
    ("experiment2_3_4", experiment2_3_4),
    ("experiment2_3_5", experiment2_3_5),
    ("experiment2_3_6", experiment2_3_6),
):
    result, runtime = experiment.measure_runtime()
    print(label)
    print(f"Result: {result}")
    print(f"Runtime: {runtime:.6f} seconds")

experiment2_3_1
Result: 1.3333333333333302
Runtime: 0.030950 seconds
experiment2_3_2
Result: 1.3333333333333302
Runtime: 0.025435 seconds
experiment2_3_3
Result: 1.3333333333333321
Runtime: 0.024080 seconds
experiment2_3_4
Result: 1.3333333333333264
Runtime: 0.075926 seconds
experiment2_3_5
Result: 1.3333333333333366
Runtime: 0.031005 seconds
experiment2_3_6
Result: 1.3545963540847603
Runtime: 0.002612 seconds


In [None]:
# exp(x)**x (equation code 3) on [0, pi], one integrator per rule code (1-6).
# Constructor arguments: (point1, point2, used_equation, used_rule)
# `pi` is defined here so this cell runs on its own instead of relying on an
# earlier cell having assigned it.
pi = np.pi
experiment3_1 = NumericalIntegrator(0, pi, used_equation=3, used_rule=1)
experiment3_2 = NumericalIntegrator(0, pi, used_equation=3, used_rule=2)
experiment3_3 = NumericalIntegrator(0, pi, used_equation=3, used_rule=3)
experiment3_4 = NumericalIntegrator(0, pi, used_equation=3, used_rule=4)
experiment3_5 = NumericalIntegrator(0, pi, used_equation=3, used_rule=5)
experiment3_6 = NumericalIntegrator(0, pi, used_equation=3, used_rule=6)

# Time each experiment in order and report its estimate.
for label, experiment in (
    ("experiment3_1", experiment3_1),
    ("experiment3_2", experiment3_2),
    ("experiment3_3", experiment3_3),
    ("experiment3_4", experiment3_4),
    ("experiment3_5", experiment3_5),
    ("experiment3_6", experiment3_6),
):
    result, runtime = experiment.measure_runtime()
    print(label)
    print(f"Result: {result}")
    print(f"Runtime: {runtime:.6f} seconds")

experiment3_1
Result: 3264.101320822796
Runtime: 0.018401 seconds
experiment3_2
Result: 3270.174864219812
Runtime: 0.015102 seconds
experiment3_3
Result: 3267.1365938571635
Runtime: 0.014624 seconds
experiment3_4
Result: 3267.138092521305
Runtime: 0.026768 seconds
experiment3_5
Result: 3267.13709341216
Runtime: 0.015151 seconds
experiment3_6
Result: 3302.8151845738885
Runtime: 0.001961 seconds


In [None]:

# 1 / x**2 (equation code 4) on [1, inf], one integrator per rule code (1-6).
# Constructor arguments: (point1, point2, used_equation, used_rule)
# `inf` is a finite upper limit (presumably standing in for infinity).
# NOTE(review): the integrator splits the interval into 10000 parts, so the
# step here is roughly 100; the deterministic rules therefore sample 1/x**2
# very coarsely near x = 1, where almost all of the area lies.
inf = 1e6
experiment4_1 = NumericalIntegrator(1, inf, used_equation=4, used_rule=1)
experiment4_2 = NumericalIntegrator(1, inf, used_equation=4, used_rule=2)
experiment4_3 = NumericalIntegrator(1, inf, used_equation=4, used_rule=3)
experiment4_4 = NumericalIntegrator(1, inf, used_equation=4, used_rule=4)
experiment4_5 = NumericalIntegrator(1, inf, used_equation=4, used_rule=5)
experiment4_6 = NumericalIntegrator(1, inf, used_equation=4, used_rule=6)

# Time each experiment in order and report its estimate.
for label, experiment in (
    ("experiment4_1", experiment4_1),
    ("experiment4_2", experiment4_2),
    ("experiment4_3", experiment4_3),
    ("experiment4_4", experiment4_4),
    ("experiment4_5", experiment4_5),
    ("experiment4_6", experiment4_6),
):
    result, runtime = experiment.measure_runtime()
    print(label)
    print(f"Result: {result}")
    print(f"Runtime: {runtime:.6f} seconds")

experiment4_1
Result: 100.01611115121104
Runtime: 0.007869 seconds
experiment4_2
Result: 0.016211151311241744
Runtime: 0.006448 seconds
experiment4_3
Result: 0.04771163851502475
Runtime: 0.008768 seconds
experiment4_4
Result: 50.01616115126095
Runtime: 0.009083 seconds
experiment4_5
Result: 33.352193542052085
Runtime: 0.005895 seconds
experiment4_6
Result: 0.14932542739111498
Runtime: 0.001081 seconds


## Problem 2

In [2]:
import kagglehub

# Fetch the latest version of the dataset; the returned location is reused
# by later cells to find the CSV file.
path = kagglehub.dataset_download("kumarajarshi/life-expectancy-who")

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/kumarajarshi/life-expectancy-who?dataset_version_number=1...


100%|██████████| 119k/119k [00:00<00:00, 33.2MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/kumarajarshi/life-expectancy-who/versions/1





In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [4]:
import os

# Locate the CSV inside the downloaded dataset directory, load it, and
# display the resulting frame.
csv_name = 'Life Expectancy Data.csv'
file_path = os.path.join(path, csv_name)
data = pd.read_csv(file_path)
data

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,68.0,31,...,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
2934,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,7.0,998,...,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
2935,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,73.0,304,...,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
2936,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,76.0,529,...,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8


In [5]:
# Percentage of missing (NaN) entries per column.
missing_counts = data.isnull().sum()
percent_missing = missing_counts * 100 / len(data)
print(percent_missing)

Country                             0.000000
Year                                0.000000
Status                              0.000000
Life expectancy                     0.340368
Adult Mortality                     0.340368
infant deaths                       0.000000
Alcohol                             6.603131
percentage expenditure              0.000000
Hepatitis B                        18.822328
Measles                             0.000000
 BMI                                1.157250
under-five deaths                   0.000000
Polio                               0.646698
Total expenditure                   7.692308
Diphtheria                          0.646698
 HIV/AIDS                           0.000000
GDP                                15.248468
Population                         22.191967
 thinness  1-19 years               1.157250
 thinness 5-9 years                 1.157250
Income composition of resources     5.684139
Schooling                           5.547992
dtype: float64

### Solution 1: remove all rows containing NaN values

In [6]:

# Keep only the rows in which every column has a value.
data = data.dropna(axis=0, how="any")

# Show the cleaned frame.
data

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.259210,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.470,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959000,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Zimbabwe,2004,Developing,44.3,723.0,27,4.36,0.000000,68.0,31,...,67.0,7.13,65.0,33.6,454.366654,12777511.0,9.4,9.4,0.407,9.2
2934,Zimbabwe,2003,Developing,44.5,715.0,26,4.06,0.000000,7.0,998,...,7.0,6.52,68.0,36.7,453.351155,12633897.0,9.8,9.9,0.418,9.5
2935,Zimbabwe,2002,Developing,44.8,73.0,25,4.43,0.000000,73.0,304,...,73.0,6.53,71.0,39.8,57.348340,125525.0,1.2,1.3,0.427,10.0
2936,Zimbabwe,2001,Developing,45.3,686.0,25,1.72,0.000000,76.0,529,...,76.0,6.16,75.0,42.1,548.587312,12366165.0,1.6,1.7,0.427,9.8


In [7]:
# Replace the 'Country' column with one indicator column per country.
data = pd.get_dummies(data=data, columns=["Country"])

In [8]:
# Collect the distinct values that appear in the 'Status' column.
unique_values = set(data["Status"])
unique_values

{'Developed', 'Developing'}

Since "Status" has only two categories, it is converted to a one-hot encoding.

In [9]:
# Replace the 'Status' column with one indicator column per status value.
data = pd.get_dummies(data=data, columns=["Status"])

In [10]:
# Display the frame to inspect the current columns.
data

Unnamed: 0,Year,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,...,Country_Turkmenistan,Country_Uganda,Country_Ukraine,Country_Uruguay,Country_Uzbekistan,Country_Vanuatu,Country_Zambia,Country_Zimbabwe,Status_Developed,Status_Developing
0,2015,65.0,263.0,62,0.01,71.279624,65.0,1154,19.1,83,...,False,False,False,False,False,False,False,False,False,True
1,2014,59.9,271.0,64,0.01,73.523582,62.0,492,18.6,86,...,False,False,False,False,False,False,False,False,False,True
2,2013,59.9,268.0,66,0.01,73.219243,64.0,430,18.1,89,...,False,False,False,False,False,False,False,False,False,True
3,2012,59.5,272.0,69,0.01,78.184215,67.0,2787,17.6,93,...,False,False,False,False,False,False,False,False,False,True
4,2011,59.2,275.0,71,0.01,7.097109,68.0,3013,17.2,97,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,2004,44.3,723.0,27,4.36,0.000000,68.0,31,27.1,42,...,False,False,False,False,False,False,False,True,False,True
2934,2003,44.5,715.0,26,4.06,0.000000,7.0,998,26.7,41,...,False,False,False,False,False,False,False,True,False,True
2935,2002,44.8,73.0,25,4.43,0.000000,73.0,304,26.3,40,...,False,False,False,False,False,False,False,True,False,True
2936,2001,45.3,686.0,25,1.72,0.000000,76.0,529,25.9,39,...,False,False,False,False,False,False,False,True,False,True


In [11]:

# The target column name carries a trailing space, exactly as it is used here.
target_column = 'Life expectancy '

# Target vector Y.
dataY = data[target_column].to_numpy()

# Feature matrix X: every remaining column.
dataX = data.drop(columns=target_column).to_numpy()

# Hold out 20% of the rows for testing; the seed is fixed for repeatability.
trainX, testX, trainY, testY = train_test_split(
    dataX, dataY, test_size=0.2, random_state=1
)

In [12]:
# Report the shapes of the feature matrix and target vector for each split.
print('Training set dimensions')
for split in (trainX, trainY):
    print(split.shape)

print('\nTest set dimensions')
for split in (testX, testY):
    print(split.shape)

Training set dimensions
(1319, 154)
(1319,)

Test set dimensions
(330, 154)
(330,)


In [13]:
import time

# import the linear regression model
from sklearn.linear_model import LinearRegression

# instantiate an OLS model
model = LinearRegression()

# Fit the model to the training data and time only the fit call.
# perf_counter is monotonic and better suited to short intervals than time.time.
start_time = time.perf_counter()
model.fit(trainX, trainY)
train_time = time.perf_counter() - start_time

# Predicted outputs for the data points in the training set.
train_predictions = model.predict(trainX)

# Coefficient of determination r^2 on the training set.
print('The r^2 score for training data is', model.score(trainX, trainY))

# Sum and mean of squared residuals on the training set.
SSE_train = np.sum((train_predictions - trainY) ** 2)
MSE_train = SSE_train / len(train_predictions)

print("SSE_train: ", SSE_train)
print("MSE_train: ", MSE_train)
print("train_time(OLS fit): ", train_time)

print()

# Time only the prediction call on the held-out test set.
start_time = time.perf_counter()
test_predictions = model.predict(testX)
test_time = time.perf_counter() - start_time

# Coefficient of determination r^2 on the test set.
print('The r^2 score for test data is', model.score(testX, testY))

# Sum and mean of squared residuals on the test set.
SSE_test = np.sum((test_predictions - testY) ** 2)
MSE_test = SSE_test / len(test_predictions)

print("SSE_test: ", SSE_test)
# Label fixed: this value is the test MSE, previously printed as "MSE_train".
print("MSE_test: ", MSE_test)
print("test_time(): ", test_time)


The r^2 score for training data is 0.9661908152680255
SSE_train:  3457.6071635642043
MSE_train:  2.621385264263991
train_time(OLS fit):  0.05534958839416504

The r^2 score for test data is 0.9641867233059467
SSE_test:  904.6507489972878
MSE_train:  2.7413659060523874
test_time():  0.006024599075317383
