In [3]:
import requests 
import pandas as pd
import section

# Part 1. Loading the dataset

In [4]:
# Using pandas load the dataset (load remotely, not locally)
file = pd.read_csv("https://raw.githubusercontent.com/profmcnich/example_notebook/main/science_data_large.csv")

In [5]:
# Output the first 15 rows of the data
file.head(15)

Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
0,469,647,624474.3
1,403,694,577961.0
2,302,975,619684.7
3,779,916,1460449.0
4,901,18,43257.26
5,545,637,712463.4
6,660,519,700696.0
7,143,869,271826.0
8,89,461,89198.03
9,294,776,477021.0


In [6]:
# Display a summary of the table information (number of datapoints, etc.)
file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Temperature °C  1000 non-null   int64  
 1   Mols KCL        1000 non-null   int64  
 2   Size nm^3       1000 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 23.6 KB


In [7]:
# Display a summary of the table information (number of datapoints, etc.)
file.describe()

Unnamed: 0,Temperature °C,Mols KCL,Size nm^3
count,1000.0,1000.0,1000.0
mean,500.5,471.53,508611.1
std,288.819436,288.482872,447483.8
min,1.0,1.0,16.11429
25%,250.75,226.75,129826.7
50%,500.5,459.5,382718.2
75%,750.25,710.25,760321.1
max,1000.0,1000.0,1972127.0


# Part 2. Splitting the dataset¶


In [8]:
 # Take the pandas dataset and split it into our features (X) and label (y)
X,Y = file[['Temperature °C','Mols KCL']], file[['Size nm^3']]

In [14]:
Y

Unnamed: 0,Size nm^3
0,6.244743e+05
1,5.779610e+05
2,6.196847e+05
3,1.460449e+06
4,4.325726e+04
...,...
995,1.545661e+06
996,6.737041e+05
997,3.477543e+05
998,8.684794e+05


In [9]:
# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

# Part 3. Perform a Linear Regression

In [13]:
# Use sklearn to train a model on the training set
from sklearn.linear_model import LinearRegression

In [14]:
regression = LinearRegression().fit(X,Y)

In [55]:
regression.score(X_test, y_test)

0.8567685950516806

#Report on the score for that model, in your own words (markdown, not code) explain what the score means

Our regression has a high score, which tells that there is a high correlation between our target which is the size. The relathionship is directly proportional which means as the temperature and Mols increase the size will increase as well. It represents the The coefficient of determination. It does a good job of fitting the data and it does a good job of explaining changes in the dependent variables.
It basically shows that the difference between the predicted values and observed values is low, however we need to make sure its not overfitting meaning its not biased. We may need a larger dataset to show to analyze the correlation between the dependent and independent variable

In [16]:
#Extract the coefficents and intercept from the model and write an equation for your h(x) using LaTeX
regression.coef_

array([[ 875.90992708, 1031.59502452]])

In [17]:
regression.coef_, regression.intercept_

(array([[ 875.90992708, 1031.59502452]]), array([-416209.8173862]))

#latex equation 
 
\begin{equation} Y_i = {-416209 + 875.90992708*x}\end{equation}

In [21]:
#Create a sample datapoint and predict the output of that sample with the trained model
datapoint = {
    'Temperature °C': [800,700],
    'Mols KCL': [800,650]
}
data_df = pd.DataFrame(datapoint)

In [25]:
size_pred = regression.predict(data_df)

In [26]:
print(size_pred)

[[1109794.14389322]
 [ 867463.89750715]]


#  Part 4. Use Cross Validation

In [34]:
# Use the cross_val_score function to repeat your experiment across many shuffles of the data
from sklearn.model_selection import cross_val_score
print(cross_val_score(regression, X, Y, cv=5))

[0.83918826 0.87051239 0.85871066 0.87202623 0.84364641]


#Report on their finding and their significance

as we can see the model performs similary for five different set of instances of unseen data. It predicts well on data not used during the training of the model. Its very important to do cross validation for model checking. The fives range between 0.83- 0.87 therefore we can see the accuract is very high.

In [35]:
y_test

Unnamed: 0,Size nm^3
521,1.177623e+05
737,8.687293e+05
740,1.084893e+06
660,1.716039e+06
411,9.536850e+05
...,...
436,6.305199e+05
764,7.676234e+05
88,8.684308e+05
63,9.737511e+05


# Part 5. Using Polynomial Regression

In [29]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

In [48]:

# Using the PolynomialFeatures library perform another regression on an augmented dataset of degree 2
# Step 1: data preparation
poly = PolynomialFeatures(degree=2)
X_TRAN = poly.fit_transform(X_train)
X_test_ = poly.fit_transform(X_test)

In [49]:
# Step 3: define and train a model

# Instantiate
model = LinearRegression()

# Fit
model.fit(X_TRAN, y_train)

# Obtain coefficients
model.coef_

array([[ 0.00000000e+00,  1.20000000e+01, -1.27197810e-07,
         1.26444795e-11,  2.00000000e+00,  2.85714287e-02]])

In [51]:
model.score(X_TRAN, y_train)

1.0

In [52]:
model.coef_

array([[ 0.00000000e+00,  1.20000000e+01, -1.27197810e-07,
         1.26444795e-11,  2.00000000e+00,  2.85714287e-02]])

\begin{equation} Y_i = { (0.00000000e) + (1.20000000e^1 * a) +  (-1.27197810e^-7*b) + (1.26444795e^-11*a^2) + (2.00000000e+00*ab) + (2.85714287e^-2 * b^2)}\end{equation}

In [53]:
model.intercept_

array([2.04788521e-05])

In [46]:
#predict
model.predict(X_test_)

array([[1.17762314e+05],
       [8.68729257e+05],
       [1.08489300e+06],
       [1.71603946e+06],
       [9.53685000e+05],
       [1.15057114e+05],
       [2.22299400e+05],
       [2.85719400e+05],
       [2.87802714e+05],
       [3.58129714e+05],
       [2.43109457e+05],
       [2.56621829e+05],
       [8.23168314e+05],
       [1.28610714e+05],
       [3.55214714e+05],
       [5.49207314e+05],
       [7.11794571e+04],
       [4.10720600e+05],
       [2.17334314e+05],
       [1.48665911e+06],
       [7.61464000e+04],
       [3.30257157e+02],
       [1.44061911e+06],
       [6.87856114e+05],
       [3.04432457e+05],
       [1.23172829e+05],
       [3.30255457e+05],
       [7.97577257e+05],
       [1.07736803e+06],
       [1.50313257e+05],
       [3.50800114e+05],
       [1.16391429e+05],
       [3.60092314e+05],
       [6.22649829e+05],
       [3.89294314e+05],
       [9.72528114e+05],
       [4.92891314e+05],
       [4.52268000e+05],
       [5.54434600e+05],
       [9.38430829e+05],


In [47]:
model.intercept_

array([1.65716046e-05])

#Report on the metrics and output the resultant equation as you did in Part 3.

the model using polynomial features scored 1, We can see that the score has increase as compared to the linear line. a quadratic curve is able to fit the data better than a linear line. However since we have 1 as a score, the model may have ovefitted because  we dont have enought  training sample. Basically it performs well on the data but it generalizes poorly. Its not ideal to have a score of 1.