In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.3.2'
spark_version = 'spark-3.3.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:4 http://archive.ubuntu.com/ubuntu focal InRelease
Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:7 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Get:8 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease [18.1 kB]
Get:9 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Hit:10 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Hit:11 http://ppa.launchpad.net/ubuntugis/ppa/ubuntu focal InRelease
Get:12 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2,774 kB]
Get:13 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64

In [2]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [3]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("DataFrame Basics").getOrCreate()

In [4]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url = "https://raw.githubusercontent.com/Ekenc/Project4/main/Data/Merged_Target_Glaciers.csv"
spark.sparkContext.addFile(url)
df = pd.read_csv(url)

# Show DataFrame
df.head()


Unnamed: 0.1,Unnamed: 0,Year,Mean Nitrous Oxide Concentration,Mean Carbon Dioxide Concentration,Mean Methane Concentration,Mean cumulative mass balance of glaciers,Precipitation Anomaly,Mean Adjusted Sea Level (inches),Sea Temperature Anomaly,Earth Surface Temperature Anomaly (land and ocean)
0,0,1964,291.2,319.62,1260.3,-2.545,-0.041776,4.169291,-0.546,-0.252
1,1,1967,291.5,322.18,1284.03,-2.662,-0.096894,4.452756,-0.42,0.0
2,2,1970,293.8,325.620315,1351.7,-3.519,-0.070516,4.677165,-0.294,0.108
3,3,1971,294.0,326.32,1357.2,-3.758,0.03224,4.88189,-0.51,-0.126
4,4,1972,295.6,328.74211,1380.1,-4.016,-0.772485,5.240157,-0.186,0.072


In [5]:
# Define features set
X = df.drop("Mean cumulative mass balance of glaciers", axis = 1)
X = X.drop("Unnamed: 0", axis = 1)
X.head()

Unnamed: 0,Year,Mean Nitrous Oxide Concentration,Mean Carbon Dioxide Concentration,Mean Methane Concentration,Precipitation Anomaly,Mean Adjusted Sea Level (inches),Sea Temperature Anomaly,Earth Surface Temperature Anomaly (land and ocean)
0,1964,291.2,319.62,1260.3,-0.041776,4.169291,-0.546,-0.252
1,1967,291.5,322.18,1284.03,-0.096894,4.452756,-0.42,0.0
2,1970,293.8,325.620315,1351.7,-0.070516,4.677165,-0.294,0.108
3,1971,294.0,326.32,1357.2,0.03224,4.88189,-0.51,-0.126
4,1972,295.6,328.74211,1380.1,-0.772485,5.240157,-0.186,0.072


In [6]:
# Define target vector
y = df["Mean cumulative mass balance of glaciers"]
y[:48]

0     -2.545
1     -2.662
2     -3.519
3     -3.758
4     -4.016
5     -4.147
6     -4.339
7     -4.534
8     -5.140
9     -5.919
10    -6.726
11    -7.009
12    -7.586
13    -7.475
14    -7.559
15    -7.892
16    -8.399
17    -8.832
18    -8.935
19    -9.242
20    -9.917
21   -10.384
22   -10.819
23   -11.611
24   -12.134
25   -12.861
26   -13.088
27   -13.317
28   -13.755
29   -14.326
30   -15.018
31   -15.758
32   -16.591
33   -17.202
34   -17.619
35   -18.169
36   -19.011
37   -19.920
38   -20.657
39   -21.502
40   -22.285
41   -23.402
42   -24.383
43   -25.152
44   -26.043
45   -27.174
Name: Mean cumulative mass balance of glaciers, dtype: float64

In [7]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [8]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
                                                                                                

In [9]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [10]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Create a linear regression model
model = LinearRegression()

In [12]:
# Train the model
model.fit(X_train, y_train)

In [13]:
# Make predictions
predictions = model.predict(X_test)

In [14]:
# Calculate the mean squared error, model evaluation
print(
  'mean_squared_error : ', mean_squared_error(y_test, predictions))
print(
  'mean_absolute_error : ', mean_absolute_error(y_test, predictions))


# Compute the linear regression model score
model.score(X_test, y_test)


mean_squared_error :  0.13344398097871177
mean_absolute_error :  0.3273768383162952


0.9962796449329785

In [15]:
# Display features
model.feature_names_in_

array(['Year', 'Mean Nitrous Oxide Concentration',
       'Mean Carbon Dioxide Concentration', 'Mean Methane Concentration',
       'Precipitation Anomaly', 'Mean Adjusted Sea Level (inches)',
       'Sea Temperature Anomaly',
       'Earth Surface Temperature Anomaly (land and ocean)'], dtype=object)

In [16]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [ 0.47718963 -0.36988827 -0.4169124   0.00363793  0.10901377  0.06858544
 -0.72948833  0.46540477]


In [17]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: -704.3577862901818


In [18]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = -704.3577862901818 + 0.47718963391684366X


In [19]:
# Display the formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * 2030")

# Predict the 
y_0 = model.intercept_ + model.coef_[0] *2030

# Display the prediction
print(f": {y_0:.3f}")

Model's formula: y = -704.3577862901818 + 0.47718963391684366 * 2030
: 264.337
