In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.3.2'
spark_version = 'spark-3.3.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Hit:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
Get:4 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:5 http://archive.ubuntu.com/ubuntu focal InRelease
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:7 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Get:8 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Hit:9 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:10 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 Packages [3,255 kB]
Hit:11 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Get:12 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2,776 kB]
Hit:13 http://ppa.launchpad.net/ubuntugis/ppa/ubuntu focal InR

In [2]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [3]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("DataFrame Basics").getOrCreate()

In [4]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url = "https://raw.githubusercontent.com/Ekenc/Project4/main/Data/Merged_Target_SeaLevel.csv"
spark.sparkContext.addFile(url)
df = pd.read_csv(url)

# Show DataFrame
df.head()


Unnamed: 0.1,Unnamed: 0,Year,Mean Nitrous Oxide Concentration,Mean Carbon Dioxide Concentration,Mean Methane Concentration,Precipitation Anomaly,Mean Adjusted Sea Level (inches),Sea Temperature Anomaly,Earth Surface Temperature Anomaly (land and ocean)
0,0,1964,291.2,319.62,1260.3,-0.041776,4.169291,-0.546,-0.252
1,1,1967,291.5,322.18,1284.03,-0.096894,4.452756,-0.42,0.0
2,2,1970,293.8,325.620315,1351.7,-0.070516,4.677165,-0.294,0.108
3,3,1971,294.0,326.32,1357.2,0.03224,4.88189,-0.51,-0.126
4,4,1972,295.6,328.74211,1380.1,-0.772485,5.240157,-0.186,0.072


In [5]:
# Define features set
X = df.drop("Mean Adjusted Sea Level (inches)", axis = 1)
X = X.drop("Unnamed: 0", axis = 1)
X= X.drop("Precipitation Anomaly", axis = 1)
X.head()

Unnamed: 0,Year,Mean Nitrous Oxide Concentration,Mean Carbon Dioxide Concentration,Mean Methane Concentration,Sea Temperature Anomaly,Earth Surface Temperature Anomaly (land and ocean)
0,1964,291.2,319.62,1260.3,-0.546,-0.252
1,1967,291.5,322.18,1284.03,-0.42,0.0
2,1970,293.8,325.620315,1351.7,-0.294,0.108
3,1971,294.0,326.32,1357.2,-0.51,-0.126
4,1972,295.6,328.74211,1380.1,-0.186,0.072


In [6]:
# Load the data
url1 = "https://raw.githubusercontent.com/Ekenc/Project4/main/Data/Projected_Future_GHG_and_Precipitation_2020_2050.csv"
df1 = pd.read_csv(url1)

# Show DataFrame
df1.head()

Unnamed: 0.1,Unnamed: 0,Year,Mean Nitrous Oxide Concentration,Mean Carbon Dioxide Concentration,Mean Methane Concentration,Precipitation Anomaly,Sea Temperature Anomaly,Earth Surface Temperature Anomaly (land and ocean)
0,0,2020,330.714233,405.576768,1906.690278,0.328059,0.770041,1.612599
1,1,2021,331.466061,407.257998,1916.355223,0.333086,0.792619,1.645512
2,2,2022,332.217889,408.939228,1926.020169,0.338114,0.815197,1.678425
3,3,2023,332.969718,410.620458,1935.685114,0.343142,0.837775,1.711339
4,4,2024,333.721546,412.301688,1945.350059,0.348169,0.860353,1.744252


In [7]:
df1=df1.drop("Unnamed: 0", axis = 1)
df1= df1.drop("Precipitation Anomaly", axis = 1)
df1.head()

Unnamed: 0,Year,Mean Nitrous Oxide Concentration,Mean Carbon Dioxide Concentration,Mean Methane Concentration,Sea Temperature Anomaly,Earth Surface Temperature Anomaly (land and ocean)
0,2020,330.714233,405.576768,1906.690278,0.770041,1.612599
1,2021,331.466061,407.257998,1916.355223,0.792619,1.645512
2,2022,332.217889,408.939228,1926.020169,0.815197,1.678425
3,2023,332.969718,410.620458,1935.685114,0.837775,1.711339
4,2024,333.721546,412.301688,1945.350059,0.860353,1.744252


In [8]:
# Define target vector
y = df["Mean Adjusted Sea Level (inches)"]
y[:48]

0     4.169291
1     4.452756
2     4.677165
3     4.881890
4     5.240157
5     5.003937
6     5.472441
7     5.409449
8     5.303150
9     5.598425
10    6.153543
11    5.748031
12    5.771654
13    5.795276
14    5.980315
15    6.157480
16    6.248031
17    6.346457
18    6.374016
19    6.303150
20    6.507874
21    6.622047
22    6.783465
23    7.059055
24    6.669291
25    7.003937
26    7.055118
27    7.271654
28    7.366142
29    7.728346
30    7.712598
31    7.716535
32    7.885827
33    7.960630
34    8.303150
35    8.531496
36    8.834646
37    8.897638
38    9.244094
39    8.913386
40    8.579036
41    8.924984
42    9.048399
43    9.110986
44    9.234521
45    9.480223
46    9.592477
Name: Mean Adjusted Sea Level (inches), dtype: float64

In [9]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [10]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
                                                                                                

In [11]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [12]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# Create a linear regression model
model = LinearRegression()

In [14]:
# Train the model
model.fit(X_train, y_train)

In [15]:
# Make predictions
predictions = model.predict(X_test)

In [16]:
# Calculate the mean squared error, model evaluation
print(
  'mean_squared_error : ', mean_squared_error(y_test, predictions))
print(
  'mean_absolute_error : ', mean_absolute_error(y_test, predictions))


# Compute the linear regression model score
model.score(X_test, y_test)


mean_squared_error :  0.04360162099868611
mean_absolute_error :  0.1753387745014119


0.9800281521532895

In [23]:
# Display features
model.feature_names_in_
# Display the slope
print(f"Model's slope: {model.coef_}")
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's slope: [ 0.10469842  0.01123675  0.0158134  -0.0027585   0.79202387 -1.12661952]
Model's y-intercept: -205.8515630899781
Model's formula: y = -205.8515630899781 + 0.10469842051289702X


In [21]:
# Display the formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]} * 2030")

# Predict the 
y_0 = model.intercept_ + model.coef_[0] *2030

# Display the prediction
print(f": {y_0:.3f}")

Model's formula: y = -205.8515630899781 + 0.10469842051289702 * 2030
: 6.686


In [22]:
#Check Model with Predictions data
Sea_Level_Predictions = model.predict(df1)
Sea_Level_Predictions

array([ 9.30244085,  9.39631421,  9.49018756,  9.58406092,  9.67793428,
        9.77180764,  9.86568099,  9.95955435, 10.05342771, 10.14730106,
       10.24117442, 10.33504778, 10.42892113, 10.52279449, 10.61666785,
       10.71054121, 10.80441456, 10.89828792, 10.99216128, 11.08603463,
       11.17990799, 11.27378135, 11.36765471, 11.46152806, 11.55540142,
       11.64927478, 11.74314813, 11.83702149, 11.93089485, 12.02476821,
       12.11864156])

In [27]:
Linear_Prediction_df=df1.copy()
Linear_Prediction_df["Mean Adjusted Sea Level (inches)"] = Sea_Level_Predictions
Linear_Prediction_df = Linear_Prediction_df[['Mean Nitrous Oxide Concentration', 'Mean Carbon Dioxide Concentration', 'Mean Methane Concentration', 'Mean Adjusted Sea Level (inches)', 'Sea Temperature Anomaly', 'Earth Surface Temperature Anomaly (land and ocean)']]
Linear_Prediction_df

Unnamed: 0,Mean Nitrous Oxide Concentration,Mean Carbon Dioxide Concentration,Mean Methane Concentration,Mean Adjusted Sea Level (inches),Sea Temperature Anomaly,Earth Surface Temperature Anomaly (land and ocean)
0,330.714233,405.576768,1906.690278,9.302441,0.770041,1.612599
1,331.466061,407.257998,1916.355223,9.396314,0.792619,1.645512
2,332.217889,408.939228,1926.020169,9.490188,0.815197,1.678425
3,332.969718,410.620458,1935.685114,9.584061,0.837775,1.711339
4,333.721546,412.301688,1945.350059,9.677934,0.860353,1.744252
5,334.473374,413.982918,1955.015005,9.771808,0.882931,1.777165
6,335.225202,415.664148,1964.67995,9.865681,0.905509,1.810079
7,335.97703,417.345378,1974.344895,9.959554,0.928088,1.842992
8,336.728858,419.026608,1984.00984,10.053428,0.950666,1.875905
9,337.480687,420.707838,1993.674786,10.147301,0.973244,1.908819


In [28]:
from google.colab import files
Linear_Prediction_df.to_csv("Linear_Projected_SeaLevel_Dataframe.csv",encoding = 'utf-8-sig')
files.download('Linear_Projected_SeaLevel_Dataframe.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>