## Reading NASA's data as csv files
### The data is composed of slices of the data provided by NASA
### We downloaded the data provided by NASA and did some manual inspection and cleaning
### We used data from the years 2012 and 2022 because solar wind events occurred in those years

In [1]:
import pandas as pd

# Load the 2012 CSV (julymergedcleaned.csv) into a DataFrame and preview it.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the file exists at exactly this location.
JULY_2012_CSV = r"D:\nouran\Competitions\Nasa space apps\julymergedcleaned.csv"
df2012 = pd.read_csv(JULY_2012_CSV)
df2012

Unnamed: 0,Record,fit_flag,year,doy,Proton_V_nonlin,Proton_sigmaV_nonlin,Proton_VX_nonlin,Proton_sigmaVX_nonlin,Proton_VY_nonlin,Proton_sigmaVY_nonlin,...,BX,BY,BZ,Ang_dev,dev,xgse,ygse,zgse,ygsm,zgsm
0,1.0,10.0,-36.0,183.0,663.000000,44.300000,-659.000000,44.500000,21.800000,5.290000,...,2.75,-1.86,5.9100,11.800,0.0881,212.0,-90.7,-2.94,-89.9,-12.1
1,2.0,10.0,-36.0,183.0,656.000000,38.700000,-651.000000,38.900000,22.600000,4.780000,...,3.98,-2.15,5.2600,5.180,0.1440,212.0,-90.7,-2.95,-89.9,-12.1
2,3.0,10.0,-36.0,183.0,650.000000,38.400000,-646.000000,38.600000,20.400000,4.430000,...,4.44,-2.40,5.0300,4.810,0.1880,212.0,-90.7,-2.95,-89.9,-12.1
3,4.0,10.0,-36.0,183.0,655.000000,36.300000,-651.000000,36.600000,19.000000,3.850000,...,4.14,-2.88,5.4700,2.360,0.0754,212.0,-90.7,-2.95,-89.9,-12.0
4,5.0,10.0,-36.0,183.0,659.000000,37.700000,-655.000000,37.900000,19.100000,3.900000,...,3.70,-3.14,5.6200,5.720,0.1290,212.0,-90.7,-2.95,-89.9,-12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8429,810.0,3.0,-36.0,192.0,394.000000,17.000000,-394.000000,17.000000,-1.610000,1.000000,...,-3.07,10.50,0.0672,0.838,0.0286,207.0,-73.7,-9.23,-73.3,-12.3
8430,811.0,3.0,-36.0,192.0,389.000000,17.200000,-389.000000,17.200000,-0.064000,1.000000,...,-3.40,10.30,-1.0700,1.590,0.0247,207.0,-73.7,-9.23,-73.3,-12.2
8431,812.0,3.0,-36.0,192.0,388.000000,16.900000,-388.000000,16.900000,-0.369000,1.000000,...,-3.68,10.30,-1.2400,0.740,0.0173,207.0,-73.7,-9.23,-73.3,-12.2
8432,813.0,3.0,-36.0,192.0,387.000000,17.000000,-387.000000,17.000000,0.629000,1.000000,...,-3.86,10.20,-1.5500,1.350,0.0483,207.0,-73.7,-9.23,-73.3,-12.2


In [2]:
# List the column labels of the 2012 frame as loaded from the CSV.
df2012.columns

Index(['Record', 'fit_flag', 'year', 'doy', 'Proton_V_nonlin',
       'Proton_sigmaV_nonlin', 'Proton_VX_nonlin', 'Proton_sigmaVX_nonlin',
       'Proton_VY_nonlin', 'Proton_sigmaVY_nonlin', 'Proton_VZ_nonlin',
       'Proton_sigmaVZ_nonlin', 'Proton_W_nonlin', 'Proton_sigmaW_nonlin',
       'Proton_Wperp_nonlin', 'Proton_sigmaWperp_nonlin', 'Proton_Wpar_nonlin',
       'Proton_sigmaWpar_nonlin', 'EW_flowangle', 'SigmaEW_flowangle',
       'NS_flowangle', 'SigmaNS_flowangle', 'Proton_Np_nonlin',
       'Proton_sigmaNp_nonlin', 'Alpha_V_nonlin', 'Alpha_sigmaV_nonlin',
       'Alpha_VX_nonlin', 'Alpha_sigmaVX_nonlin', 'Alpha_VY_nonlin',
       'Alpha_sigmaVY_nonlin', 'Alpha_VZ_nonlin', 'Alpha_sigmaVZ_nonlin',
       'Alpha_W_nonlin', 'Alpha_sigmaW_nonlin', 'Alpha_Wperp_nonlin',
       'Alpha_sigmaWperp_nonlin', 'Alpha_Wpar_nonlin',
       'Alpha_sigmaWpar_nonlin', 'Alpha_Na_nonlin', 'Alpha_sigmaNa_nonlin',
       'ChisQ_DOF_nonlin', 'Peak_doy', 'sigmaPeak_doy', 'Proton_V_moment',
       

## A helper function to convert the 'doy' (day-of-year) feature into the day of the month

In [3]:
from datetime import datetime
def getday(doy, year):
    """Return the day of the month that corresponds to a day-of-year.

    Parameters
    ----------
    doy : int or float
        1-based day of the year (e.g. ``183.0``). It is truncated with ``int``
        before parsing.
    year : str or int
        Four-digit year the day belongs to, e.g. ``'2012'`` or ``2012``.
        The year matters because leap years shift every date after February.

    Returns
    -------
    int
        Day of the month (1-31).

    Raises
    ------
    ValueError
        If ``doy`` is NaN or the year / day-of-year pair cannot be parsed.
    """
    # "%j" is the zero-padded-or-not day of the year; strptime resolves it to a
    # full calendar date, from which the day of the month is read directly.
    parsed = datetime.strptime(f"{year}-{int(doy)}", "%Y-%j")
    return parsed.day

## Dropping unimportant or corrupted features and adding more useful features

In [4]:
# Columns removed from the 2012 frame: the two bookkeeping fields and every
# `Alpha_*` column.
dropped_2012 = [
    'Record', 'fit_flag',
    'Alpha_V_nonlin', 'Alpha_sigmaV_nonlin',
    'Alpha_VX_nonlin', 'Alpha_sigmaVX_nonlin',
    'Alpha_VY_nonlin', 'Alpha_sigmaVY_nonlin',
    'Alpha_VZ_nonlin', 'Alpha_sigmaVZ_nonlin',
    'Alpha_W_nonlin', 'Alpha_sigmaW_nonlin',
    'Alpha_Wperp_nonlin', 'Alpha_sigmaWperp_nonlin',
    'Alpha_Wpar_nonlin', 'Alpha_sigmaWpar_nonlin',
    'Alpha_Na_nonlin', 'Alpha_sigmaNa_nonlin',
]

# Drop those columns, overwrite `year` with the constant 2012, append a
# constant `month` column (7), and replace `doy` with the day of the month
# computed by `getday`.
# NOTE: running this cell a second time fails, because the dropped columns no
# longer exist in `df2012`.
df2012 = (
    df2012
    .drop(columns=dropped_2012)
    .assign(
        year=2012,
        month=7,
        doy=lambda frame: frame['doy'].apply(getday, args=['2012']),
    )
)
df2012

Unnamed: 0,year,doy,Proton_V_nonlin,Proton_sigmaV_nonlin,Proton_VX_nonlin,Proton_sigmaVX_nonlin,Proton_VY_nonlin,Proton_sigmaVY_nonlin,Proton_VZ_nonlin,Proton_sigmaVZ_nonlin,...,BY,BZ,Ang_dev,dev,xgse,ygse,zgse,ygsm,zgsm,month
0,2012,1,663.000000,44.300000,-659.000000,44.500000,21.800000,5.290000,67.9000,16.000000,...,-1.86,5.9100,11.800,0.0881,212.0,-90.7,-2.94,-89.9,-12.1,7
1,2012,1,656.000000,38.700000,-651.000000,38.900000,22.600000,4.780000,73.3000,15.100000,...,-2.15,5.2600,5.180,0.1440,212.0,-90.7,-2.95,-89.9,-12.1,7
2,2012,1,650.000000,38.400000,-646.000000,38.600000,20.400000,4.430000,65.6000,13.600000,...,-2.40,5.0300,4.810,0.1880,212.0,-90.7,-2.95,-89.9,-12.1,7
3,2012,1,655.000000,36.300000,-651.000000,36.600000,19.000000,3.850000,72.2000,14.100000,...,-2.88,5.4700,2.360,0.0754,212.0,-90.7,-2.95,-89.9,-12.0,7
4,2012,1,659.000000,37.700000,-655.000000,37.900000,19.100000,3.900000,69.8000,14.000000,...,-3.14,5.6200,5.720,0.1290,212.0,-90.7,-2.95,-89.9,-12.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8429,2012,10,394.000000,17.000000,-394.000000,17.000000,-1.610000,1.000000,2.8200,1.000000,...,10.50,0.0672,0.838,0.0286,207.0,-73.7,-9.23,-73.3,-12.3,7
8430,2012,10,389.000000,17.200000,-389.000000,17.200000,-0.064000,1.000000,6.1200,1.000000,...,10.30,-1.0700,1.590,0.0247,207.0,-73.7,-9.23,-73.3,-12.2,7
8431,2012,10,388.000000,16.900000,-388.000000,16.900000,-0.369000,1.000000,4.9600,1.000000,...,10.30,-1.2400,0.740,0.0173,207.0,-73.7,-9.23,-73.3,-12.2,7
8432,2012,10,387.000000,17.000000,-387.000000,17.000000,0.629000,1.000000,6.1800,1.000000,...,10.20,-1.5500,1.350,0.0483,207.0,-73.7,-9.23,-73.3,-12.2,7


## Reading year 2022 csv data

In [5]:
# Load the 2022 CSV into a DataFrame and preview it.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the file exists at exactly this location.
DATA_2022_CSV = r"D:\nouran\Competitions\Nasa space apps\2022.csv"
df2022 = pd.read_csv(DATA_2022_CSV)
df2022

Unnamed: 0,Record,fit_flag,year,doy,Proton_V_nonlin,Proton_sigmaV_nonlin,Proton_VX_nonlin,Proton_sigmaVX_nonlin,Proton_VY_nonlin,Proton_sigmaVY_nonlin,...,BX,BY,BZ,Ang_dev,dev,xgse,ygse,zgse,ygsm,zgsm
0,1,1,-26,32.0,435,4.35,-432,4.32,13.80,1.00,...,-2.720,5.63,-0.957,2.99,0.0271,258,-21.9,12.9,-25.3,1.91
1,2,0,-26,32.0,424,53.80,-423,54.00,7.89,3.07,...,-2.520,5.47,-1.060,7.14,0.0470,258,-21.9,12.9,-25.3,1.91
2,3,1,-26,32.0,436,22.90,-433,23.00,9.83,1.87,...,-2.020,5.61,-1.610,1.78,0.0264,258,-21.9,12.9,-25.3,1.91
3,4,1,-26,32.0,444,4.44,-441,4.41,11.10,1.00,...,-1.110,5.75,-1.810,5.39,0.0524,258,-21.9,12.9,-25.3,1.92
4,5,1,-26,32.0,461,24.50,-457,24.70,15.50,2.93,...,0.415,5.16,-3.220,3.28,0.0685,258,-21.9,12.9,-25.3,1.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5724,780,1,-26,40.0,393,20.70,-392,20.70,2.86,1.00,...,2.300,-1.75,1.040,3.14,0.0437,253,-47.9,11.9,-47.8,-12.30
5725,781,1,-26,40.0,392,21.70,-390,21.70,6.36,1.38,...,2.380,-1.54,0.974,3.78,0.0361,253,-47.9,11.9,-47.8,-12.30
5726,782,1,-26,40.0,393,21.30,-392,21.30,5.96,1.27,...,2.240,-1.59,1.080,1.88,0.0360,253,-47.9,11.9,-47.8,-12.30
5727,783,1,-26,40.0,393,21.80,-392,21.90,4.64,1.01,...,2.190,-1.57,1.190,3.91,0.0267,253,-47.9,11.9,-47.8,-12.30


In [6]:
# Columns removed from the 2022 frame: the two bookkeeping fields and every
# `Alpha_*` column.
dropped_2022 = [
    'Record', 'fit_flag',
    'Alpha_V_nonlin', 'Alpha_sigmaV_nonlin',
    'Alpha_VX_nonlin', 'Alpha_sigmaVX_nonlin',
    'Alpha_VY_nonlin', 'Alpha_sigmaVY_nonlin',
    'Alpha_VZ_nonlin', 'Alpha_sigmaVZ_nonlin',
    'Alpha_W_nonlin', 'Alpha_sigmaW_nonlin',
    'Alpha_Wperp_nonlin', 'Alpha_sigmaWperp_nonlin',
    'Alpha_Wpar_nonlin', 'Alpha_sigmaWpar_nonlin',
    'Alpha_Na_nonlin', 'Alpha_sigmaNa_nonlin',
]

# Drop those columns, overwrite `year` with the constant 2022, append a
# constant `month` column (2), and replace `doy` with the day of the month
# computed by `getday`.
# NOTE: running this cell a second time fails, because the dropped columns no
# longer exist in `df2022`.
df2022 = (
    df2022
    .drop(columns=dropped_2022)
    .assign(
        year=2022,
        month=2,
        doy=lambda frame: frame['doy'].apply(getday, args=['2022']),
    )
)
df2022

Unnamed: 0,year,doy,Proton_V_nonlin,Proton_sigmaV_nonlin,Proton_VX_nonlin,Proton_sigmaVX_nonlin,Proton_VY_nonlin,Proton_sigmaVY_nonlin,Proton_VZ_nonlin,Proton_sigmaVZ_nonlin,...,BY,BZ,Ang_dev,dev,xgse,ygse,zgse,ygsm,zgsm,month
0,2022,1,435,4.35,-432,4.32,13.80,1.00,43.7,1.00,...,5.63,-0.957,2.99,0.0271,258,-21.9,12.9,-25.3,1.91,2
1,2022,1,424,53.80,-423,54.00,7.89,3.07,38.1,10.50,...,5.47,-1.060,7.14,0.0470,258,-21.9,12.9,-25.3,1.91,2
2,2022,1,436,22.90,-433,23.00,9.83,1.87,43.3,7.56,...,5.61,-1.610,1.78,0.0264,258,-21.9,12.9,-25.3,1.91,2
3,2022,1,444,4.44,-441,4.41,11.10,1.00,50.7,1.00,...,5.75,-1.810,5.39,0.0524,258,-21.9,12.9,-25.3,1.92,2
4,2022,1,461,24.50,-457,24.70,15.50,2.93,59.5,10.70,...,5.16,-3.220,3.28,0.0685,258,-21.9,12.9,-25.3,1.93,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5724,2022,9,393,20.70,-392,20.70,2.86,1.00,-24.7,4.55,...,-1.75,1.040,3.14,0.0437,253,-47.9,11.9,-47.8,-12.30,2
5725,2022,9,392,21.70,-390,21.70,6.36,1.38,-29.0,5.65,...,-1.54,0.974,3.78,0.0361,253,-47.9,11.9,-47.8,-12.30,2
5726,2022,9,393,21.30,-392,21.30,5.96,1.27,-27.1,5.13,...,-1.59,1.080,1.88,0.0360,253,-47.9,11.9,-47.8,-12.30,2
5727,2022,9,393,21.80,-392,21.90,4.64,1.01,-28.5,5.56,...,-1.57,1.190,3.91,0.0267,253,-47.9,11.9,-47.8,-12.30,2


## Using manual inspection, outliers were found
### Outliers were typically values of 100000 in the columns, so these records were removed

In [7]:
# A row is kept only when every one of its columns is strictly below this value.
OUTLIER_THRESHOLD = 100000

# A single vectorised comparison checks all columns at once, so the frame is
# copied only once and the filter does not depend on column names being valid
# identifiers inside a `query` string.
# NaN compares as False, so rows containing missing values are removed too.
keep_row = (df2022 < OUTLIER_THRESHOLD).all(axis=1)
df2022 = df2022.loc[keep_row].copy()

In [8]:
# Preview the 2022 frame after the outlier rows were filtered out.
df2022

Unnamed: 0,year,doy,Proton_V_nonlin,Proton_sigmaV_nonlin,Proton_VX_nonlin,Proton_sigmaVX_nonlin,Proton_VY_nonlin,Proton_sigmaVY_nonlin,Proton_VZ_nonlin,Proton_sigmaVZ_nonlin,...,BY,BZ,Ang_dev,dev,xgse,ygse,zgse,ygsm,zgsm,month
0,2022,1,435,4.35,-432,4.32,13.80,1.00,43.7,1.00,...,5.63,-0.957,2.99,0.0271,258,-21.9,12.9,-25.3,1.91,2
1,2022,1,424,53.80,-423,54.00,7.89,3.07,38.1,10.50,...,5.47,-1.060,7.14,0.0470,258,-21.9,12.9,-25.3,1.91,2
2,2022,1,436,22.90,-433,23.00,9.83,1.87,43.3,7.56,...,5.61,-1.610,1.78,0.0264,258,-21.9,12.9,-25.3,1.91,2
3,2022,1,444,4.44,-441,4.41,11.10,1.00,50.7,1.00,...,5.75,-1.810,5.39,0.0524,258,-21.9,12.9,-25.3,1.92,2
4,2022,1,461,24.50,-457,24.70,15.50,2.93,59.5,10.70,...,5.16,-3.220,3.28,0.0685,258,-21.9,12.9,-25.3,1.93,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5724,2022,9,393,20.70,-392,20.70,2.86,1.00,-24.7,4.55,...,-1.75,1.040,3.14,0.0437,253,-47.9,11.9,-47.8,-12.30,2
5725,2022,9,392,21.70,-390,21.70,6.36,1.38,-29.0,5.65,...,-1.54,0.974,3.78,0.0361,253,-47.9,11.9,-47.8,-12.30,2
5726,2022,9,393,21.30,-392,21.30,5.96,1.27,-27.1,5.13,...,-1.59,1.080,1.88,0.0360,253,-47.9,11.9,-47.8,-12.30,2
5727,2022,9,393,21.80,-392,21.90,4.64,1.01,-28.5,5.56,...,-1.57,1.190,3.91,0.0267,253,-47.9,11.9,-47.8,-12.30,2


In [9]:
# Check the largest remaining Proton_W_nonlin value in the 2022 frame.
df2022['Proton_W_nonlin'].max()

177.0

## Merging the two dataframes (year 2012, year 2022) to make one complete dataset
### and saving the dataset to a csv file

In [10]:
# Stack the 2012 rows on top of the 2022 rows and renumber the index from 0.
yearly_frames = [df2012, df2022]
data = pd.concat(yearly_frames, axis=0, ignore_index=True)

# Save the merged dataset to disk.
# NOTE(review): `to_csv` writes the index as an unnamed first column by
# default; pass `index=False` if readers of this file should not see it.
data.to_csv(r'D:\nouran\Competitions\Nasa space apps\final data.csv')
data

Unnamed: 0,year,doy,Proton_V_nonlin,Proton_sigmaV_nonlin,Proton_VX_nonlin,Proton_sigmaVX_nonlin,Proton_VY_nonlin,Proton_sigmaVY_nonlin,Proton_VZ_nonlin,Proton_sigmaVZ_nonlin,...,BY,BZ,Ang_dev,dev,xgse,ygse,zgse,ygsm,zgsm,month
0,2012,1,663.0,44.3,-659.0,44.5,21.80,5.29,67.9,16.00,...,-1.86,5.910,11.80,0.0881,212.0,-90.7,-2.94,-89.9,-12.1,7
1,2012,1,656.0,38.7,-651.0,38.9,22.60,4.78,73.3,15.10,...,-2.15,5.260,5.18,0.1440,212.0,-90.7,-2.95,-89.9,-12.1,7
2,2012,1,650.0,38.4,-646.0,38.6,20.40,4.43,65.6,13.60,...,-2.40,5.030,4.81,0.1880,212.0,-90.7,-2.95,-89.9,-12.1,7
3,2012,1,655.0,36.3,-651.0,36.6,19.00,3.85,72.2,14.10,...,-2.88,5.470,2.36,0.0754,212.0,-90.7,-2.95,-89.9,-12.0,7
4,2012,1,659.0,37.7,-655.0,37.9,19.10,3.90,69.8,14.00,...,-3.14,5.620,5.72,0.1290,212.0,-90.7,-2.95,-89.9,-12.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13672,2022,9,393.0,20.7,-392.0,20.7,2.86,1.00,-24.7,4.55,...,-1.75,1.040,3.14,0.0437,253.0,-47.9,11.90,-47.8,-12.3,2
13673,2022,9,392.0,21.7,-390.0,21.7,6.36,1.38,-29.0,5.65,...,-1.54,0.974,3.78,0.0361,253.0,-47.9,11.90,-47.8,-12.3,2
13674,2022,9,393.0,21.3,-392.0,21.3,5.96,1.27,-27.1,5.13,...,-1.59,1.080,1.88,0.0360,253.0,-47.9,11.90,-47.8,-12.3,2
13675,2022,9,393.0,21.8,-392.0,21.9,4.64,1.01,-28.5,5.56,...,-1.57,1.190,3.91,0.0267,253.0,-47.9,11.90,-47.8,-12.3,2


## Install libraries needed for machine learning model

In [12]:
!pip install xgboost



You should consider upgrading via the 'c:\users\nouran\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [13]:
!pip install -U scikit-learn



You should consider upgrading via the 'c:\users\nouran\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [14]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

## Splitting the data into 70% training and 30% testing

In [15]:
# Index label at which the ordered dataset is cut: 70% of the row count,
# rounded to the nearest whole row.
split_at = int(round(data.shape[0] * 0.7, 0))

# Rows with an index label below the cut go to training, the rest to testing.
# The rows are not shuffled, so the split follows the order of `data`.
in_train = data.index < split_at
train = data.loc[in_train]
test = data.loc[~in_train]

In [16]:
# Preview the training split.
train

Unnamed: 0,year,doy,Proton_V_nonlin,Proton_sigmaV_nonlin,Proton_VX_nonlin,Proton_sigmaVX_nonlin,Proton_VY_nonlin,Proton_sigmaVY_nonlin,Proton_VZ_nonlin,Proton_sigmaVZ_nonlin,...,BY,BZ,Ang_dev,dev,xgse,ygse,zgse,ygsm,zgsm,month
0,2012,1,663.0,44.30,-659.0,44.50,21.8,5.29,67.9,16.00,...,-1.86,5.910,11.80,0.0881,212.0,-90.7,-2.94,-89.9,-12.10,7
1,2012,1,656.0,38.70,-651.0,38.90,22.6,4.78,73.3,15.10,...,-2.15,5.260,5.18,0.1440,212.0,-90.7,-2.95,-89.9,-12.10,7
2,2012,1,650.0,38.40,-646.0,38.60,20.4,4.43,65.6,13.60,...,-2.40,5.030,4.81,0.1880,212.0,-90.7,-2.95,-89.9,-12.10,7
3,2012,1,655.0,36.30,-651.0,36.60,19.0,3.85,72.2,14.10,...,-2.88,5.470,2.36,0.0754,212.0,-90.7,-2.95,-89.9,-12.00,7
4,2012,1,659.0,37.70,-655.0,37.90,19.1,3.90,69.8,14.00,...,-3.14,5.620,5.72,0.1290,212.0,-90.7,-2.95,-89.9,-12.00,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9570,2022,2,452.0,18.90,-447.0,19.10,-54.2,8.07,-42.7,6.07,...,5.37,3.460,3.24,0.0283,257.0,-27.0,12.80,-28.7,8.56,2
9571,2022,2,451.0,21.30,-445.0,21.50,-49.3,8.53,-45.9,7.44,...,4.53,4.650,3.36,0.1200,257.0,-27.0,12.80,-28.7,8.54,2
9572,2022,2,451.0,12.60,-445.0,12.70,-47.8,5.38,-51.4,5.27,...,5.37,1.960,13.40,0.0429,257.0,-27.0,12.80,-28.7,8.51,2
9573,2022,2,450.0,17.40,-445.0,17.50,-45.7,7.30,-54.9,7.68,...,5.67,0.361,9.21,0.0614,257.0,-27.1,12.80,-28.7,8.49,2


In [17]:
# Preview the test split.
test

Unnamed: 0,year,doy,Proton_V_nonlin,Proton_sigmaV_nonlin,Proton_VX_nonlin,Proton_sigmaVX_nonlin,Proton_VY_nonlin,Proton_sigmaVY_nonlin,Proton_VZ_nonlin,Proton_sigmaVZ_nonlin,...,BY,BZ,Ang_dev,dev,xgse,ygse,zgse,ygsm,zgsm,month
9575,2022,2,434.0,4.34,-433.0,4.33,-31.70,1.00,-14.500,1.00,...,3.53,-5.350,11.00,0.0498,257.0,-27.1,12.8,-28.7,8.44,2
9576,2022,2,433.0,4.33,-433.0,4.33,-20.60,1.00,-15.600,1.00,...,2.57,-5.860,7.25,0.4330,257.0,-27.1,12.8,-28.7,8.42,2
9577,2022,2,433.0,4.33,-432.0,4.32,-25.90,1.00,0.541,1.00,...,3.32,-6.480,7.81,0.1300,257.0,-27.1,12.8,-28.7,8.37,2
9578,2022,2,435.0,4.35,-434.0,4.34,-34.20,1.00,3.000,1.00,...,3.31,-6.600,4.91,0.0382,257.0,-27.1,12.8,-28.7,8.35,2
9579,2022,2,437.0,4.37,-437.0,4.37,-24.80,1.00,6.750,1.00,...,1.89,-7.130,6.17,0.0418,257.0,-27.1,12.8,-28.8,8.33,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13672,2022,9,393.0,20.70,-392.0,20.70,2.86,1.00,-24.700,4.55,...,-1.75,1.040,3.14,0.0437,253.0,-47.9,11.9,-47.8,-12.30,2
13673,2022,9,392.0,21.70,-390.0,21.70,6.36,1.38,-29.000,5.65,...,-1.54,0.974,3.78,0.0361,253.0,-47.9,11.9,-47.8,-12.30,2
13674,2022,9,393.0,21.30,-392.0,21.30,5.96,1.27,-27.100,5.13,...,-1.59,1.080,1.88,0.0360,253.0,-47.9,11.9,-47.8,-12.30,2
13675,2022,9,393.0,21.80,-392.0,21.90,4.64,1.01,-28.500,5.56,...,-1.57,1.190,3.91,0.0267,253.0,-47.9,11.9,-47.8,-12.30,2


In [18]:
# List the column labels available in the training split.
train.columns

Index(['year', 'doy', 'Proton_V_nonlin', 'Proton_sigmaV_nonlin',
       'Proton_VX_nonlin', 'Proton_sigmaVX_nonlin', 'Proton_VY_nonlin',
       'Proton_sigmaVY_nonlin', 'Proton_VZ_nonlin', 'Proton_sigmaVZ_nonlin',
       'Proton_W_nonlin', 'Proton_sigmaW_nonlin', 'Proton_Wperp_nonlin',
       'Proton_sigmaWperp_nonlin', 'Proton_Wpar_nonlin',
       'Proton_sigmaWpar_nonlin', 'EW_flowangle', 'SigmaEW_flowangle',
       'NS_flowangle', 'SigmaNS_flowangle', 'Proton_Np_nonlin',
       'Proton_sigmaNp_nonlin', 'ChisQ_DOF_nonlin', 'Peak_doy',
       'sigmaPeak_doy', 'Proton_V_moment', 'Proton_VX_moment',
       'Proton_VY_moment', 'Proton_VZ_moment', 'Proton_W_moment',
       'Proton_Wperp_moment', 'Proton_Wpar_moment', 'Proton_Np_moment', 'BX',
       'BY', 'BZ', 'Ang_dev', 'dev', 'xgse', 'ygse', 'zgse', 'ygsm', 'zgsm',
       'month'],
      dtype='object')

## Selecting the features and target variables for train and test data

In [19]:
# Column the models are trained to predict.
target = 'Proton_VZ_nonlin'

# Model inputs, kept in this fixed order.
# NOTE(review): 'Proton_VZ_moment' and 'Proton_sigmaVZ_nonlin' look closely
# related to the target; confirm that using them is not target leakage.
features = [
    'year', 'doy',
    'Proton_V_nonlin', 'Proton_sigmaV_nonlin',
    'Proton_VX_nonlin', 'Proton_sigmaVX_nonlin',
    'Proton_VY_nonlin', 'Proton_sigmaVY_nonlin',
    'Proton_sigmaVZ_nonlin',
    'Proton_W_nonlin', 'Proton_sigmaW_nonlin',
    'Proton_Wperp_nonlin', 'Proton_sigmaWperp_nonlin',
    'Proton_Wpar_nonlin', 'Proton_sigmaWpar_nonlin',
    'EW_flowangle', 'SigmaEW_flowangle',
    'NS_flowangle', 'SigmaNS_flowangle',
    'Proton_Np_nonlin', 'Proton_sigmaNp_nonlin',
    'ChisQ_DOF_nonlin',
    'Peak_doy', 'sigmaPeak_doy',
    'Proton_V_moment', 'Proton_VX_moment',
    'Proton_VY_moment', 'Proton_VZ_moment',
    'Proton_W_moment', 'Proton_Wperp_moment',
    'Proton_Wpar_moment', 'Proton_Np_moment',
    'BX', 'BY', 'BZ',
    'Ang_dev', 'dev',
    'xgse', 'ygse', 'zgse', 'ygsm', 'zgsm',
    'month',
]

# Split each partition into its input matrix and its target vector.
x_train, y_train = train[features], train[target]
x_test, y_test = test[features], test[target]

In [20]:
# Preview the training feature matrix.
x_train

Unnamed: 0,year,doy,Proton_V_nonlin,Proton_sigmaV_nonlin,Proton_VX_nonlin,Proton_sigmaVX_nonlin,Proton_VY_nonlin,Proton_sigmaVY_nonlin,Proton_sigmaVZ_nonlin,Proton_W_nonlin,...,BY,BZ,Ang_dev,dev,xgse,ygse,zgse,ygsm,zgsm,month
0,2012,1,663.0,44.30,-659.0,44.50,21.8,5.29,16.00,70.5,...,-1.86,5.910,11.80,0.0881,212.0,-90.7,-2.94,-89.9,-12.10,7
1,2012,1,656.0,38.70,-651.0,38.90,22.6,4.78,15.10,67.5,...,-2.15,5.260,5.18,0.1440,212.0,-90.7,-2.95,-89.9,-12.10,7
2,2012,1,650.0,38.40,-646.0,38.60,20.4,4.43,13.60,69.4,...,-2.40,5.030,4.81,0.1880,212.0,-90.7,-2.95,-89.9,-12.10,7
3,2012,1,655.0,36.30,-651.0,36.60,19.0,3.85,14.10,64.9,...,-2.88,5.470,2.36,0.0754,212.0,-90.7,-2.95,-89.9,-12.00,7
4,2012,1,659.0,37.70,-655.0,37.90,19.1,3.90,14.00,65.7,...,-3.14,5.620,5.72,0.1290,212.0,-90.7,-2.95,-89.9,-12.00,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9570,2022,2,452.0,18.90,-447.0,19.10,-54.2,8.07,6.07,40.4,...,5.37,3.460,3.24,0.0283,257.0,-27.0,12.80,-28.7,8.56,2
9571,2022,2,451.0,21.30,-445.0,21.50,-49.3,8.53,7.44,42.7,...,4.53,4.650,3.36,0.1200,257.0,-27.0,12.80,-28.7,8.54,2
9572,2022,2,451.0,12.60,-445.0,12.70,-47.8,5.38,5.27,46.9,...,5.37,1.960,13.40,0.0429,257.0,-27.0,12.80,-28.7,8.51,2
9573,2022,2,450.0,17.40,-445.0,17.50,-45.7,7.30,7.68,45.9,...,5.67,0.361,9.21,0.0614,257.0,-27.1,12.80,-28.7,8.49,2


In [21]:
# Preview the training target values (the Proton_VZ_nonlin column).
y_train

0       67.9
1       73.3
2       65.6
3       72.2
4       69.8
        ... 
9570   -42.7
9571   -45.9
9572   -51.4
9573   -54.9
9574   -36.7
Name: Proton_VZ_nonlin, Length: 9575, dtype: float64

## Training an XGBoost Model with learning rate = 0.01 on the data and testing it

# Gradient-boosted tree regressor: up to 1000 trees of depth 3, learning
# rate 0.01, training stopped after 50 rounds without improvement.
reg = xgb.XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    n_estimators=1000,
    early_stopping_rounds=50,
    # 'reg:squarederror' is the current name of the squared-error objective;
    # the old alias 'reg:linear' is deprecated.
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.01,
)

# Report RMSE on both sets every 100 rounds.
# NOTE(review): the last entry of `eval_set` drives early stopping, so the
# test data influences when training stops; a separate validation split
# would keep the test score unbiased.
reg.fit(
    x_train, y_train,
    eval_set=[(x_train, y_train), (x_test, y_test)],
    verbose=100,
)

In [24]:
import numpy as np
# Predict the target for the held-out rows with the first model.
y_predicted = reg.predict(x_test)

# RMSE is the square root of the mean squared error, in the target's units.
test_mse = mean_squared_error(y_test, y_predicted)
score = np.sqrt(test_mse)
print(f'RMSE Score on Test set: {score:0.2f}')

RMSE Score on Test set: 2.29


In [25]:
from sklearn.metrics import max_error
# Largest absolute difference between a true and a predicted test value.
max_error(y_test, y_predicted)

35.72547149658203

## Training an XGBoost Model with learning rate = 0.1 on the data and testing it 

In [26]:
# Gradient-boosted tree regressor: up to 1000 trees of depth 3, learning
# rate 0.1, training stopped after 50 rounds without improvement.
reg2 = xgb.XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    n_estimators=1000,
    early_stopping_rounds=50,
    # 'reg:squarederror' is the current name of the squared-error objective;
    # the old alias 'reg:linear' is deprecated.
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.1,
)

# Report RMSE on both sets every 100 rounds.
# NOTE(review): the last entry of `eval_set` drives early stopping, so the
# test data influences when training stops; a separate validation split
# would keep the test score unbiased.
reg2.fit(
    x_train, y_train,
    eval_set=[(x_train, y_train), (x_test, y_test)],
    verbose=100,
)

# Score the second model on the held-out rows. This overwrites the
# `y_predicted` and `score` values produced for the first model.
y_predicted = reg2.predict(x_test)
score = np.sqrt(mean_squared_error(y_test, y_predicted))
print(f'RMSE Score on Test set: {score:0.2f}')

# Largest absolute difference between a true and a predicted test value.
from sklearn.metrics import max_error
max_error(y_test, y_predicted)

[0]	validation_0-rmse:17.56574	validation_1-rmse:21.79558
[100]	validation_0-rmse:1.06882	validation_1-rmse:2.34769
[200]	validation_0-rmse:0.83201	validation_1-rmse:2.14540
[300]	validation_0-rmse:0.67702	validation_1-rmse:1.98898
[400]	validation_0-rmse:0.57447	validation_1-rmse:1.86993
[500]	validation_0-rmse:0.50576	validation_1-rmse:1.80256
[600]	validation_0-rmse:0.44846	validation_1-rmse:1.74580
[700]	validation_0-rmse:0.40331	validation_1-rmse:1.69757
[800]	validation_0-rmse:0.36813	validation_1-rmse:1.65823
[900]	validation_0-rmse:0.34186	validation_1-rmse:1.63566
[999]	validation_0-rmse:0.31744	validation_1-rmse:1.61262
RMSE Score on Test set: 1.61


23.744861602783203

## The model can be used for forecasting by predicting the value of the target parameter and then calculating how intense the solar wind is (i.e. Kp index)