In [1]:
import pandas as pd
import rasterio
from rasterstats import zonal_stats
from osgeo import gdal
import geopandas as gpd
from rasterio.warp import calculate_default_transform, reproject, Resampling
import numpy as np
from shapely.geometry import Point
from shapely.geometry import Polygon
from pyproj import CRS
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pyproj
import netCDF4 as nc
import glob
import pickle 
import random
from tqdm import tqdm  
import datetime as datetime
import gc
import os
import h5py
from scipy.stats import pearson3, norm
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

### Nightlights

#### Daily (25th and 26th August, 2018)

In [3]:
# Import packages
import os
import warnings
import glob
import viirs

In [4]:
# Set options
warnings.simplefilter("ignore")

In [5]:
# Define path to folder containing input VNP46A2 HDF5 files

hdf5_input_folder = "C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2"

# Defne path to output folder to store exported GeoTiff files

geotiff_output_folder = "C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2-Processed"

In [6]:
hdf5_files = glob.glob(os.path.join(hdf5_input_folder, "*.h5"))

In [7]:
# Preprocess each HDF5 file (extract bands, mask for fill values,
#  poor-quality, no retrieval, clouds, sea water, fill masked values
#  with NaN, export to GeoTiff)
hdf5_files = glob.glob(os.path.join(hdf5_input_folder, "*.h5"))
processed_files = 0
total_files = len(hdf5_files)
for hdf5 in hdf5_files:
    viirs.preprocess_vnp46a2(
        hdf5_path=hdf5, output_folder=geotiff_output_folder
    )
    processed_files += 1
    print(f"Preprocessed file: {processed_files} of {total_files}\n\n")

Started preprocessing: VNP46A2.A2017237.h07v05.001.2021116121853.h5
Extracting bands...
Applying scale factor...
Masking for fill values...
Masking for poor quality and no retrieval...
Masking for clouds...
Masking for sea water...
Filling masked values...
Creating metadata...
Exporting to GeoTiff...
Exported: vnp46a2-a2017237-h07v05-001-2021116121853.tif
Completed preprocessing: VNP46A2.A2017237.h07v05.001.2021116121853.h5

Preprocessed file: 1 of 8


Started preprocessing: VNP46A2.A2017237.h07v06.001.2021116114957.h5
Extracting bands...
Applying scale factor...
Masking for fill values...
Masking for poor quality and no retrieval...
Masking for clouds...
Masking for sea water...
Filling masked values...
Creating metadata...
Exporting to GeoTiff...
Exported: vnp46a2-a2017237-h07v06-001-2021116114957.tif
Completed preprocessing: VNP46A2.A2017237.h07v06.001.2021116114957.h5

Preprocessed file: 2 of 8


Started preprocessing: VNP46A2.A2017237.h08v05.001.2021116115851.h5
Extracting bands..

In [8]:
#tiff files we have are two vertical files so we will join them here

# Define path to folder containing preprocessed VNP46A1 GeoTiff files
geotiff_input_folder ="C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2-Processed"

# Defne path to output folder to store concatenated, exported GeoTiff files
geotiff_output_folder = "C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2-Concatenate"

# Set start date and end date for processing
start_date, end_date = "2018-08-25", "2018-08-26"

In [9]:
viirs.concatenate(
            input_folder=geotiff_input_folder,
            output_folder=geotiff_output_folder,
            name="asd"
        )

 50%|█████     | 1/2 [00:01<00:01,  1.02s/it]

Processed Date: 08-25-2017


100%|██████████| 2/2 [00:01<00:00,  1.02it/s]

Processed Date: 08-26-2017





#### Monthly

In [16]:
# Define path to folder containing input VNP46A2 HDF5 files

hdf5_input_folder = "C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2_Monthly"

# Defne path to output folder to store exported GeoTiff files

geotiff_output_folder = "C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2_Monthly-Processed"

In [17]:
hdf5_files = glob.glob(os.path.join(hdf5_input_folder, "*.h5"))

In [18]:
hdf5_files

['C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2_Monthly\\VNP46A3.A2018152.h07v05.001.2021125173146.h5',
 'C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2_Monthly\\VNP46A3.A2018152.h07v06.001.2021125183705.h5',
 'C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2_Monthly\\VNP46A3.A2018152.h08v05.001.2021125173205.h5',
 'C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2_Monthly\\VNP46A3.A2018152.h08v06.001.2021125173156.h5']

In [19]:
# Preprocess each HDF5 file (extract bands, mask for fill values,
#  poor-quality, no retrieval, clouds, sea water, fill masked values
#  with NaN, export to GeoTiff)
hdf5_files = glob.glob(os.path.join(hdf5_input_folder, "*.h5"))
processed_files = 0
total_files = len(hdf5_files)
for hdf5 in hdf5_files:
    viirs.preprocess_vnp46a3(
        hdf5_path=hdf5, output_folder=geotiff_output_folder
    )
    processed_files += 1
    print(f"Preprocessed file: {processed_files} of {total_files}\n\n")

Started preprocessing: VNP46A3.A2018152.h07v05.001.2021125173146.h5
Extracting bands...
Applying scale factor...
Masking for fill values...
Filling masked values...
Creating metadata...
Exporting to GeoTiff...
Exported: vnp46a3-a2018152-h07v05-001-2021125173146.tif
Completed preprocessing: VNP46A3.A2018152.h07v05.001.2021125173146.h5

Preprocessed file: 1 of 4


Started preprocessing: VNP46A3.A2018152.h07v06.001.2021125183705.h5
Extracting bands...
Applying scale factor...
Masking for fill values...
Filling masked values...
Creating metadata...
Exporting to GeoTiff...
Exported: vnp46a3-a2018152-h07v06-001-2021125183705.tif
Completed preprocessing: VNP46A3.A2018152.h07v06.001.2021125183705.h5

Preprocessed file: 2 of 4


Started preprocessing: VNP46A3.A2018152.h08v05.001.2021125173205.h5
Extracting bands...
Applying scale factor...
Masking for fill values...
Filling masked values...
Creating metadata...
Exporting to GeoTiff...
Exported: vnp46a3-a2018152-h08v05-001-2021125173205.tif
Comp

In [20]:
# Define path to folder containing preprocessed VNP46A1 GeoTiff files
geotiff_input_folder ="C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2_Monthly-Processed"

# Defne path to output folder to store concatenated, exported GeoTiff files
geotiff_output_folder = "C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/VNP46A2_Monthly-Concatenate"

In [21]:
viirs.concatenate(
            input_folder=geotiff_input_folder,
            output_folder=geotiff_output_folder,
            name="asd"
        )

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:01<00:00,  1.32s/it]

Processed Date: 06-01-2018





In [22]:
day_26 = pd.read_csv("C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/26th_Daily.csv")
day_26.head()

Unnamed: 0,id,left,top,right,bottom,_count,_sum,_mean
0,1,-674640.626252,-919588.25018,-674140.626252,-920088.25018,,,
1,2,-674640.626252,-920088.25018,-674140.626252,-920588.25018,,,
2,3,-674640.626252,-920588.25018,-674140.626252,-921088.25018,,,
3,4,-674640.626252,-921088.25018,-674140.626252,-921588.25018,,,
4,5,-674640.626252,-921588.25018,-674140.626252,-922088.25018,,,


In [23]:
day_26 = day_26[["id","_mean"]].rename(columns={"_mean":"day_26"})
day_26.head()

Unnamed: 0,id,day_26
0,1,
1,2,
2,3,
3,4,
4,5,


In [24]:
night_monthly = pd.read_csv("C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Nightlights/Harvey/Monthly_Nightlights_Harvey.csv")
night_monthly.head()

Unnamed: 0,id,left,top,right,bottom,_count,_sum,_mean
0,1,-674640.626252,-919588.25018,-674140.626252,-920088.25018,,,
1,2,-674640.626252,-920088.25018,-674140.626252,-920588.25018,,,
2,3,-674640.626252,-920588.25018,-674140.626252,-921088.25018,,,
3,4,-674640.626252,-921088.25018,-674140.626252,-921588.25018,,,
4,5,-674640.626252,-921588.25018,-674140.626252,-922088.25018,,,


In [25]:
night_monthly = night_monthly[["id","_mean"]].rename(columns={"_mean":"nightlight_prev"})
night_monthly.head()

Unnamed: 0,id,nightlight_prev
0,1,
1,2,
2,3,
3,4,
4,5,


In [26]:
outage = day_26.merge(night_monthly,on="id")
outage["outage"] = (outage["nightlight_prev"]-outage["day_26"])/outage["nightlight_prev"]*100
outage.head()

Unnamed: 0,id,day_26,nightlight_prev,outage
0,1,,,
1,2,,,
2,3,,,
3,4,,,
4,5,,,


In [27]:
outage.shape

(6389860, 4)

In [28]:
outage = outage[outage["outage"]>0]
outage.head()

Unnamed: 0,id,day_26,nightlight_prev,outage
206983,206984,1.4,2.3,39.130435
206984,206985,1.696302,2.426987,30.106655
206985,206986,2.1,2.45,14.285714
206986,206987,2.1,2.3,8.695652
206991,206992,1.5,6.2,75.806452


In [29]:
outage.shape

(72197, 4)

### Rainfall

In [30]:
files_list = sorted(glob.glob("C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Rainfall/Harvey/August_26.nc4"))
files_list[0]

'C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Rainfall/Harvey/August_26.nc4'

In [31]:
# Open the .nc4 file
file_path = files_list[0]
dataset = nc.Dataset(file_path)

dataset

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    CDI: Climate Data Interface version 1.9.8 (https://mpimet.mpg.de/cdi)
    Conventions: CF-1.6
    BeginDate: 2017-08-26
    BeginTime: 00:00:00.000Z
    EndDate: 2017-08-26
    EndTime: 23:59:59.999Z
    FileHeader: StartGranuleDateTime=2017-08-26T00:00:00.000Z;
StopGranuleDateTime=2017-08-26T23:59:59.999Z
    InputPointer: 3B-HHR.MS.MRG.3IMERG.20170826-S000000-E002959.0000.V06B.HDF5;3B-HHR.MS.MRG.3IMERG.20170826-S003000-E005959.0030.V06B.HDF5;3B-HHR.MS.MRG.3IMERG.20170826-S010000-E012959.0060.V06B.HDF5;3B-HHR.MS.MRG.3IMERG.20170826-S013000-E015959.0090.V06B.HDF5;3B-HHR.MS.MRG.3IMERG.20170826-S020000-E022959.0120.V06B.HDF5;3B-HHR.MS.MRG.3IMERG.20170826-S023000-E025959.0150.V06B.HDF5;3B-HHR.MS.MRG.3IMERG.20170826-S030000-E032959.0180.V06B.HDF5;3B-HHR.MS.MRG.3IMERG.20170826-S033000-E035959.0210.V06B.HDF5;3B-HHR.MS.MRG.3IMERG.20170826-S040000-E042959.0240.V06B.HDF5;3B-HHR.MS.MRG.3IMERG.20170826-S0430

In [32]:
# Define the coordinates in New Jersey as a DataFrame
tx_coordinates_df = pd.read_csv("C:/users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Landcover/Temp/latlong_Texas_degrees.csv")
tx_coordinates_df.head()

Unnamed: 0.1,Unnamed: 0,Center_Latitude,Center_Longitude
0,0,36.44359,-107.514549
1,1,36.439094,-107.514092
2,2,36.434599,-107.513635
3,3,36.430103,-107.513178
4,4,36.425608,-107.512721


In [34]:
import netCDF4 as nc
import numpy as np
from scipy.spatial import cKDTree
import pandas as pd

# Open the netCDF file
dataset = nc.Dataset(files_list[0])

# Read the variable
precipitation = dataset.variables['precipitationCal'][:]

# Get the longitude and latitude values
lon = dataset.variables['lon'][:]
lat = dataset.variables['lat'][:]

# Reshape the longitude and latitude arrays to have the same dimensions
lon_2d, lat_2d = np.meshgrid(lon, lat)

# Create a KDTree from the reshaped longitude and latitude arrays
tree = cKDTree(np.column_stack((lon_2d.ravel(), lat_2d.ravel())))

# Define the coordinates in New Jersey as a DataFrame
#nj_coordinates_df = pd.read_csv("C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Landcover/Temp/latlong_NJ_degrees.csv")

# Find the nearest grid points to the New Jersey coordinates
distances, indices = tree.query(np.column_stack((tx_coordinates_df['Center_Longitude'], tx_coordinates_df['Center_Latitude'])))

# Filter out indices that are out of bounds
valid_indices = np.logical_and(indices >= 0, indices < lon.shape[0] * lat.shape[0])

# Get the valid indices and rainfall values
valid_indices = valid_indices.nonzero()[0]
rainfall_values = precipitation.ravel()[indices[valid_indices]]

# Create a new DataFrame with the valid indices and rainfall values
tx_coordinates_df_filtered = tx_coordinates_df.loc[valid_indices].copy()
tx_coordinates_df_filtered['Rainfall'] = rainfall_values

tx_coordinates_df_filtered.head()

Unnamed: 0.1,Unnamed: 0,Center_Latitude,Center_Longitude,Rainfall
0,0,36.44359,-107.514549,0.807318
1,1,36.439094,-107.514092,0.807318
2,2,36.434599,-107.513635,0.807318
3,3,36.430103,-107.513178,0.807318
4,4,36.425608,-107.512721,0.807318


In [35]:
rainfall = tx_coordinates_df_filtered.copy()
rainfall = rainfall.rename(columns={'Unnamed: 0': 'id'})
rainfall.head()

Unnamed: 0,id,Center_Latitude,Center_Longitude,Rainfall
0,0,36.44359,-107.514549,0.807318
1,1,36.439094,-107.514092,0.807318
2,2,36.434599,-107.513635,0.807318
3,3,36.430103,-107.513178,0.807318
4,4,36.425608,-107.512721,0.807318


### Landcover

In [36]:
landcover = pd.read_csv("C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Landcover/landcover_zonal_Texas.csv")
landcover.head()

Unnamed: 0,id,left,top,right,bottom,HISTO_0,HISTO_1,HISTO_3,HISTO_4,HISTO_5,HISTO_6,HISTO_7,HISTO_8,HISTO_9,HISTO_10,HISTO_14,HISTO_15,HISTO_16,HISTO_17,HISTO_18,HISTO_NODATA
0,1,-631830,-938170,-631330,-938670,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,289
1,2,-631830,-938670,-631330,-939170,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,272
2,3,-631830,-939170,-631330,-939670,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,289
3,4,-631830,-939670,-631330,-940170,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,289
4,5,-631830,-940170,-631330,-940670,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,272


In [37]:
landcover.columns

Index(['id', 'left', 'top', 'right', 'bottom', 'HISTO_0', 'HISTO_1', 'HISTO_3',
       'HISTO_4', 'HISTO_5', 'HISTO_6', 'HISTO_7', 'HISTO_8', 'HISTO_9',
       'HISTO_10', 'HISTO_14', 'HISTO_15', 'HISTO_16', 'HISTO_17', 'HISTO_18',
       'HISTO_NODATA'],
      dtype='object')

In [38]:
histo_columns = ['HISTO_0', 'HISTO_1', 'HISTO_3',
       'HISTO_4', 'HISTO_5', 'HISTO_6', 'HISTO_7', 'HISTO_8', 'HISTO_9',
       'HISTO_10', 'HISTO_14', 'HISTO_15', 'HISTO_16', 'HISTO_17', 'HISTO_18',
       'HISTO_NODATA']

# Convert HISTO columns to percentages
landcover[histo_columns] = landcover[histo_columns].div(landcover[histo_columns].sum(axis=1), axis=0) * 100
landcover.head()

Unnamed: 0,id,left,top,right,bottom,HISTO_0,HISTO_1,HISTO_3,HISTO_4,HISTO_5,HISTO_6,HISTO_7,HISTO_8,HISTO_9,HISTO_10,HISTO_14,HISTO_15,HISTO_16,HISTO_17,HISTO_18,HISTO_NODATA
0,1,-631830,-938170,-631330,-938670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
1,2,-631830,-938670,-631330,-939170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
2,3,-631830,-939170,-631330,-939670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
3,4,-631830,-939670,-631330,-940170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
4,5,-631830,-940170,-631330,-940670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0


### Wind

In [39]:
wind = pd.read_csv("C:/Users/omhai/OneDrive/Desktop/Shetty/Capstone Project/Wind/Texas_Akshay.csv")
wind.head()

Unnamed: 0,ID,Center_Latitude,Center_Longitude,Vmax
0,0,36.44359,-107.514549,0.0
1,1,36.439094,-107.514092,0.0
2,2,36.434599,-107.513635,0.0
3,3,36.430103,-107.513178,0.0
4,4,36.425608,-107.512721,0.0


In [40]:
wind = wind[["ID","Vmax"]]

### ML

In [41]:
reg_harvey = outage.merge(wind,left_on="id",right_on="ID")
reg_harvey = reg_harvey.merge(landcover,on="id")
reg_harvey = reg_harvey.merge(rainfall,on="id")
reg_harvey.head()

Unnamed: 0,id,day_26,nightlight_prev,outage,ID,Vmax,left,top,right,bottom,HISTO_0,HISTO_1,HISTO_3,HISTO_4,HISTO_5,HISTO_6,HISTO_7,HISTO_8,HISTO_9,HISTO_10,HISTO_14,HISTO_15,HISTO_16,HISTO_17,HISTO_18,HISTO_NODATA,Center_Latitude,Center_Longitude,Rainfall
0,206984,1.4,2.3,39.130435,206984,6.328723,-587830,-1029670,-587330,-1030170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,31.904865,-106.641237,0.076977
1,206985,1.696302,2.426987,30.106655,206985,6.329144,-587830,-1030170,-587330,-1030670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,31.900344,-106.640882,0.076977
2,206986,2.1,2.45,14.285714,206986,6.329567,-587830,-1030670,-587330,-1031170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,31.895823,-106.640526,0.040701
3,206987,2.1,2.3,8.695652,206987,6.329989,-587830,-1031170,-587330,-1031670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,31.891301,-106.640171,0.040701
4,206992,1.5,6.2,75.806452,206992,6.375113,-587830,-1033670,-587330,-1034170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,31.868695,-106.638396,0.040701


In [42]:
reg_harvey.columns

Index(['id', 'day_26', 'nightlight_prev', 'outage', 'ID', 'Vmax', 'left',
       'top', 'right', 'bottom', 'HISTO_0', 'HISTO_1', 'HISTO_3', 'HISTO_4',
       'HISTO_5', 'HISTO_6', 'HISTO_7', 'HISTO_8', 'HISTO_9', 'HISTO_10',
       'HISTO_14', 'HISTO_15', 'HISTO_16', 'HISTO_17', 'HISTO_18',
       'HISTO_NODATA', 'Center_Latitude', 'Center_Longitude', 'Rainfall'],
      dtype='object')

In [43]:
reg_harvey = reg_harvey[['nightlight_prev', 'outage','Vmax', 'HISTO_0', 'HISTO_1', 'HISTO_3', 'HISTO_4',
       'HISTO_5', 'HISTO_6', 'HISTO_7', 'HISTO_8', 'HISTO_9', 'HISTO_10',
       'HISTO_14', 'HISTO_15', 'HISTO_16', 'HISTO_17', 'HISTO_18',
       'Rainfall']]

reg_harvey.head()

Unnamed: 0,nightlight_prev,outage,Vmax,HISTO_0,HISTO_1,HISTO_3,HISTO_4,HISTO_5,HISTO_6,HISTO_7,HISTO_8,HISTO_9,HISTO_10,HISTO_14,HISTO_15,HISTO_16,HISTO_17,HISTO_18,Rainfall
0,2.3,39.130435,6.328723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076977
1,2.426987,30.106655,6.329144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076977
2,2.45,14.285714,6.329567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040701
3,2.3,8.695652,6.329989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040701
4,6.2,75.806452,6.375113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040701


In [44]:
nan_count = reg_harvey.isna().sum()
print(nan_count)

nightlight_prev    0
outage             0
Vmax               0
HISTO_0            0
HISTO_1            0
HISTO_3            0
HISTO_4            0
HISTO_5            0
HISTO_6            0
HISTO_7            0
HISTO_8            0
HISTO_9            0
HISTO_10           0
HISTO_14           0
HISTO_15           0
HISTO_16           0
HISTO_17           0
HISTO_18           0
Rainfall           0
dtype: int64


In [45]:
reg_harvey.shape

(8523, 19)

In [47]:
reg_harvey.head()

Unnamed: 0,nightlight_prev,outage,Vmax,HISTO_0,HISTO_1,HISTO_3,HISTO_4,HISTO_5,HISTO_6,HISTO_7,HISTO_8,HISTO_9,HISTO_10,HISTO_14,HISTO_15,HISTO_16,HISTO_17,HISTO_18,Rainfall
0,2.3,39.130435,6.328723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076977
1,2.426987,30.106655,6.329144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076977
2,2.45,14.285714,6.329567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040701
3,2.3,8.695652,6.329989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040701
4,6.2,75.806452,6.375113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040701


In [48]:
df = reg_harvey.copy()

#df = df.drop("HISTO_NODATA", axis=1)
#df = df[df["Vmax"]>20]

print(df.shape)

# Separate independent and dependent variables
X = df.drop('outage', axis=1)  # Independent variables
y = df['outage']  # Dependent variable

# Standardize independent variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create and fit the regression model
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

# Print the model coefficients
print("Model Coefficients:")
for feature, coef in zip(X.columns, reg_model.coef_):
    print(feature, ":", coef)

# Predict on the test set
y_pred = reg_model.predict(X_test)

# Calculate R2 score on the test set
r2 = r2_score(y_test, y_pred)

# Print the R2 score
print("R2 score on the test set:", r2)

(8523, 19)
Model Coefficients:
nightlight_prev : -2.5392011175470177
Vmax : 1.509560787664685
HISTO_0 : -0.14495717264129845
HISTO_1 : -0.16631792768026726
HISTO_3 : 0.10018732170831554
HISTO_4 : -0.16218582169821533
HISTO_5 : -3175494915226.4526
HISTO_6 : -5797541240.207974
HISTO_7 : -0.31168483415655185
HISTO_8 : -0.2355390095131349
HISTO_9 : -0.3902606442813406
HISTO_10 : 0.10669500110989763
HISTO_14 : -0.02446553404216774
HISTO_15 : -0.24339579859484617
HISTO_16 : 0.10779812871557838
HISTO_17 : 0.051587971918983615
HISTO_18 : 0.44351596870126603
Rainfall : 0.10885033541985445
R2 score on the test set: -1.9859974547191664e+23
