## Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
from glob import glob
import json
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Read data and format

In [2]:
df = pd.read_csv("drive/MyDrive/data/full_data_clean.csv", low_memory=False, index_col=0)
official_test_df = pd.read_csv("drive/MyDrive/data/test_processed.csv", index_col=0)

In [3]:
df["Event Date"] = pd.to_datetime(df["Event Date"]).astype(int)
official_test_df["Event Date"] = pd.to_datetime(official_test_df["Event Date"]).astype(int)
region_test = official_test_df.pop("Region")

  """Entry point for launching an IPython kernel.
  


**Recover missing coordinates**

In [4]:
df["Latitude"].isna().sum()

138978

In [5]:
coordinates = {}
for mapping in glob("drive/MyDrive/data/full_data_location_maps/locations_batch_*.json"):
  with open(mapping, "r") as f:
    batch = json.load(f)
    coordinates.update(batch)


In [6]:
df.loc[[int(x) for x in coordinates.keys()], ["Latitude", "Longitude"]] = [x["coordinates"] for x in coordinates.values()]

In [7]:
df["Latitude"].isna().sum()

129239

In [8]:
df.dropna(subset=["Total Fatal Injuries"], inplace=True)

## Check that train dataset and competition test df have the same categories

In [9]:
missing_values = {}
for col in official_test_df.columns:
  if col == "Region":
    continue
  values = official_test_df[col].unique()
  new_values = [value for value in values if value not in df[col].values]
  print(f"For col {col} there are {len(new_values)} missing")

  missing_values[col] = new_values

For col Event Date there are 0 missing
For col Location there are 385 missing
For col Country there are 1 missing
For col Latitude there are 186 missing
For col Longitude there are 186 missing
For col Airport Code there are 5 missing
For col Airport Name there are 2 missing
For col Aircraft Damage there are 0 missing
For col Aircraft Category there are 1 missing
For col Make there are 19 missing
For col Amateur Built there are 0 missing
For col Number of Engines there are 0 missing
For col Engine Type there are 0 missing
For col Schedule there are 1 missing
For col Purpose of Flight there are 1 missing
For col Air Carrier there are 19 missing
For col Total Serious Injuries there are 0 missing
For col Total Minor Injuries there are 0 missing
For col Total Uninjured there are 0 missing
For col Weather Condition there are 3 missing
For col Broad Phase of Flight there are 1 missing
For col Avg_Make_Passenger_Capacity there are 0 missing


In [10]:
missing_values["Location"][:5]

[' CEDAR RAPIDS, IA ',
 ' STIGLER, OK ',
 ' NEWPORT BEACH, CA ',
 ' SUNSET HILLS, MO ',
 ' COLDWATER, OH ']

In [11]:
missing_values["Country"]

[nan]

In [12]:
missing_values["Airport Code"]

[nan, '36', '85', '54', '63']

In [13]:
missing_values["Airport Name"]

[nan, ' EGLIN, AFB ']

In [14]:
missing_values["Aircraft Category"]

[nan]

In [15]:
missing_values["Make"]

['PIPER',
 'CESSNA',
 'MOONEY',
 'BEECH',
 'LOCKHEED',
 'BELL',
 'BOEING',
 'AERO COMMANDER',
 'DOUGLAS',
 'GRUMMAN',
 'AEROSPATIALE',
 'LEARJET',
 'GRUMMAN AMERICAN',
 'de Havilland',
 'MITSUBISHI',
 'BELLANCA',
 'ROCKWELL',
 'McDonnell Douglas',
 'MBB']

In [16]:
missing_values["Air Carrier"]

[nan,
 ' LADCO, INC. ',
 ' GALAXY AIRLINES, INC. ',
 ' SUSQUEHANNA AIRLINES, INC. ',
 ' WINGS WEST AIRLINES, INC. ',
 ' AVIATION ENTERPRISES,INC. ',
 ' ALASKA AIR SERVICE, INC. ',
 ' (DBA: RACO HELCIOPTERS, CORP) ',
 ' AIR CONTINENTAL, INC. ',
 ' WESTCON ASSOCIATES, INC. (DBA: GOLDEN EAGLE) ',
 ' CRAIG HUNTINGTON (DBA: HELICOPTER WEST, INC.[EMS]) ',
 ' WATERFRONT AIRWAYS,INC. ',
 ' TYEE AIRLINES, INC. ',
 ' BASCO FLYING SERVICE, INC. ',
 ' CARIB AIR SERVICE, INC. ',
 ' PETROLEUM HELICOPTERS, INC. ',
 ' BAKER AVIATION, INC. ',
 ' CONTINENTAL HELICOPTERS, INC. ',
 ' AIR FLORIDA, INC ']

In [17]:
missing_values["Weather Condition"]

['Clear Skies', 'Not Clear.  Use Instruments', 'Unknown']

In [18]:
official_test_df["Weather Condition"].unique()

array(['Clear Skies', 'Not Clear.  Use Instruments', 'Unknown'],
      dtype=object)

In [19]:
df["Weather Condition"].unique()

array(['VMC', 'IMC', nan, 'UNK'], dtype=object)

In [20]:
# Change 1:
official_test_df["Location"] = official_test_df["Location"].str.strip()
# Change 2:
df.loc[df["Airport Code"] == '036', "Airport Code"] = "36"
df.loc[df["Airport Code"] == '085', "Airport Code"] = "85"
df.loc[df["Airport Code"] == '054', "Airport Code"] = "54"
df.loc[df["Airport Code"] == '063', "Airport Code"] = "63"
# Change 3:
official_test_df.loc[official_test_df["Airport Name"] == " EGLIN, AFB ", "Airport Name"] = "EGLIN, AFB"
# Change 4:
for val in missing_values["Make"]:
  official_test_df.loc[official_test_df["Make"] == val, "Make"] = val.capitalize()
# Change 5:
for val in missing_values["Air Carrier"]:
  if not isinstance(val, str):
    continue
  official_test_df.loc[official_test_df["Air Carrier"] == val, "Air Carrier"] = val.strip()
# Change 6:
weather_condition_map = {'Clear Skies': "VMC", 'Not Clear.  Use Instruments': "IMC", 'Unknown': "UNK"}
official_test_df["Weather Condition"] = official_test_df["Weather Condition"].apply(lambda x: weather_condition_map[x])
df["Weather Condition"].fillna("UNK", inplace=True)

In [21]:
missing_values = {}
for col in official_test_df.columns:
  if col == "Region":
    continue
  values = official_test_df[col].unique()
  new_values = [value for value in values if value not in df[col].values]
  print(f"For col {col} there are {len(new_values)} missing")

  missing_values[col] = new_values

For col Event Date there are 0 missing
For col Location there are 0 missing
For col Country there are 1 missing
For col Latitude there are 186 missing
For col Longitude there are 186 missing
For col Airport Code there are 1 missing
For col Airport Name there are 1 missing
For col Aircraft Damage there are 0 missing
For col Aircraft Category there are 1 missing
For col Make there are 0 missing
For col Amateur Built there are 0 missing
For col Number of Engines there are 0 missing
For col Engine Type there are 0 missing
For col Schedule there are 1 missing
For col Purpose of Flight there are 1 missing
For col Air Carrier there are 1 missing
For col Total Serious Injuries there are 0 missing
For col Total Minor Injuries there are 0 missing
For col Total Uninjured there are 0 missing
For col Weather Condition there are 0 missing
For col Broad Phase of Flight there are 1 missing
For col Avg_Make_Passenger_Capacity there are 0 missing


## Encode categorical variables and turn data into sparse matrix for SKLearn model

In [22]:
categorical_columns = df.describe(include="O").columns
encoder = OneHotEncoder()

encoder.fit(df[categorical_columns])#.reshape(-1, 1))

OneHotEncoder()

In [23]:
random_generator = np.random.default_rng(seed=1)
train_idx = random_generator.choice(df.index, size=int(df.shape[0]*0.875), replace=False, shuffle=False)

In [24]:
train = df.loc[train_idx, :]
test = df.loc[df.index.difference(train_idx), :]

In [25]:
y_train = train.pop("Total Fatal Injuries")
y_test = test.pop("Total Fatal Injuries")

In [26]:
train_features = csr_matrix(train[train.describe(exclude="O").columns].fillna(0))
train_encodings = encoder.transform(train[categorical_columns])

train_features = hstack((train_features, train_encodings))

In [27]:
test_features = csr_matrix(test[test.describe(exclude="O").columns].fillna(0))
test_encodings = encoder.transform(test[categorical_columns])

test_features = hstack((test_features, test_encodings))

## Train and test model

In [None]:
rforest = RandomForestRegressor()

In [None]:
rforest.fit(train_features, y_train)

RandomForestRegressor()

In [None]:
filename = 'drive/MyDrive/october-contest/sklearn_rforest.sav'
pickle.dump(rforest, open(filename, 'wb'))

In [None]:
print(f"RMSE on Test set is: {(sum((rforest.predict(test_features) - y_test)**2) / len(y_test))**0.5}")

RMSE on Test set is: 3.1324016779025396


## Test on official test split

In [30]:
features_test_official = csr_matrix(official_test_df[official_test_df.describe(exclude="O").columns].fillna(0))
encodings_test_official = encoder.transform(official_test_df[categorical_columns])

features_test_official = hstack((features_test_official, encodings_test_official))

In [None]:
predictions = rforest.predict(features_test_official)

In [None]:
test_predictions = pd.DataFrame({"Total Fatal Injuries": predictions})

In [None]:
test_predictions["ID"] =  test_predictions.index + 1
test_predictions.set_index("ID", inplace=True)

## Send to Kaggle

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle -q

[?25l[K     |█████▋                          | 10 kB 29.8 MB/s eta 0:00:01[K     |███████████▏                    | 20 kB 35.4 MB/s eta 0:00:01[K     |████████████████▊               | 30 kB 27.3 MB/s eta 0:00:01[K     |██████████████████████▎         | 40 kB 16.6 MB/s eta 0:00:01[K     |███████████████████████████▉    | 51 kB 19.1 MB/s eta 0:00:01[K     |████████████████████████████████| 58 kB 5.6 MB/s 
[?25h  Building wheel for kaggle (setup.py) ... [?25l[?25hdone


In [None]:
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/

In [None]:
test_predictions.to_csv("submission2.csv")
test_predictions.to_csv("drive/MyDrive/october-contest/submission2.csv")

In [None]:
!kaggle competitions submit -c october-contest -f submission2.csv -m "submission2"

100% 3.60k/3.60k [00:00<00:00, 7.90kB/s]
Successfully submitted to BT Raptor: October Contest

## Reload model for future use

In [28]:
filename = 'drive/MyDrive/october-contest/sklearn_rforest.sav'
saved_forest = pickle.load(open(filename, 'rb'))

In [29]:
print(f"RMSE on Test set is: {(sum((saved_forest.predict(test_features) - y_test)**2) / len(y_test))**0.5}")

RMSE on Test set is: 3.1219979545043497


In [32]:
predictions2 = saved_forest.predict(features_test_official)

In [33]:
test_predictions2 = pd.DataFrame({"Total Fatal Injuries": predictions2})

In [34]:
test_predictions2["ID"] =  test_predictions2.index + 1
test_predictions2.set_index("ID", inplace=True)

In [35]:
test_predictions2.to_csv("submission2.csv")
test_predictions2.to_csv("drive/MyDrive/october-contest/submission2.csv")