In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime
from tqdm.auto import tqdm
import numpy as np
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)
tqdm.pandas()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
data_path = Path(r"D:\Productivity\Studying\PMLDL_A2\data\raw\ml-100k")

# Explore u.data

In [3]:
df = pd.read_csv(data_path / "u.data", sep="\t", header=None)
df.columns = ["user_id", "item_id", "rating", "timestamp"]
# unix seconds since 1/1/1970 UTC conversion; alternative: datetime.fromtimestamp(881250949)
df["timestamp"] = df["timestamp"].apply(lambda x: pd.Timestamp(x, unit="s"))
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16
...,...,...,...,...
99995,880,476,3,1997-11-22 05:10:44
99996,716,204,5,1997-11-17 19:39:03
99997,276,1090,1,1997-09-20 22:49:55
99998,13,225,2,1997-12-17 22:52:36


In [4]:
df.user_id.value_counts()

user_id
405    737
655    685
13     636
450    540
276    518
      ... 
441     20
36      20
812     20
895     20
93      20
Name: count, Length: 943, dtype: int64

Users have never been asked to rate the same item twice

In [5]:
df[["user_id", "item_id"]].duplicated().any()

False

# Explore and preprocess u.item

The dataset doesn't correspond the schema provided. There's an empty column and no second "release" column. Let's delete it. We won't be using the links either, drop them too.

In [6]:
item_df = pd.read_csv(data_path / "u.item", sep= "|", encoding='latin-1', header=None)
item_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Drop excess

In [7]:
item_df = item_df.drop(columns=[0, 3, 4])
item_df = item_df.dropna()
item_df

Unnamed: 0,1,2,5,6,7,8,9,10,11,12,...,14,15,16,17,18,19,20,21,22,23
0,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,Mat' i syn (1997),06-Feb-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,B. Monkey (1998),06-Feb-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,Sliding Doors (1998),01-Jan-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,You So Crazy (1994),01-Jan-1994,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split release years and titles. Encode them

In [8]:
import spacy 
nlp = spacy.load("en_core_web_sm")
tokenizer = nlp.tokenizer

def split_title(title):
    *actural_title, release = title.split()
    release = release.removeprefix("(").removesuffix(")")
    release = int(release) if release.isdigit() else np.nan
    actural_title = " ".join(actural_title)
    return actural_title, release

def embed(title):
    return nlp(tokenizer(title)).vector

In [9]:
item_df[["title", "year1"]] = item_df[1].progress_apply(split_title).progress_apply(pd.Series)
item_df.dropna(inplace=True)
item_df.drop(columns=[1], inplace=True)

  0%|          | 0/1681 [00:00<?, ?it/s]

  0%|          | 0/1681 [00:00<?, ?it/s]

In [10]:
embeddings = item_df["title"].progress_apply(embed).parallel_apply(pd.Series)
item_df.drop(columns=["title"], inplace=True)
item_df["year1"] = (item_df["year1"]-item_df["year1"].min())/(item_df["year1"].max()-item_df["year1"].min())

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

In [11]:
item_df = pd.concat([item_df, embeddings], axis=1, ignore_index=True)
item_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,107,108,109,110,111,112,113,114,115,116
0,01-Jan-1995,0,0,0,1,1,1,0,0,0,...,0.880769,-0.020001,-1.334084,0.297919,-0.228033,-0.496077,1.755286,0.080202,0.024104,0.970577
1,01-Jan-1995,0,1,1,0,0,0,0,0,0,...,0.549181,-0.129407,-0.533101,-0.451775,-1.237758,-0.193940,2.175473,-0.205619,0.169546,1.099319
2,01-Jan-1995,0,0,0,0,0,0,0,0,0,...,-0.815718,0.151895,-0.223589,0.331158,-0.593922,-0.616442,2.066849,0.362578,0.192173,0.966223
3,01-Jan-1995,0,1,0,0,0,1,0,0,1,...,-0.018125,-0.712686,-1.003018,-0.109466,-0.086747,-0.519322,1.551831,-0.361525,0.390277,1.138312
4,01-Jan-1995,0,0,0,0,0,0,1,0,1,...,-0.580432,0.079627,-0.244669,-0.329614,-1.106851,-0.048058,2.611301,0.377608,0.864265,1.025843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,06-Feb-1998,0,0,0,0,0,0,0,0,1,...,0.597517,0.110161,0.180132,0.099105,-0.327815,0.481098,0.728221,-0.377583,0.280629,1.150406
1678,06-Feb-1998,0,0,0,0,0,0,0,0,0,...,0.497504,0.127760,-0.877288,1.068902,0.571075,-0.393900,2.085971,0.534923,0.119280,0.553088
1679,01-Jan-1998,0,0,0,0,0,0,0,0,1,...,0.478829,-0.190052,-0.173051,-0.735751,-0.606304,0.704382,1.427145,0.139404,0.325960,0.890898
1680,01-Jan-1994,0,0,0,0,0,1,0,0,0,...,-0.314565,0.014890,0.444865,0.629190,1.350931,-0.945164,1.289780,-0.048561,-0.583076,0.577224


## Encode video release dates

In [12]:
from dateutil import parser

def encode_date(date):
    date = parser.parse(date)
    return np.sin(date.day), np.cos(date.day), np.sin(date.month), np.cos(date.month), date.year


date_features = item_df[0].progress_apply(encode_date).parallel_apply(pd.Series)
date_features[4] = (date_features[4]-date_features[4].min())/(date_features[4].max()-date_features[4].min())
date_features

  0%|          | 0/1680 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=210), Label(value='0 / 210'))), HB…

Unnamed: 0,0,1,2,3,4
0,0.841471,0.540302,0.841471,0.540302,0.960526
1,0.841471,0.540302,0.841471,0.540302,0.960526
2,0.841471,0.540302,0.841471,0.540302,0.960526
3,0.841471,0.540302,0.841471,0.540302,0.960526
4,0.841471,0.540302,0.841471,0.540302,0.960526
...,...,...,...,...,...
1677,-0.279415,0.960170,0.909297,-0.416147,1.000000
1678,-0.279415,0.960170,0.909297,-0.416147,1.000000
1679,0.841471,0.540302,0.841471,0.540302,1.000000
1680,0.841471,0.540302,0.841471,0.540302,0.947368


In [13]:
item_df.drop(columns=[0], inplace=True)
item_df = pd.concat([item_df, date_features], axis=1, ignore_index=True)

In [14]:
item_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,120
0,0,0,0,1,1,1,0,0,0,0,...,-0.496077,1.755286,0.080202,0.024104,0.970577,0.841471,0.540302,0.841471,0.540302,0.960526
1,0,1,1,0,0,0,0,0,0,0,...,-0.193940,2.175473,-0.205619,0.169546,1.099319,0.841471,0.540302,0.841471,0.540302,0.960526
2,0,0,0,0,0,0,0,0,0,0,...,-0.616442,2.066849,0.362578,0.192173,0.966223,0.841471,0.540302,0.841471,0.540302,0.960526
3,0,1,0,0,0,1,0,0,1,0,...,-0.519322,1.551831,-0.361525,0.390277,1.138312,0.841471,0.540302,0.841471,0.540302,0.960526
4,0,0,0,0,0,0,1,0,1,0,...,-0.048058,2.611301,0.377608,0.864265,1.025843,0.841471,0.540302,0.841471,0.540302,0.960526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0,0,0,0,0,0,0,0,1,0,...,0.481098,0.728221,-0.377583,0.280629,1.150406,-0.279415,0.960170,0.909297,-0.416147,1.000000
1678,0,0,0,0,0,0,0,0,0,0,...,-0.393900,2.085971,0.534923,0.119280,0.553088,-0.279415,0.960170,0.909297,-0.416147,1.000000
1679,0,0,0,0,0,0,0,0,1,0,...,0.704382,1.427145,0.139404,0.325960,0.890898,0.841471,0.540302,0.841471,0.540302,1.000000
1680,0,0,0,0,0,1,0,0,0,0,...,-0.945164,1.289780,-0.048561,-0.583076,0.577224,0.841471,0.540302,0.841471,0.540302,0.947368


We won't be using u.genres as these are included into u.item dataset

# Explore and preprocess u.user 

In [15]:
df_user = pd.read_csv(data_path / "u.user", sep= "|", encoding='latin-1', header=None, names=["id", "age", "gender", "occupation", "zip_code"])
df_user

Unnamed: 0,id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


## Process occupation
There're only 21 occupation types, we can encode them with ohe

In [16]:
df_occupation = pd.read_csv(data_path / "u.occupation", sep= "|", encoding='latin-1', header=None)

occupation_dtype = pd.CategoricalDtype(categories=df_occupation[0].to_list())
df_user["occupation"] = pd.Series(df_user["occupation"], dtype=occupation_dtype)
df_occupation

Unnamed: 0,0
0,administrator
1,artist
2,doctor
3,educator
4,engineer
5,entertainment
6,executive
7,healthcare
8,homemaker
9,lawyer


In [17]:
df_user["occupation"]

0         technician
1              other
2             writer
3         technician
4              other
           ...      
938          student
939    administrator
940          student
941        librarian
942          student
Name: occupation, Length: 943, dtype: category
Categories (21, object): ['administrator', 'artist', 'doctor', 'educator', ..., 'scientist', 'student', 'technician', 'writer']

In [18]:
df_occupation = pd.get_dummies(df_user["occupation"], dtype=float)
df_occupation.head()

Unnamed: 0,administrator,artist,doctor,educator,engineer,entertainment,executive,healthcare,homemaker,lawyer,...,marketing,none,other,programmer,retired,salesman,scientist,student,technician,writer
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Process zip codes

In [19]:
import pgeocode

nomi = pgeocode.Nominatim('us')

In [20]:
df_geo = df_user.zip_code.parallel_apply(nomi.query_postal_code)
df_geo.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=118), Label(value='0 / 118'))), HB…

Unnamed: 0,postal_code,country_code,place_name,state_name,state_code,county_name,county_code,community_name,community_code,latitude,longitude,accuracy
0,85711,US,Tucson,Arizona,AZ,Pima,19.0,,,32.2127,-110.8829,4.0
1,94043,US,Mountain View,California,CA,Santa Clara,85.0,,,37.4056,-122.0775,4.0
2,32067,US,Orange Park,Florida,FL,Clay,19.0,,,30.1661,-81.7065,4.0
3,43537,US,Maumee,Ohio,OH,Lucas,95.0,,,41.5817,-83.6628,4.0
4,15213,US,Pittsburgh,Pennsylvania,PA,Allegheny,3.0,,,40.444,-79.9552,4.0


Many states are undersampled, hence counties and cities must be so too. Taking coordinates instead

In [21]:
df_geo["state_code"].value_counts().sort_values(ascending=True).head(10)

state_code
SD    1
WY    1
AR    1
ME    2
HI    2
ND    2
NM    2
MT    2
WV    3
RI    3
Name: count, dtype: int64

In [22]:
coordinates = df_geo[["latitude", "longitude"]]

## Process age and gender

Age is kind of a linear feature so minmax it or whatever

In [23]:
df_user.age = (df_user.age-df_user.age.min())/(df_user.age.max()-df_user.age.min())

2 genders is releaving, it's handy to ohe

In [24]:
df_user.gender = df_user.gender.map({"M": 0, "F": 1})

## Assemble u.user

In [25]:
df_user[["latitude", "longitude"]] = coordinates
df_user.drop(columns=["zip_code", "occupation"], inplace = True)

df_user = pd.concat([df_occupation, df_user], axis=1)
df_user.rename(columns={"id": "user_id"}, inplace=True)
df_user = df_user.set_index("user_id", drop=True)
df_user

Unnamed: 0_level_0,administrator,artist,doctor,educator,engineer,entertainment,executive,healthcare,homemaker,lawyer,...,retired,salesman,scientist,student,technician,writer,age,gender,latitude,longitude
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.257576,0,32.2127,-110.8829
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.696970,1,37.4056,-122.0775
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.242424,0,30.1661,-81.7065
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.257576,0,41.5817,-83.6628
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.393939,1,40.4440,-79.9552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.287879,1,26.1848,-80.2406
940,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.378788,0,42.3471,-71.1027
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.196970,0,45.5483,-122.8276
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.621212,1,29.4821,-98.4554


# Assemble u.data

In [26]:
item_df["item_id"] = item_df.index + 1
item_df.set_index("item_id", drop=True, inplace=True)
item_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,120
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,1,1,1,0,0,0,0,...,-0.496077,1.755286,0.080202,0.024104,0.970577,0.841471,0.540302,0.841471,0.540302,0.960526
2,0,1,1,0,0,0,0,0,0,0,...,-0.193940,2.175473,-0.205619,0.169546,1.099319,0.841471,0.540302,0.841471,0.540302,0.960526
3,0,0,0,0,0,0,0,0,0,0,...,-0.616442,2.066849,0.362578,0.192173,0.966223,0.841471,0.540302,0.841471,0.540302,0.960526
4,0,1,0,0,0,1,0,0,1,0,...,-0.519322,1.551831,-0.361525,0.390277,1.138312,0.841471,0.540302,0.841471,0.540302,0.960526
5,0,0,0,0,0,0,1,0,1,0,...,-0.048058,2.611301,0.377608,0.864265,1.025843,0.841471,0.540302,0.841471,0.540302,0.960526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0,0,0,0,0,0,0,0,1,0,...,0.481098,0.728221,-0.377583,0.280629,1.150406,-0.279415,0.960170,0.909297,-0.416147,1.000000
1679,0,0,0,0,0,0,0,0,0,0,...,-0.393900,2.085971,0.534923,0.119280,0.553088,-0.279415,0.960170,0.909297,-0.416147,1.000000
1680,0,0,0,0,0,0,0,0,1,0,...,0.704382,1.427145,0.139404,0.325960,0.890898,0.841471,0.540302,0.841471,0.540302,1.000000
1681,0,0,0,0,0,1,0,0,0,0,...,-0.945164,1.289780,-0.048561,-0.583076,0.577224,0.841471,0.540302,0.841471,0.540302,0.947368


In [27]:
df = df.join(item_df, on="item_id").join(df_user, on="user_id")
df.drop(columns=["user_id", "item_id", "timestamp"], inplace=True)
df.dropna(inplace=True)
df

Unnamed: 0,rating,0,1,2,3,4,5,6,7,8,...,retired,salesman,scientist,student,technician,writer,age,gender,latitude,longitude
0,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.636364,0,44.9347,-93.1651
1,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.484848,1,,
2,1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.272727,0,38.2503,-85.7034
3,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.318182,0,40.5384,-105.0547
4,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.606061,0,45.0139,-93.1571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.090909,0,43.6322,-116.2052
99996,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.439394,1,41.0328,-81.2484
99997,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.212121,0,36.9959,-122.0578
99998,2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.606061,0,34.0247,-80.9532
