## Creating features for AIS 

In [None]:

#Feature1: syncError

# Step 1: Compute SyncError_TS
combined_data["SyncError_TS"] = (combined_data["Ts"] - combined_data["Slotnumber"]).abs()

# Step 2: Compute total counts (including NaN)
total_records = len(combined_data)  # Total row number
syncerror_zero_count = (combined_data["SyncError_TS"] == 0).sum()
syncerror_neither_zero_nan_count = (combined_data["SyncError_TS"] > 0).sum()
nan_count = combined_data["SyncError_TS"].isna().sum()

# Calculate proportions
syncerror_zero_ratio = syncerror_zero_count / total_records
syncerror_neither_zero_nan_ratio = syncerror_neither_zero_nan_count / total_records
nan_ratio = nan_count / total_records

# Output results
print('0 ratio:', syncerror_zero_ratio)
print('Neither 0 nor NaN ratio:', syncerror_neither_zero_nan_ratio)
print('NaN ratio:', nan_ratio)

display(combined_data.head(10)) # skip if don't want to check the data frame


0 ratio: 0.7204504598038175
Neither 0 nor NaN ratio: 0.2795483992909654
NaN ratio: 1.1409052170172857e-06


Unnamed: 0,Channelnumber,Id,Navstatus,Slotoffset,Slotnumber,Sto,Slotincrement,Keepflag,Ts,Sog,Specialmanoeuvre,Toa,Difftoa,old_slot_number,SyncError_TS
0,1,1,8.0,,1980.0,2.0,0.0,,1980,0.0,0.0,0.0,0.0,,0.0
1,1,1,8.0,2250.0,1980.0,2.0,0.0,,1980,0.0,0.0,0.0,0.0,1980.0,0.0
2,2,1,7.0,2250.0,1980.0,6.0,0.0,,1980,0.1,0.0,0.003326,0.0,1980.0,0.0
3,1,1,15.0,2299.0,2029.0,3.0,0.0,,2029,0.0,0.0,1.318451,0.0,1980.0,0.0
4,1,1,0.0,2258.0,2037.0,3.0,0.0,,2037,11.7,0.0,1.518524,0.0,2029.0,0.0
5,2,1,0.0,2258.0,2045.0,6.0,0.0,,2045,0.0,0.0,1.740845,0.0,2037.0,0.0
6,2,1,0.0,2250.0,2045.0,6.0,0.0,,2045,0.0,0.0,1.740845,0.0,2045.0,0.0
7,1,1,15.0,2277.0,2072.0,3.0,0.0,,2072,7.2,0.0,2.459625,0.0,2045.0,0.0
8,1,1,5.0,2253.0,2075.0,3.0,0.0,,2075,7.0,0.0,2.540253,0.0,2072.0,0.0
9,1,1,0.0,2272.0,2097.0,6.0,0.0,,2097,9.4,0.0,3.119324,0.0,2075.0,0.0


In [78]:
# feature 2: Difference between nominal Difftoa and actual Difftoa

# Define the nominal reporting interval function based on the table
def calculate_nominal_interval(nav_status, sog):
    if nav_status == 1 and sog <= 3:  # Anchor and speed <= 3 knots
        return 180  # 3 minutes in seconds
    elif nav_status == 1 and sog > 3:  # Anchor and speed > 3 knots
        return 10  # 10 seconds
    elif nav_status == 5 and sog <= 3:  # Moored and speed <= 3 knots
        return 180  # 3 minutes in seconds
    elif nav_status == 5 and sog > 3:  # Moored and speed > 3 knots
        return 10  # 10 seconds
    elif sog <= 14:  # Speed 0-14 knots
        return 10  # 10 seconds
    elif sog <= 14 and nav_status not in [1, 5]:  # Speed 0-14 knots and changing course
        return 3.333  # 3 1/3 seconds
    elif 14 < sog <= 23:  # Speed 14-23 knots
        return 6  # 6 seconds
    elif 14 < sog <= 23 and nav_status not in [1, 5]:  # Speed 14-23 knots and changing course
        return 2  # 2 seconds
    elif sog > 23:  # Speed > 23 knots
        return 2  # 2 seconds
    else:
        return np.nan  # Undefined case

# Apply the function to calculate Nominal_Interval for each row
combined_data["Nominal_Interval"] = combined_data.apply(
    lambda row: calculate_nominal_interval(row["Navstatus"], row["Sog"]), axis=1
)

# Calculate Interval_Bias as the difference between Actual_Interval and Nominal_Interval
combined_data["Interval_Bias"] = combined_data["Difftoa"] - combined_data["Nominal_Interval"]

combined_data = combined_data.drop(columns= ['Nominal_Interval'])
combined_data['SyncError_TS'] = combined_data['SyncError_TS'].fillna(0)



display(combined_data.head(20)) # skip if don't want to check the data frame
print('Number of NaN in interval bias:', combined_data['Interval_Bias'].isna().sum())
print('Number of NaN in SyncError_TS:', combined_data['SyncError_TS'].isna().sum())


Unnamed: 0,Channelnumber,Id,Navstatus,Slotoffset,Slotnumber,Sto,Slotincrement,Keepflag,Ts,Sog,Specialmanoeuvre,Toa,Difftoa,old_slot_number,SyncError_TS,Interval_Bias
0,1,1,8.0,,1980.0,2.0,0.0,,1980,0.0,0.0,0.0,0.0,,0.0,-10.0
1,1,1,8.0,2250.0,1980.0,2.0,0.0,,1980,0.0,0.0,0.0,0.0,1980.0,0.0,-10.0
2,2,1,7.0,2250.0,1980.0,6.0,0.0,,1980,0.1,0.0,0.003326,0.0,1980.0,0.0,-10.0
3,1,1,15.0,2299.0,2029.0,3.0,0.0,,2029,0.0,0.0,1.318451,0.0,1980.0,0.0,-10.0
4,1,1,0.0,2258.0,2037.0,3.0,0.0,,2037,11.7,0.0,1.518524,0.0,2029.0,0.0,-10.0
5,2,1,0.0,2258.0,2045.0,6.0,0.0,,2045,0.0,0.0,1.740845,0.0,2037.0,0.0,-10.0
6,2,1,0.0,2250.0,2045.0,6.0,0.0,,2045,0.0,0.0,1.740845,0.0,2045.0,0.0,-10.0
7,1,1,15.0,2277.0,2072.0,3.0,0.0,,2072,7.2,0.0,2.459625,0.0,2045.0,0.0,-10.0
8,1,1,5.0,2253.0,2075.0,3.0,0.0,,2075,7.0,0.0,2.540253,0.0,2072.0,0.0,-10.0
9,1,1,0.0,2272.0,2097.0,6.0,0.0,,2097,9.4,0.0,3.119324,0.0,2075.0,0.0,-10.0


Number of NaN in interval bias: 0
Number of NaN in SyncError_TS: 0


In [79]:
#feature 3: the use of new time slot under special maneuver 


# Step 1: selection (id=3 and SpecialManeuver=2)
special_operation_data = combined_data[
    (combined_data["Id"] == 3) | (combined_data["Specialmanoeuvre"] == 2 )
]
print(len(special_operation_data))
# Step 2: create bew feature which I name it as NewTimeSlot_Used
special_operation_data["NewTimeSlot_Used"] = (
    (combined_data["Slotincrement"] > 0) &  # SlotIncrement effective
    (combined_data["Keepflag"] == 1)       # KeepFlag = 1
)

# Step 3: add this feature in my dataframe
combined_data["NewTimeSlot_Used"] = special_operation_data["NewTimeSlot_Used"]

display(combined_data.head(10))
print('Number of NaN in NewTimeSlot_Used:', combined_data["NewTimeSlot_Used"].isna().sum())
print(combined_data['NewTimeSlot_Used'].isna().sum(), len(combined_data),combined_data['NewTimeSlot_Used'].isna().sum()/len(combined_data) )

# count the number of True and False in this column
true_count = combined_data["NewTimeSlot_Used"].sum() 
false_count = combined_data["NewTimeSlot_Used"].shape[0] - combined_data["NewTimeSlot_Used"].isna().sum() - true_count

# output the results
print("Number of True in NewTimeSlot_Used:", true_count)
print("Number of False in NewTimeSlot_Used:", false_count)



386168


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  special_operation_data["NewTimeSlot_Used"] = (


Unnamed: 0,Channelnumber,Id,Navstatus,Slotoffset,Slotnumber,Sto,Slotincrement,Keepflag,Ts,Sog,Specialmanoeuvre,Toa,Difftoa,old_slot_number,SyncError_TS,Interval_Bias,NewTimeSlot_Used
0,1,1,8.0,,1980.0,2.0,0.0,,1980,0.0,0.0,0.0,0.0,,0.0,-10.0,
1,1,1,8.0,2250.0,1980.0,2.0,0.0,,1980,0.0,0.0,0.0,0.0,1980.0,0.0,-10.0,
2,2,1,7.0,2250.0,1980.0,6.0,0.0,,1980,0.1,0.0,0.003326,0.0,1980.0,0.0,-10.0,
3,1,1,15.0,2299.0,2029.0,3.0,0.0,,2029,0.0,0.0,1.318451,0.0,1980.0,0.0,-10.0,
4,1,1,0.0,2258.0,2037.0,3.0,0.0,,2037,11.7,0.0,1.518524,0.0,2029.0,0.0,-10.0,
5,2,1,0.0,2258.0,2045.0,6.0,0.0,,2045,0.0,0.0,1.740845,0.0,2037.0,0.0,-10.0,
6,2,1,0.0,2250.0,2045.0,6.0,0.0,,2045,0.0,0.0,1.740845,0.0,2045.0,0.0,-10.0,
7,1,1,15.0,2277.0,2072.0,3.0,0.0,,2072,7.2,0.0,2.459625,0.0,2045.0,0.0,-10.0,
8,1,1,5.0,2253.0,2075.0,3.0,0.0,,2075,7.0,0.0,2.540253,0.0,2072.0,0.0,-10.0,
9,1,1,0.0,2272.0,2097.0,6.0,0.0,,2097,9.4,0.0,3.119324,0.0,2075.0,0.0,-10.0,


Number of NaN in NewTimeSlot_Used: 2243323
2243323 2629491 0.853139638051623
Number of True in NewTimeSlot_Used: 33550
Number of False in NewTimeSlot_Used: 352618


In [80]:
# Feature: STO distribution
# If STO is within [4, 7], deviation is 0. Otherwise, calculate the absolute deviation from the nearest bound (4 or 7).

def calculate_sto_deviation(sto_value):
    if 4 <= sto_value <= 7:
        return 0
    elif sto_value < 4:
        return abs(sto_value - 4)
    else:
        return abs(sto_value - 7)

# Apply the function to create a new feature
combined_data["STO_Deviation"] = combined_data["Sto"].apply(calculate_sto_deviation)

# Step 2: Add this feature to the feature_data
combined_data["STO_Deviation"] = combined_data["STO_Deviation"]
combined_data.drop(columns= 'STO_Deviation')

display(combined_data.head(10))
print('NaN ratio IN STO_DEVIATION:', combined_data['STO_Deviation'].isna().sum()/ len(combined_data))

Unnamed: 0,Channelnumber,Id,Navstatus,Slotoffset,Slotnumber,Sto,Slotincrement,Keepflag,Ts,Sog,Specialmanoeuvre,Toa,Difftoa,old_slot_number,SyncError_TS,Interval_Bias,NewTimeSlot_Used,STO_Deviation
0,1,1,8.0,,1980.0,2.0,0.0,,1980,0.0,0.0,0.0,0.0,,0.0,-10.0,,2.0
1,1,1,8.0,2250.0,1980.0,2.0,0.0,,1980,0.0,0.0,0.0,0.0,1980.0,0.0,-10.0,,2.0
2,2,1,7.0,2250.0,1980.0,6.0,0.0,,1980,0.1,0.0,0.003326,0.0,1980.0,0.0,-10.0,,0.0
3,1,1,15.0,2299.0,2029.0,3.0,0.0,,2029,0.0,0.0,1.318451,0.0,1980.0,0.0,-10.0,,1.0
4,1,1,0.0,2258.0,2037.0,3.0,0.0,,2037,11.7,0.0,1.518524,0.0,2029.0,0.0,-10.0,,1.0
5,2,1,0.0,2258.0,2045.0,6.0,0.0,,2045,0.0,0.0,1.740845,0.0,2037.0,0.0,-10.0,,0.0
6,2,1,0.0,2250.0,2045.0,6.0,0.0,,2045,0.0,0.0,1.740845,0.0,2045.0,0.0,-10.0,,0.0
7,1,1,15.0,2277.0,2072.0,3.0,0.0,,2072,7.2,0.0,2.459625,0.0,2045.0,0.0,-10.0,,1.0
8,1,1,5.0,2253.0,2075.0,3.0,0.0,,2075,7.0,0.0,2.540253,0.0,2072.0,0.0,-10.0,,1.0
9,1,1,0.0,2272.0,2097.0,6.0,0.0,,2097,9.4,0.0,3.119324,0.0,2075.0,0.0,-10.0,,0.0


NaN ratio IN STO_DEVIATION: 0.0


In [81]:
# feature 5: slotnumber difference
print('ratio of NaN in Slotnumbers:', combined_data['Slotnumber'].isna().sum() / len(combined_data))
def calculate_slot_number_difference(slot_numbers):
    differences = []
    for i in range(len(slot_numbers)):
        if i == 0:
            # not applicable for the first row
            differences.append(None)
        else:
            difference = abs(slot_numbers[i] - slot_numbers[i - 1])
            differences.append(difference)
    return differences


combined_data["SlotChange"] = calculate_slot_number_difference(combined_data["Slotnumber"].tolist())
    
# add this new feature back into my data frame
combined_data["SlotChange"] = combined_data["SlotChange"]
display(combined_data.head(10))
    
    



ratio of NaN in Slotnumbers: 1.1409052170172857e-06


Unnamed: 0,Channelnumber,Id,Navstatus,Slotoffset,Slotnumber,Sto,Slotincrement,Keepflag,Ts,Sog,Specialmanoeuvre,Toa,Difftoa,old_slot_number,SyncError_TS,Interval_Bias,NewTimeSlot_Used,STO_Deviation,SlotChange
0,1,1,8.0,,1980.0,2.0,0.0,,1980,0.0,0.0,0.0,0.0,,0.0,-10.0,,2.0,
1,1,1,8.0,2250.0,1980.0,2.0,0.0,,1980,0.0,0.0,0.0,0.0,1980.0,0.0,-10.0,,2.0,0.0
2,2,1,7.0,2250.0,1980.0,6.0,0.0,,1980,0.1,0.0,0.003326,0.0,1980.0,0.0,-10.0,,0.0,0.0
3,1,1,15.0,2299.0,2029.0,3.0,0.0,,2029,0.0,0.0,1.318451,0.0,1980.0,0.0,-10.0,,1.0,49.0
4,1,1,0.0,2258.0,2037.0,3.0,0.0,,2037,11.7,0.0,1.518524,0.0,2029.0,0.0,-10.0,,1.0,8.0
5,2,1,0.0,2258.0,2045.0,6.0,0.0,,2045,0.0,0.0,1.740845,0.0,2037.0,0.0,-10.0,,0.0,8.0
6,2,1,0.0,2250.0,2045.0,6.0,0.0,,2045,0.0,0.0,1.740845,0.0,2045.0,0.0,-10.0,,0.0,0.0
7,1,1,15.0,2277.0,2072.0,3.0,0.0,,2072,7.2,0.0,2.459625,0.0,2045.0,0.0,-10.0,,1.0,27.0
8,1,1,5.0,2253.0,2075.0,3.0,0.0,,2075,7.0,0.0,2.540253,0.0,2072.0,0.0,-10.0,,1.0,3.0
9,1,1,0.0,2272.0,2097.0,6.0,0.0,,2097,9.4,0.0,3.119324,0.0,2075.0,0.0,-10.0,,0.0,22.0


In [82]:
#feature 6: channel alternation
# check whether it is alternating normally(current channelnumber != previous channelnumber)
combined_data['is_alternating'] = (combined_data['Channelnumber'] != combined_data['Channelnumber'].shift(1)).astype(int)

# check the results
print(combined_data[['Channelnumber', 'is_alternating']].head(10))


   Channelnumber  is_alternating
0              1               1
1              1               0
2              2               1
3              1               1
4              1               0
5              2               1
6              2               0
7              1               1
8              1               0
9              1               0
