# Logging of the project

In this notebook you will find all the steps we took to accurately predict the temperature of a nuclear waste canister. 

### **Imports** 

In [72]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

### **Loading of the datasets** 

In [73]:
# Sensor coordinates and material labels, one file per split.
coordinates_train, coordinates_test = (
    pd.read_csv("data/Coordinates_Training.csv"),
    pd.read_csv("data/Coordinates_Test.csv"),
)

# Humidity and pressure readings, one file per split.
humidity_train, humidity_test = (
    pd.read_csv("data/Training_data_humidity.csv"),
    pd.read_csv("data/Test_Time_humidity.csv"),
)
pressure_train, pressure_test = (
    pd.read_csv("data/Training_data_pressure.csv"),
    pd.read_csv("data/Test_Time_pressure.csv"),
)

# Temperature (the quantity this notebook predicts) is only loaded for the training split.
temperature_train = pd.read_csv("data/Training_data_temperature.csv")

### **Visualizing the datasets**

In [74]:
# Ten randomly chosen rows (no seed is set, so the sample differs between runs).
display(coordinates_train.sample(n=10))

# Column dtypes and non-null counts.
coordinates_train.info()

# Distinct material labels; kept as the last expression so the cell renders it.
coordinates_train["Material"].unique()

Unnamed: 0.1,Unnamed: 0,Sensor ID,Index,Material,Coor X [m],Coor Y [m],Coor Z [m],R [m]
91,91,N_92,92,OPA,-7.532021,6.694555,-12.707346,14.771865
327,327,N_328,328,OPA,1.240192,18.991145,-2.635413,2.912641
524,524,N_525,525,OPA,-2.235352,17.118021,0.35416,2.263234
240,240,N_241,241,OPA,-14.345496,30.049331,-5.513969,15.368706
670,670,N_671,671,OPA,2.365877,21.614208,-1.712185,2.920437
102,102,N_103,103,OPA,12.113425,8.824034,-11.806631,16.915425
362,362,N_363,363,OPA,7.456434,7.38664,4.925227,8.936234
381,381,N_382,382,OPA,8.15572,23.966095,13.652734,15.903236
483,483,N_484,484,EDZ,0.491072,38.274912,-1.527516,1.604511
93,93,N_94,94,OPA,-13.808908,6.248573,-9.307314,16.652689


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  900 non-null    int64  
 1   Sensor ID   900 non-null    object 
 2   Index       900 non-null    int64  
 3   Material    900 non-null    object 
 4   Coor X [m]  900 non-null    float64
 5   Coor Y [m]  900 non-null    float64
 6   Coor Z [m]  900 non-null    float64
 7   R [m]       900 non-null    float64
dtypes: float64(4), int64(2), object(2)
memory usage: 56.4+ KB


array(['OPA', 'SHCR', 'GBM', 'EDZ', 'VOID', 'CAN', 'BBLOCK'], dtype=object)

We can see there is no missing data and apparently no false measurements (outliers) in the positions, but some columns are useless: the row numbering, the Sensor ID, and the Index. Because the rows are in ascending order, and as long as the indices match between the files, the name of the sensor and its number don't matter. We also rename the columns to make them easier to type later.

In [75]:
# Short aliases for the coordinate columns (faster to type later on).
new_col_names: dict = {
    "Coor X [m]": "x",
    "Coor Y [m]": "y",
    "Coor Z [m]": "z",
    "R [m]": "r",
}

# Keep only the material and the four coordinate columns (the keys of the
# mapping above, in that order), then rename them. Both the selection and
# rename() return new frames, so the loaded data is not modified in place.
coordinates_train = coordinates_train[["Material", *new_col_names]].rename(columns=new_col_names)
coordinates_test = coordinates_test[["Material", *new_col_names]].rename(columns=new_col_names)

display(coordinates_train.head(5))

Unnamed: 0,Material,x,y,z,r
0,OPA,0.208042,14.436936,-2.875503,2.883019
1,OPA,-8.970832,28.229841,-0.134437,8.971839
2,OPA,-14.289501,6.685726,-10.399048,17.672862
3,OPA,6.114855,2.685645,-3.189981,6.896914
4,OPA,4.048845,48.70859,11.260503,11.966289


In [76]:
# Look at a slice of 30 sensor columns (column positions 100 to 129).
display(humidity_train.iloc[:, 100:130])

# Per-column mean over all rows. The first entry is skipped (the time column,
# per the original note), and columns whose mean is NaN are dropped.
mean_humidity = humidity_train.mean(axis=0).iloc[1:].dropna()

# The spread is computed with np.std, so it is reported as a standard deviation
# (the label previously said "variance", which did not match the computation).
print(f"Global mean: {np.mean(mean_humidity, axis=0)}, standard deviation: {np.std(mean_humidity, axis=0)}")

Unnamed: 0,N_100,N_101,N_102,N_103,N_104,N_105,N_106,N_107,N_108,N_109,...,N_120,N_121,N_122,N_123,N_124,N_125,N_126,N_127,N_128,N_129
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
2,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
5,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
6,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
7,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
8,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
9,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


Global mean: 98.19472180013585, variance: 5.56673083885869


The humidity doesn't seem to contain much information and we will discard it for our first model.

In [77]:
# Ten randomly chosen rows of the pressure readings.
display(pressure_train.sample(n=10))

# Total number of missing cells in the frame (last expression, so it is rendered).
pressure_train.isna().sum().sum()

Unnamed: 0,M.Time[d],N_1,N_2,N_3,N_4,N_5,N_6,N_7,N_8,N_9,...,N_891,N_892,N_893,N_894,N_895,N_896,N_897,N_898,N_899,N_900
2,1558,278.79731,1461.224531,1655.972023,1321.91935,1473.56518,1480.255784,1650.589484,1511.207764,1397.33862,...,-57373.82834,201.4704,395.458253,648.902434,-405.531701,-60416.87545,-771.05471,334.539419,-62924.41589,830.870655
29,5587,-126.443948,1427.4156,1599.832439,1044.165672,1582.56119,1568.662633,1518.937457,1609.704476,1287.358849,...,-32874.04125,-92.627052,206.471809,410.454022,-794.210744,-50782.56296,-1354.899393,-47.678273,-58459.67072,736.076259
31,7028,-247.388122,1281.580935,1559.693354,984.00412,1567.306355,1559.751146,1470.519975,1549.189461,1197.920901,...,-29736.69634,-228.482724,61.758113,261.733048,-946.253126,-48070.66786,-1508.850859,-203.434323,-57072.7679,588.075131
4,1563,275.898544,1459.753528,1655.879443,1321.089332,1473.447601,1480.170896,1650.327726,1510.901301,1396.702138,...,-57310.61967,202.456185,397.004063,646.629125,-400.066605,-60402.23717,-774.086664,333.414898,-62770.54127,831.441631
8,1585,265.274994,1453.181008,1655.398671,1317.40782,1472.869729,1479.801887,1649.09648,1509.411524,1393.851404,...,-57012.03405,247.67348,441.757632,651.895122,-315.44352,-60335.95127,-778.741243,363.366767,-62415.09885,857.330082
24,2871,220.590628,1764.328854,1627.127567,1177.595176,1514.708687,1502.646936,1583.963759,1599.758946,1421.908767,...,-45129.22197,362.488181,660.676556,919.2861,-322.67046,-56816.59191,-935.809451,419.367328,-60670.56052,1160.232987
14,1690,287.097752,1451.970943,1651.831317,1299.148087,1469.282453,1477.984462,1641.815859,1500.891685,1381.229319,...,-55693.05116,504.396283,728.974542,868.340373,-68.449727,-60005.66628,-743.262046,578.352554,-61979.87118,1081.92989
15,1726,300.221062,1466.396511,1650.295649,1292.845973,1468.076811,1477.408017,1638.976203,1498.459855,1378.656806,...,-55285.56093,541.204987,778.055936,932.991348,-49.625102,-59896.79005,-737.122182,608.174854,-61912.32413,1130.218586
25,3202,168.347064,1736.121618,1628.175612,1158.814089,1535.210595,1517.832736,1578.222402,1625.36561,1419.178591,...,-42912.40451,282.803432,584.873899,830.135195,-405.55045,-55999.048,-1001.066685,345.034443,-60384.26328,1098.582695
12,1639,267.866343,1443.820177,1653.774847,1308.10231,1471.105169,1478.860017,1645.585023,1505.048171,1386.710497,...,-56300.48098,411.933274,617.105378,755.089384,-134.087135,-60164.38828,-759.468621,499.865198,-62109.93672,985.338182


96

We see there are 96 missing values for the pressure. We'll use KNN imputation with 2 neighbors to replace them, to hopefully keep the data coherent in its progression.

In [78]:
# KNN imputer that fills NaNs from the 2 nearest neighbours; set_output makes
# the transform step return pandas DataFrames instead of NumPy arrays.
imputer = KNNImputer(missing_values=np.nan, n_neighbors=2)
imputer.set_output(transform="pandas")

# The imputer is refit on each frame, so train and test are filled independently.
pressure_train = imputer.fit_transform(pressure_train)
pressure_test = imputer.fit_transform(pressure_test)

# Number of missing values left in the training pressure after imputation.
pressure_train.isna().sum().sum()


0

In [83]:
# First ten rows of the training temperatures.
display(temperature_train.head(n=10))

Unnamed: 0,M.Time[d],N_1,N_2,N_3,N_4,N_5,N_6,N_7,N_8,N_9,...,N_891,N_892,N_893,N_894,N_895,N_896,N_897,N_898,N_899,N_900
0,1554.0,17.623059,17.15422,17.641578,17.455701,16.613589,16.604935,17.662407,16.503001,16.943823,...,17.503931,17.225297,17.498277,17.268529,17.573474,17.412215,17.526257,17.36494,24.026562,17.538194
1,1556.0,17.62086,17.154263,17.641672,17.45585,16.415312,16.605042,17.662519,16.503121,16.943985,...,17.510776,17.22329,17.498581,17.267488,17.578925,17.409841,17.52286,17.363663,33.729552,17.53746
2,1558.0,17.618608,17.154303,17.641766,17.455998,16.415377,16.605148,17.662632,16.50324,16.944146,...,17.534085,17.223733,17.501874,2872.837827,17.599256,17.407913,17.520157,17.36385,41.602481,17.537433
3,1560.0,17.616334,2717.706176,17.641859,17.456146,16.41544,16.605254,17.662744,16.503357,16.944307,...,17.58161,17.228355,17.50967,17.266326,17.640317,17.40677,17.51875,17.366504,48.21898,17.538652
4,1563.0,17.612991,17.154388,17.642,17.456367,16.415531,16.605414,17.662912,16.50353,16.944544,...,17.723547,17.249726,17.535358,17.267759,17.757592,17.408069,17.521699,17.379102,56.258743,17.545154
5,1567.0,17.609008,17.154454,17.642187,17.456661,16.415646,16.605626,17.663136,16.503753,16.944855,...,2766.426947,17.310388,17.598978,17.275449,18.010005,17.417672,17.538472,17.415806,64.775395,17.565561
6,1572.0,17.605614,17.154568,17.64242,17.457028,16.415779,16.60589,17.663415,16.504023,16.945233,...,18.634191,17.438013,17.72479,17.297646,18.444537,17.444923,17.583032,17.495517,73.147489,17.612783
7,1578.0,17.605318,17.154841,17.6427,17.457465,16.415925,16.606206,17.663749,16.504334,16.94567,...,19.53027,17.656072,17.934088,17.346024,19.076916,17.501008,17.670392,17.636533,80.996009,17.701098
8,1585.0,17.612025,17.15553,17.643025,17.457972,16.416077,16.606574,17.664138,16.504678,16.946154,...,20.726138,17.976554,18.239215,17.433777,19.889638,17.596132,17.812325,17.851145,88.108493,17.842181
9,1595.0,17.638763,17.157931,17.643489,17.458689,16.416256,16.607098,17.664689,16.505137,16.946813,...,22.524169,18.511739,18.750719,17.618164,21.073668,17.779369,18.073014,18.223945,95.570527,18.100318


In [82]:
# Fill the gaps in the temperature readings with the same 2-nearest-neighbour
# imputer that was used for the pressure (it is refit on the temperature frame).
temperature_train = imputer.fit_transform(temperature_train)

# Number of missing values left after imputation.
temperature_train.isna().sum().sum()

0

But here we notice sudden spikes in the temperature, going from about 20 to 2'000+ and going back down a few days later. We can consider those values as measurement errors.

### **Feature engineering**