In this notebook, we perform the **testing phase** of the project.  
After building and training the model on **3 governorates** (Dakahlia, Fayoum, and Matrouh),  
we now evaluate its performance on **2 unseen governorates**:  

- **North Sinai**  
- **New Valley**  

The goal of this step is to **assess the generalization ability** of the trained model  
and verify whether the preprocessing pipeline and modeling approach can handle data from new regions effectively.  

This testing process provides insights into the **robustness and reliability** of the model  
before applying it to user-provided data for desertification prediction in other locations.  


In [None]:
# Make the Google Drive contents visible inside the Colab runtime
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import os, glob
import pandas as pd

In [None]:
# Drive folder that holds the uploaded project data
data_path = '/content/drive/MyDrive/grad_project_data'

# os.listdir returns every directory entry, so sub-folders are counted too
files = os.listdir(data_path)
print(f"{len(files)} files found")

16 files found


In [None]:
# Governorates loaded for testing; each name is used as a sub-folder of data_path
govs = [
    'NorthSinai',
    'NewValley',
]

In [None]:
# Start from an empty frame; the loading loop appends every CSV it reads to it
all_data = pd.DataFrame()

In [None]:
# Load every CSV of every test governorate and stack them into one frame.
# The frames are gathered in a list and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows on each iteration.
frames = []
for gov in govs:
    gov_path = os.path.join(data_path, gov)

    # sorted() makes the row order independent of the file-system listing order
    csv_files = sorted(glob.glob(os.path.join(gov_path, '*.csv')))
    print(f"{gov}: {len(csv_files)} files")

    for file in csv_files:
        df = pd.read_csv(file)
        frames.append(df)

# all_data is rebuilt from scratch instead of being appended to, so re-running
# this cell does not duplicate rows. pd.concat raises on an empty list, hence
# the fallback to an empty frame when no CSV was found.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

NorthSinai: 36 files
NewValley: 36 files


In [None]:
# Report the size of the combined frame, then preview its first rows
print(f"Final shape: {all_data.shape}")
all_data.head()

Final shape: (19459, 20)


Unnamed: 0,longitude,latitude,year,month,area,ndvi,t2m_c,td2m_c,rh_pct,tp_m,ssrd_jm2,LC_Type1,sand,silt,clay,soc,ph,bdod,cec,POP
0,32.209095,30.178902,2023,2,NorthSinai,0.0804,12.618554,3.882845,55.253975,0.008923,6611885000.0,60,366,261,373,47,80,138,161,0.504184
1,33.6464,30.331615,2023,2,NorthSinai,0.1062,10.287051,2.998181,60.57323,0.02443,6967053000.0,60,332,346,324,42,80,141,135,0.000762
2,33.56555,31.0323,2023,2,NorthSinai,0.1013,12.752968,6.350627,65.034386,0.107716,6576160000.0,60,333,353,315,71,79,146,152,0.67581
3,32.62232,30.160936,2023,2,NorthSinai,0.26025,12.631394,4.875687,59.182293,0.010358,6718644000.0,40,396,329,276,52,80,137,141,0.705407
4,32.34384,29.846525,2023,2,NorthSinai,0.11265,12.745646,3.865558,54.72894,0.023517,6864726000.0,60,413,306,281,58,80,137,146,1.8e-05


In [None]:
# Number of rows per value of the `area` column, most frequent first
all_data['area'].value_counts(sort=True)

Unnamed: 0_level_0,count
area,Unnamed: 1_level_1
NorthSinai,19207
NewValley,252


In [None]:
# Discard the 2025 rows whose month is 8-12 (August to December).
# Everything else is kept: all other years, and the remaining months of 2025.
not_2025 = all_data['year'] != 2025
outside_aug_dec = ~all_data['month'].between(8, 12)
filtered_df = all_data[not_2025 | outside_aug_dec]

In [None]:
# (rows, columns) left after the 2025 month filter
filtered_df.shape

(16647, 20)

In [None]:
# Count rows that repeat an earlier row in every column
filtered_df.duplicated(keep='first').sum()

np.int64(334)

In [None]:
# Keep the first occurrence of each fully identical row and drop the repeats
des_df = filtered_df.drop_duplicates(keep='first')

In [None]:
# Remove the ndvi column from the test set.
# DataFrame.drop returns a new frame and leaves its caller untouched, so the
# result has to be assigned back; without the assignment the column was only
# absent from this cell's preview and was still written to test_data.csv.
# errors='ignore' lets the cell be re-run after ndvi is already gone.
# NOTE(review): confirm that no later step reads ndvi from the saved file.
des_df = des_df.drop(columns=['ndvi'], errors='ignore')
des_df

Unnamed: 0,longitude,latitude,year,month,area,t2m_c,td2m_c,rh_pct,tp_m,ssrd_jm2,LC_Type1,sand,silt,clay,soc,ph,bdod,cec,POP
0,32.209095,30.178902,2023,2,NorthSinai,12.618554,3.882845,55.253975,0.008923,6.611885e+09,60,366,261,373,47,80,138,161,0.504184
1,33.646400,30.331615,2023,2,NorthSinai,10.287051,2.998181,60.573230,0.024430,6.967053e+09,60,332,346,324,42,80,141,135,0.000762
2,33.565550,31.032300,2023,2,NorthSinai,12.752968,6.350627,65.034386,0.107716,6.576160e+09,60,333,353,315,71,79,146,152,0.675810
3,32.622320,30.160936,2023,2,NorthSinai,12.631394,4.875687,59.182293,0.010358,6.718644e+09,40,396,329,276,52,80,137,141,0.705407
4,32.343840,29.846525,2023,2,NorthSinai,12.745646,3.865558,54.728940,0.023517,6.864726e+09,60,413,306,281,58,80,137,146,0.000018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19440,30.655008,24.564432,2025,7,NewValley,33.726680,5.622256,17.361670,0.000194,1.275674e+10,60,532,213,256,48,82,130,167,0.000107
19441,30.331615,23.962560,2025,7,NewValley,33.863846,5.078693,16.590380,0.000194,1.268245e+10,60,532,209,260,45,82,133,161,0.071939
19442,30.601110,24.753078,2025,7,NewValley,33.846478,5.679476,17.314203,0.000194,1.274665e+10,40,510,261,229,69,82,133,185,0.000872
19443,30.681960,24.600365,2025,7,NewValley,33.726680,5.622256,17.361670,0.000194,1.275674e+10,60,505,253,243,34,81,133,166,0.007005


In [None]:
# Write the prepared frame to Drive; index=False leaves the row index out of the file
output_csv = '/content/drive/MyDrive/grad_project_data/test_data.csv'
des_df.to_csv(output_csv, index=False)