In [1]:
# Check if running in Google Colab
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')

    # Define the base path for your data directory for Colab
    basePath = '/content/drive/MyDrive'

else:
    # Running locally or in a different environment
    basePath = '..'

In [2]:
import pandas as pd
import numpy as np

In [3]:
df_grouped = pd.read_csv(basePath + '/data/processed/grouped_data1.csv')

In [4]:
#Finding the features: like % increase/change in the temperature/other factors for each country and then taking them separate -> Since not all countries have all the years, we'll calculate 'a value' for each country -> which says that:    The % change in factor x in years around 2000s and in years around 2020s was highest in these 10 countries.

In [None]:
start_year_period1 = 2000
end_year_period1 = 2012
start_year_period2 = 2013
end_year_period2 = 2025

# Filter df_filtered for the first time period (around 2000s)
df_period1 = df_grouped[(df_grouped['DATE'] >= start_year_period1) & (df_grouped['DATE'] <= end_year_period1)].copy()

# Filter df_filtered for the second time period (around 2020s)
df_period2 = df_grouped[(df_grouped['DATE'] >= start_year_period2) & (df_grouped['DATE'] <= end_year_period2)].copy()

print("Data filtered for Period 1 (2000-2012):")
display(df_period1)

print("\nData filtered for Period 2 (2013-2025):")
display(df_period2)

print(df_period1['COUNTRY'].nunique())
print(df_period2['COUNTRY'].nunique())

Data filtered for Period 1 (2000-2012):


Unnamed: 0,COUNTRY,STATION,STATION_NAME,DATE,LATITUDE,LONGITUDE,ELEVATION,CDSD,CLDD,DP01,...,FZF0,FZF5,FZF6,HDSD,HDSD_ATTRIBUTES,HTDD,PRCP,TAVG,TMAX,TMIN
49,Tunisia,TS000060765,"GABES, TS",2005,33.880,10.100,4.0,,,38.0,...,,,,,,,146.5,,,
50,Tunisia,TS000060765,"GABES, TS",2006,33.880,10.100,4.0,,,33.0,...,,,,,,,399.5,,,
51,Tunisia,TS000060765,"GABES, TS",2007,33.880,10.100,4.0,,,35.0,...,,,,,,,351.0,,,
52,Tunisia,TS000060765,"GABES, TS",2008,33.880,10.100,4.0,,,28.0,...,,,,,,,132.3,,,
53,Tunisia,TS000060765,"GABES, TS",2009,33.880,10.100,4.0,,,36.0,...,,,,,,,155.3,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51029,Wallis and Futuna [France],WF000917530,"HIHIFO ILE WALLIS, WF",2012,-13.233,-176.167,27.0,,,245.0,...,,,,,,,2965.7,27.6,30.4,24.7
51126,Senegal,SG000061612,"PODOR, SG",2009,16.633,-14.917,7.0,,,,...,,,,,,,,,37.0,
51127,Senegal,SG000061612,"PODOR, SG",2010,16.633,-14.917,7.0,,,,...,,,,,,,,,38.1,
51128,Senegal,SG000061612,"PODOR, SG",2011,16.633,-14.917,7.0,,,,...,,,,,,,,,37.5,



Data filtered for Period 2 (2013-2025):


Unnamed: 0,COUNTRY,STATION,STATION_NAME,DATE,LATITUDE,LONGITUDE,ELEVATION,CDSD,CLDD,DP01,...,FZF0,FZF5,FZF6,HDSD,HDSD_ATTRIBUTES,HTDD,PRCP,TAVG,TMAX,TMIN
56,Tunisia,TS000060765,"GABES, TS",2022,33.880,10.100,4.0,,,24.0,...,,,,,,,48.2,,,
89,Tunisia,TS000060725,"JENDOUBA, TS",2014,36.480,8.800,143.0,,,93.0,...,,,,,,,490.9,,,
90,Tunisia,TS000060725,"JENDOUBA, TS",2016,36.480,8.800,143.0,,,101.0,...,,,,,,,394.3,,,
91,Tunisia,TS000060725,"JENDOUBA, TS",2018,36.480,8.800,143.0,,,96.0,...,,,,,,,500.4,,,
92,Tunisia,TS000060725,"JENDOUBA, TS",2019,36.480,8.800,143.0,,,112.0,...,,,,,,,465.8,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51038,Wallis and Futuna [France],WF000917530,"HIHIFO ILE WALLIS, WF",2021,-13.233,-176.167,27.0,,,254.0,...,,,,,,,2504.5,,,
51039,Wallis and Futuna [France],WF000917530,"HIHIFO ILE WALLIS, WF",2022,-13.233,-176.167,27.0,,,285.0,...,,,,,,,2247.8,,,
51040,Wallis and Futuna [France],WF000917530,"HIHIFO ILE WALLIS, WF",2023,-13.233,-176.167,27.0,,,264.0,...,,,,,,,2751.9,,,
51041,Wallis and Futuna [France],WF000917530,"HIHIFO ILE WALLIS, WF",2024,-13.233,-176.167,27.0,,,243.0,...,,,,,,,2268.9,,,


124
113


In [6]:
# Calculate the average TAVG for each country in period 1
avg_tavg_period1 = df_period1.groupby('COUNTRY')['TAVG'].mean().reset_index()
avg_tavg_period1.rename(columns={'TAVG': 'Avg_TAVG_Period1'}, inplace=True)

# Calculate the average TAVG for each country in period 2
avg_tavg_period2 = df_period2.groupby('COUNTRY')['TAVG'].mean().reset_index()
avg_tavg_period2.rename(columns={'TAVG': 'Avg_TAVG_Period2'}, inplace=True)

print("Average TAVG per country for Period 1 (2000-2012):")
display(avg_tavg_period1.head())

print("\nAverage TAVG per country for Period 2 (2013-2025):")
display(avg_tavg_period2.head())

Average TAVG per country for Period 1 (2000-2012):


Unnamed: 0,COUNTRY,Avg_TAVG_Period1
0,Algeria,
1,American Samoa [United States],27.992308
2,Antarctica,-6.014286
3,Argentina,15.982432
4,Armenia,



Average TAVG per country for Period 2 (2013-2025):


Unnamed: 0,COUNTRY,Avg_TAVG_Period2
0,Albania,
1,Algeria,
2,American Samoa [United States],27.85
3,Antarctica,-12.314286
4,Argentina,


In [None]:
# Merge the two dataframes on the 'COUNTRY' column
tavg_change = pd.merge(avg_tavg_period1, avg_tavg_period2, on='COUNTRY', how='outer')

# Calculate the percentage change in TAVG
# Avoid division by zero or calculating change from NaN
tavg_change['TAVG_Change_Percent'] = ((tavg_change['Avg_TAVG_Period2'] - tavg_change['Avg_TAVG_Period1']) / tavg_change['Avg_TAVG_Period1']) * 100

print("Percentage Change in Average TAVG per Country (Period 1 to Period 2):")
display(tavg_change.sort_values(by='TAVG_Change_Percent', ascending=False).head(10))

Percentage Change in Average TAVG per Country (Period 1 to Period 2):


Unnamed: 0,COUNTRY,Avg_TAVG_Period1,Avg_TAVG_Period2,TAVG_Change_Percent
3,Antarctica,-6.014286,-12.314286,104.750594
60,Jan Mayen [Norway],0.653846,1.158333,77.156863
48,Greenland [Denmark],-4.534783,-6.284615,38.586916
41,Finland,3.005128,3.741667,24.509386
7,Austria,5.635897,6.46,14.622384
107,Slovakia,7.0,7.9625,13.75
36,Estonia,6.323077,7.118182,12.574652
53,India,24.96,27.381818,9.702797
46,Germany,9.571154,10.379167,8.442167
121,Ukraine,9.313462,10.046429,7.869974


In [8]:
tavg_change.to_csv(basePath + '/data/processed/tavg_change.csv', index=False)

In [9]:
# Calculate the average TMIN for each country in period 1
avg_tmin_period1 = df_period1.groupby('COUNTRY')['TMIN'].mean().reset_index()
avg_tmin_period1.rename(columns={'TMIN': 'Avg_TMIN_Period1'}, inplace=True)

# Calculate the average TMIN for each country in period 2
avg_tmin_period2 = df_period2.groupby('COUNTRY')['TMIN'].mean().reset_index()
avg_tmin_period2.rename(columns={'TMIN': 'Avg_TMIN_Period2'}, inplace=True)

# Merge the two dataframes on the 'COUNTRY' column
tmin_change = pd.merge(avg_tmin_period1, avg_tmin_period2, on='COUNTRY', how='outer')

# Calculate the percentage change in TMIN
tmin_change['TMIN_Change_Percent'] = ((tmin_change['Avg_TMIN_Period2'] - tmin_change['Avg_TMIN_Period1']) / tmin_change['Avg_TMIN_Period1']) * 100

# Display the results, sorting by the percentage change
print("Percentage Change in Average TMIN per Country (Period 1 to Period 2):")
display(tmin_change.sort_values(by='TMIN_Change_Percent', ascending=False).head(10))

Percentage Change in Average TMIN per Country (Period 1 to Period 2):


Unnamed: 0,COUNTRY,Avg_TMIN_Period1,Avg_TMIN_Period2,TMIN_Change_Percent
3,Antarctica,-9.106667,-16.1625,77.479868
118,Turkey,9.530435,16.166667,69.631995
107,Slovakia,1.569231,2.475,57.720588
7,Austria,2.115385,2.903333,37.248485
104,Saudi Arabia,20.144444,26.85,33.287369
36,Estonia,2.492308,3.281818,31.67789
101,Romania,2.843333,3.604545,26.771821
48,Greenland [Denmark],-8.0125,-9.257143,15.533764
22,Chile,8.82,10.075,14.229025
121,Ukraine,4.675,5.339286,14.20932


In [10]:
#NOTE: Even though the Global warming is causing temperatures to rise in Antarctica, this is average of minimum temperature across the year and then again averaged for a decade, so even if it is having hot days, because of some extremely cold days (some days getting colder), the average tmin here is decreased....Minimum temperatures are getting colder (more negative).
"""
This could mean:

    More extreme cold events in winter.

    More days with very low temperatures (you confirmed this).

At the same time, number of days with high temperatures is increasing
"""
print()


