In [57]:
import pandas as pd
import functions as func
import numpy as np
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from IPython.display import display


In [59]:
from functions import clean_data, feature_engineering, split_data, train_model, evaluate_model, plot_predictions, plot_distribution


In [62]:
# Relative locations of the two raw CSV inputs read by this notebook:
# the dam data file first, the weather data file second.
tunis_dams_rain_data_path, weather_data_path = (
    '../data/raw/tunis_dams_rain_mehdi.csv',
    '../data/raw/weather.csv',
)

def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    loaded_frame = pd.read_csv(filepath_or_buffer=file_path)
    return loaded_frame
# Read the dam file first, then the weather file.
tunis_dams_rain_data = load_data(file_path=tunis_dams_rain_data_path)
weather_data = load_data(file_path=weather_data_path)

# Render the first rows of each freshly loaded frame as a quick sanity check.
tunis_dams_head = tunis_dams_rain_data.head()
display(tunis_dams_head)
weather_head = weather_data.head()
display(weather_head)


Unnamed: 0,date,MELLEGUE,BEN METIR,KASSEB,BARBARA,SIDI SALEM,BOU-HEURTMA,JOUMINE,GHEZALA,SEJNANE,...,SIDI AÏCH,EL BREK,BEZIRK,CHIBA,MASRI,LEBNA,HMA,ABID,Zarga,Ziatine
0,2014-01-08,26.721,48.091,67.485,60.787,448.591,71.317,84.411,7.883,89.022,...,0.201,2.4,3.255,0.293,2.822,9.82,1.604,4.608,,
1,2014-01-09,21.315,44.527,64.278,58.725,400.528,62.8,74.739,7.221,79.611,...,0.201,2.4,2.201,0.134,1.549,8.17,1.491,3.558,,
2,2014-01-10,26.113,40.278,61.178,57.022,360.543,54.719,66.061,6.586,69.456,...,0.201,2.4,1.59,0.14,0.924,7.1,1.413,2.789,,
3,2014-01-12,25.978,33.311,55.713,57.318,346.387,46.205,53.778,5.995,51.276,...,0.201,2.4,0.998,1.116,1.882,12.562,1.209,4.705,,
4,2014-02-08,26.519,47.955,67.405,60.708,446.569,71.011,84.088,7.854,88.667,...,0.201,2.4,3.215,0.285,2.77,9.735,1.599,4.584,,


Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2014-01-01,6.8,1.3,14.7,0.0,,273.0,19.3,,,
1,2014-01-02,7.5,,14.4,,,,,,,
2,2014-01-03,10.4,,,0.0,,,,,,
3,2014-01-04,10.6,3.5,18.8,0.0,,197.0,9.5,,,
4,2014-01-05,11.7,8.9,20.0,0.0,,,9.2,,,


In [64]:
# Display dataset previews
def preview_data(df, name):
    """Print a heading for ``name`` and render the first five rows of ``df``.

    Args:
        df: DataFrame to preview.
        name: Label used in the printed heading.
    """
    heading = f"Preview of {name}:"
    print(heading)
    first_rows = df.head()
    display(first_rows)

# Show the head of each raw dataset under its own heading.
preview_data(df=tunis_dams_rain_data, name='tunis_dams_rain_data')
preview_data(df=weather_data, name='weather_data')


Preview of tunis_dams_rain_data:


Unnamed: 0,date,MELLEGUE,BEN METIR,KASSEB,BARBARA,SIDI SALEM,BOU-HEURTMA,JOUMINE,GHEZALA,SEJNANE,...,SIDI AÏCH,EL BREK,BEZIRK,CHIBA,MASRI,LEBNA,HMA,ABID,Zarga,Ziatine
0,2014-01-08,26.721,48.091,67.485,60.787,448.591,71.317,84.411,7.883,89.022,...,0.201,2.4,3.255,0.293,2.822,9.82,1.604,4.608,,
1,2014-01-09,21.315,44.527,64.278,58.725,400.528,62.8,74.739,7.221,79.611,...,0.201,2.4,2.201,0.134,1.549,8.17,1.491,3.558,,
2,2014-01-10,26.113,40.278,61.178,57.022,360.543,54.719,66.061,6.586,69.456,...,0.201,2.4,1.59,0.14,0.924,7.1,1.413,2.789,,
3,2014-01-12,25.978,33.311,55.713,57.318,346.387,46.205,53.778,5.995,51.276,...,0.201,2.4,0.998,1.116,1.882,12.562,1.209,4.705,,
4,2014-02-08,26.519,47.955,67.405,60.708,446.569,71.011,84.088,7.854,88.667,...,0.201,2.4,3.215,0.285,2.77,9.735,1.599,4.584,,


Preview of weather_data:


Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2014-01-01,6.8,1.3,14.7,0.0,,273.0,19.3,,,
1,2014-01-02,7.5,,14.4,,,,,,,
2,2014-01-03,10.4,,,0.0,,,,,,
3,2014-01-04,10.6,3.5,18.8,0.0,,197.0,9.5,,,
4,2014-01-05,11.7,8.9,20.0,0.0,,,9.2,,,


In [66]:
# Check for missing values
def check_missing_values(df, name):
    """Print the number of missing values in every column of ``df``.

    Args:
        df: DataFrame to inspect.
        name: Label used in the printed heading.
    """
    print(f"Checking for missing values in {name}:")
    # Boolean missing-value mask summed column-wise gives one count per column.
    missing_mask = df.isnull()
    missing_per_column = missing_mask.sum()
    print(missing_per_column)

# Report per-column missing-value counts for both raw datasets.
check_missing_values(df=tunis_dams_rain_data, name='tunis_dams_rain_data')
check_missing_values(df=weather_data, name='weather_data')


Checking for missing values in tunis_dams_rain_data:
date              0
MELLEGUE          0
BEN METIR         0
KASSEB            0
BARBARA           0
SIDI SALEM        0
BOU-HEURTMA       0
JOUMINE           0
GHEZALA           0
SEJNANE           0
S. EL BARRAK      0
SILIANA           0
LAKHMESS          0
RMIL              0
BIR M'CHERGA      0
RMEL              0
NEBHANA           0
SIDI SAAD         0
EL HAOUAREB       0
SIDI AÏCH         0
EL BREK           0
BEZIRK            0
CHIBA             0
MASRI             0
LEBNA             0
HMA               4
ABID              4
Zarga           755
Ziatine         755
dtype: int64
Checking for missing values in weather_data:
date       0
tavg       0
tmin      94
tmax      63
prcp     154
snow    1978
wdir     581
wspd     315
wpgt    1978
pres     960
tsun    1978
dtype: int64


In [68]:
# Remove duplicates
def remove_duplicates(df, name):
    """Return ``df`` without duplicated rows and show the first rows of the result.

    Args:
        df: DataFrame to de-duplicate; ``drop_duplicates`` is called with its
            default arguments and the result is a new object.
        name: Label used in the printed heading.

    Returns:
        The de-duplicated DataFrame.
    """
    deduplicated = df.drop_duplicates()
    print(f"{name} after removing duplicates:")
    first_rows = deduplicated.head()
    display(first_rows)
    return deduplicated

# De-duplicate each raw frame into its "_cleaned" counterpart; the raw frames stay available.
tunis_dams_rain_data_cleaned = remove_duplicates(
    df=tunis_dams_rain_data,
    name='tunis_dams_rain_data',
)
weather_data_cleaned = remove_duplicates(
    df=weather_data,
    name='weather_data',
)

tunis_dams_rain_data after removing duplicates:


Unnamed: 0,date,MELLEGUE,BEN METIR,KASSEB,BARBARA,SIDI SALEM,BOU-HEURTMA,JOUMINE,GHEZALA,SEJNANE,...,SIDI AÏCH,EL BREK,BEZIRK,CHIBA,MASRI,LEBNA,HMA,ABID,Zarga,Ziatine
0,2014-01-08,26.721,48.091,67.485,60.787,448.591,71.317,84.411,7.883,89.022,...,0.201,2.4,3.255,0.293,2.822,9.82,1.604,4.608,,
1,2014-01-09,21.315,44.527,64.278,58.725,400.528,62.8,74.739,7.221,79.611,...,0.201,2.4,2.201,0.134,1.549,8.17,1.491,3.558,,
2,2014-01-10,26.113,40.278,61.178,57.022,360.543,54.719,66.061,6.586,69.456,...,0.201,2.4,1.59,0.14,0.924,7.1,1.413,2.789,,
3,2014-01-12,25.978,33.311,55.713,57.318,346.387,46.205,53.778,5.995,51.276,...,0.201,2.4,0.998,1.116,1.882,12.562,1.209,4.705,,
4,2014-02-08,26.519,47.955,67.405,60.708,446.569,71.011,84.088,7.854,88.667,...,0.201,2.4,3.215,0.285,2.77,9.735,1.599,4.584,,


weather_data after removing duplicates:


Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2014-01-01,6.8,1.3,14.7,0.0,,273.0,19.3,,,
1,2014-01-02,7.5,,14.4,,,,,,,
2,2014-01-03,10.4,,,0.0,,,,,,
3,2014-01-04,10.6,3.5,18.8,0.0,,197.0,9.5,,,
4,2014-01-05,11.7,8.9,20.0,0.0,,,9.2,,,


In [70]:
# Check data types
def check_data_types(df, name):
    """Print the dtype of every column in ``df`` under a heading for ``name``.

    Args:
        df: DataFrame to inspect.
        name: Label used in the printed heading.
    """
    print(f"Data types in {name}:")
    column_types = df.dtypes
    print(column_types)

# List the column dtypes of both de-duplicated frames.
check_data_types(df=tunis_dams_rain_data_cleaned, name='tunis_dams_rain_data')
check_data_types(df=weather_data_cleaned, name='weather_data')


Data types in tunis_dams_rain_data:
date             object
MELLEGUE        float64
BEN METIR       float64
KASSEB          float64
BARBARA         float64
SIDI SALEM      float64
BOU-HEURTMA     float64
JOUMINE         float64
GHEZALA         float64
SEJNANE         float64
S. EL BARRAK    float64
SILIANA         float64
LAKHMESS        float64
RMIL            float64
BIR M'CHERGA    float64
RMEL            float64
NEBHANA         float64
SIDI SAAD       float64
EL HAOUAREB     float64
SIDI AÏCH       float64
EL BREK         float64
BEZIRK          float64
CHIBA           float64
MASRI           float64
LEBNA           float64
HMA             float64
ABID            float64
Zarga           float64
Ziatine         float64
dtype: object
Data types in weather_data:
date     object
tavg    float64
tmin    float64
tmax    float64
prcp    float64
snow    float64
wdir    float64
wspd    float64
wpgt    float64
pres    float64
tsun    float64
dtype: object


In [72]:
# Feature Engineering
def add_date_features(df, name):
    """Return ``df`` extended with 'Year', 'Month' and 'Day' columns.

    The values are derived from the 'date' column, which is parsed once with
    ``pd.DatetimeIndex``. The caller's frame is not modified: the new columns
    are added to a copy, so re-running the calling cell is idempotent and no
    SettingWithCopyWarning is raised for frames that are slices of another
    frame (such as the result of ``drop_duplicates``).

    Args:
        df: DataFrame that may contain a 'date' column.
        name: Label used in the confirmation message.

    Returns:
        A new DataFrame with the three extra columns when 'date' is present;
        otherwise ``df`` itself, unchanged.
    """
    if 'date' not in df.columns:
        return df

    # Parse the column a single time and reuse it for all three components.
    parsed_dates = pd.DatetimeIndex(df['date'])
    df = df.assign(
        Year=parsed_dates.year,
        Month=parsed_dates.month,
        Day=parsed_dates.day,
    )
    print(f"Added date features to {name}.")
    return df

def add_rainfall_feature(df, column='rainfall'):
    """Return ``df`` extended with a 'Cumulative_Rainfall' running total.

    The running total is the cumulative sum of ``column`` in the frame's
    current row order (no sorting is done here). The caller's frame is not
    modified; the new column is added to a copy.

    Args:
        df: DataFrame that may contain the rainfall column.
        column: Name of the column to accumulate. Defaults to 'rainfall',
            the name the original implementation looked for.

    Returns:
        A new DataFrame with 'Cumulative_Rainfall' when ``column`` exists;
        otherwise ``df`` itself, unchanged.
    """
    if column not in df.columns:
        # Report the skip so a mismatched column name does not pass unnoticed.
        print(f"Skipped 'Cumulative_Rainfall': no '{column}' column found.")
        return df

    df = df.assign(Cumulative_Rainfall=df[column].cumsum())
    print("Added 'Cumulative_Rainfall' feature.")
    return df

def add_temperature_change_feature(df, column='temperature'):
    """Return ``df`` extended with a 'Temperature_Change' column.

    Each value is the difference between a row's ``column`` value and the
    previous row's (``Series.diff``), so the first row is NaN. Rows are used
    in their current order (no sorting is done here). The caller's frame is
    not modified; the new column is added to a copy.

    Args:
        df: DataFrame that may contain the temperature column.
        column: Name of the column to difference. Defaults to 'temperature',
            the name the original implementation looked for.

    Returns:
        A new DataFrame with 'Temperature_Change' when ``column`` exists;
        otherwise ``df`` itself, unchanged.
    """
    if column not in df.columns:
        # Report the skip so a mismatched column name does not pass unnoticed.
        print(f"Skipped 'Temperature_Change': no '{column}' column found.")
        return df

    df = df.assign(Temperature_Change=df[column].diff())
    print("Added 'Temperature_Change' feature.")
    return df


In [74]:
# Derive Year/Month/Day for both frames, then attempt the dataset-specific features.
# NOTE(review): the dtype listing above shows no 'rainfall' column in the dam frame
# and no 'temperature' column in the weather frame, so with the default column names
# the rainfall and temperature steps add nothing here. The weather columns 'prcp' and
# 'tavg' are presumably the intended inputs - confirm.
tunis_dams_rain_data_cleaned = add_rainfall_feature(
    add_date_features(tunis_dams_rain_data_cleaned, 'tunis_dams_rain_data')
)

weather_data_cleaned = add_temperature_change_feature(
    add_date_features(weather_data_cleaned, 'weather_data')
)

Added date features to tunis_dams_rain_data.
Added date features to weather_data.


In [76]:
# Preview the updated datasets
preview_data(df=tunis_dams_rain_data_cleaned, name='updated tunis_dams_rain_data')
preview_data(df=weather_data_cleaned, name='updated weather_data')


Preview of updated tunis_dams_rain_data:


Unnamed: 0,date,MELLEGUE,BEN METIR,KASSEB,BARBARA,SIDI SALEM,BOU-HEURTMA,JOUMINE,GHEZALA,SEJNANE,...,CHIBA,MASRI,LEBNA,HMA,ABID,Zarga,Ziatine,Year,Month,Day
0,2014-01-08,26.721,48.091,67.485,60.787,448.591,71.317,84.411,7.883,89.022,...,0.293,2.822,9.82,1.604,4.608,,,2014,1,8
1,2014-01-09,21.315,44.527,64.278,58.725,400.528,62.8,74.739,7.221,79.611,...,0.134,1.549,8.17,1.491,3.558,,,2014,1,9
2,2014-01-10,26.113,40.278,61.178,57.022,360.543,54.719,66.061,6.586,69.456,...,0.14,0.924,7.1,1.413,2.789,,,2014,1,10
3,2014-01-12,25.978,33.311,55.713,57.318,346.387,46.205,53.778,5.995,51.276,...,1.116,1.882,12.562,1.209,4.705,,,2014,1,12
4,2014-02-08,26.519,47.955,67.405,60.708,446.569,71.011,84.088,7.854,88.667,...,0.285,2.77,9.735,1.599,4.584,,,2014,2,8


Preview of updated weather_data:


Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,Year,Month,Day
0,2014-01-01,6.8,1.3,14.7,0.0,,273.0,19.3,,,,2014,1,1
1,2014-01-02,7.5,,14.4,,,,,,,,2014,1,2
2,2014-01-03,10.4,,,0.0,,,,,,,2014,1,3
3,2014-01-04,10.6,3.5,18.8,0.0,,197.0,9.5,,,,2014,1,4
4,2014-01-05,11.7,8.9,20.0,0.0,,,9.2,,,,2014,1,5


In [19]:
# Step: replace every remaining NaN in both cleaned frames with 0.
# NOTE(review): 0 is also a genuine observed value in some of these columns
# (e.g. 'prcp' is 0.0 in observed rows), so after this step a missing reading
# can no longer be told apart from a measured zero - confirm this is acceptable
# for the downstream model.
tunis_dams_rain_data_cleaned = tunis_dams_rain_data_cleaned.fillna(value=0)
weather_data_cleaned = weather_data_cleaned.fillna(value=0)

# Show the first rows of each frame after the fill.
print("tunis_dams_rain_data after handling NaN values (filled with 0):")
tunis_dams_filled_head = tunis_dams_rain_data_cleaned.head()
display(tunis_dams_filled_head)

print("\nweather_data after handling NaN values (filled with 0):")
weather_filled_head = weather_data_cleaned.head()
display(weather_filled_head)


tunis_dams_rain_data after handling NaN values (filled with 0):


Unnamed: 0,date,MELLEGUE,BEN METIR,KASSEB,BARBARA,SIDI SALEM,BOU-HEURTMA,JOUMINE,GHEZALA,SEJNANE,...,CHIBA,MASRI,LEBNA,HMA,ABID,Zarga,Ziatine,Year,Month,Day
0,2014-01-08,26.721,48.091,67.485,60.787,448.591,71.317,84.411,7.883,89.022,...,0.293,2.822,9.82,1.604,4.608,0.0,0.0,2014,1,8
1,2014-01-09,21.315,44.527,64.278,58.725,400.528,62.8,74.739,7.221,79.611,...,0.134,1.549,8.17,1.491,3.558,0.0,0.0,2014,1,9
2,2014-01-10,26.113,40.278,61.178,57.022,360.543,54.719,66.061,6.586,69.456,...,0.14,0.924,7.1,1.413,2.789,0.0,0.0,2014,1,10
3,2014-01-12,25.978,33.311,55.713,57.318,346.387,46.205,53.778,5.995,51.276,...,1.116,1.882,12.562,1.209,4.705,0.0,0.0,2014,1,12
4,2014-02-08,26.519,47.955,67.405,60.708,446.569,71.011,84.088,7.854,88.667,...,0.285,2.77,9.735,1.599,4.584,0.0,0.0,2014,2,8



weather_data after handling NaN values (filled with 0):


Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,Year,Month,Day
0,2014-01-01,6.8,1.3,14.7,0.0,0.0,273.0,19.3,0.0,0.0,0.0,2014,1,1
1,2014-01-02,7.5,0.0,14.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,1,2
2,2014-01-03,10.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,1,3
3,2014-01-04,10.6,3.5,18.8,0.0,0.0,197.0,9.5,0.0,0.0,0.0,2014,1,4
4,2014-01-05,11.7,8.9,20.0,0.0,0.0,0.0,9.2,0.0,0.0,0.0,2014,1,5
