In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

In [25]:
def read_data(path, **kwargs):
    """Load a CSV file into a DataFrame, falling back to an empty frame on failure.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; handed straight to ``pd.read_csv``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, or an empty DataFrame when reading raised any
        exception. In that case a message is printed instead of the error
        being propagated, so callers should check ``.empty`` if they need
        to detect a failed load.
    """
    try:
        return pd.read_csv(path, **kwargs)
    except FileNotFoundError:
        print(f"File {path} not found")
    except Exception as err:
        print(f"An error occured: {err}")
    # Only reached when one of the handlers above ran.
    return pd.DataFrame()

In [26]:
# Load the raw train and test splits. read_data prints a message and returns
# an empty DataFrame if a file cannot be read, so a failed load does not raise here.
train=read_data('../data/plain/train.csv')
test=read_data('../data/plain/test.csv')

In [4]:
def extract_temporal_features(df):
    """Add calendar-style columns derived from the ``day`` column.

    Three columns are written onto ``df`` in place:

    * ``week_of_year`` - 1-based index of the 7-day block containing ``day``.
    * ``month`` - result of ``day_to_month`` applied to each ``day`` value.
    * ``day_of_week`` - 0-6 position inside that 7-day block. This is a
      position within consecutive 7-day blocks counted from day 1, not a
      real weekday.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``day`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns added.
    """
    zero_based_day = df['day'] - 1
    df['week_of_year'] = zero_based_day // 7 + 1
    df['month'] = df['day'].apply(day_to_month)
    df['day_of_week'] = zero_based_day % 7
    return df

In [5]:
def day_to_month(day):
    """Converts day of the year to month, using a non-leap-year calendar.

    Parameters
    ----------
    day : number
        Day of the year, where 1 is January 1st.

    Returns
    -------
    int
        Month number 1-12. Any value that falls in none of the
        January-November ranges (including values below 1, above 365, or
        between two ranges) is reported as 12 (December).
    """
    # Inclusive (first_day, last_day) bounds for months 1..11.
    # December is the fallback and therefore has no entry.
    month_spans = (
        (1, 31),     # January
        (32, 59),    # February (non-leap year)
        (60, 90),    # March
        (91, 120),   # April
        (121, 151),  # May
        (152, 181),  # June
        (182, 212),  # July
        (213, 243),  # August
        (244, 273),  # September
        (274, 304),  # October
        (305, 334),  # November
    )
    for month, (first_day, last_day) in enumerate(month_spans, start=1):
        if first_day <= day <= last_day:
            return month
    return 12


In [27]:
# Derive week_of_year, month and day_of_week from `day` for both splits.
train = extract_temporal_features(train)
test = extract_temporal_features(test)

In [7]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,week_of_year,month,day_of_week
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1,1,1,0
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1,1,1,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1,1,1,2
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1,1,1,3
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0,1,1,4


In [29]:
from sklearn.preprocessing import PowerTransformer

# Reduce skew in the selected features with a Yeo-Johnson power transform
# (unlike a plain log transform it also accepts zero and negative values).
features_to_transform = ['dewpoint', 'humidity', 'cloud', 'sunshine']
transformer = PowerTransformer(method='yeo-johnson')
def transform_features(df, features_to_transform, transformer, fit=True):
    """Replace the given columns of ``df`` with their transformed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    features_to_transform : list of str
        Column names passed to the transformer and overwritten with its output.
    transformer : object
        Object exposing ``fit_transform`` and ``transform`` (for example a
        scikit-learn ``PowerTransformer``).
    fit : bool, default True
        When True the transformer is (re-)fit on ``df`` before transforming,
        which is the original behaviour. Pass False to reuse parameters the
        transformer already learned from another frame.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the selected columns overwritten.
    """
    selected = df[features_to_transform]
    if fit:
        df[features_to_transform] = transformer.fit_transform(selected)
    else:
        df[features_to_transform] = transformer.transform(selected)
    return df
# Fit on the training split only and reuse the learned parameters for test.
# Re-fitting on test would map the two splits with different parameters.
train = transform_features(train, features_to_transform, transformer)
test = transform_features(test, features_to_transform, transformer, fit=False)

In [31]:
def impute_missing_values(df):
    """Fill missing values in every column with a k-nearest-neighbours imputer.

    A fresh ``KNNImputer(n_neighbors=5)`` is fit on ``df`` itself and then
    applied to it, so every column of ``df`` takes part in the neighbour search.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is not modified.
        Presumably all columns must be numeric for the imputer to accept
        them - verify against the KNNImputer documentation.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column labels and the same row index as
        ``df``. It is rebuilt from the imputer's array output, so the
        original column dtypes are not retained.
    """
    imputer = KNNImputer(n_neighbors=5)
    imputed_values = imputer.fit_transform(df)
    # Carry the caller's index across: rebuilding from the bare array would
    # otherwise silently reset it to 0..n-1 and misalign later joins.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)
# Impute missing values in each split.
# NOTE(review): impute_missing_values fits a new KNNImputer on whatever frame
# it receives, so train and test are imputed independently, and every column
# (including `id` and, for train, `rainfall`) feeds the neighbour distance.
# Confirm this is intended.
train = impute_missing_values(train)
test = impute_missing_values(test)

In [32]:
# Generate new features
def generate_new_features(df):
    """Add a temperature-range column and a cyclical wind-direction encoding.

    Columns written onto ``df`` in place:

    * ``temp_range`` - ``maxtemp`` minus ``mintemp``.
    * ``sin_wind`` / ``cos_wind`` - sine and cosine of ``winddirection``
      after converting it from degrees to radians.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``maxtemp``, ``mintemp`` and ``winddirection``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns added.
    """
    wind_radians = np.radians(df['winddirection'])
    df['temp_range'] = df['maxtemp'] - df['mintemp']
    df['sin_wind'] = np.sin(wind_radians)
    df['cos_wind'] = np.cos(wind_radians)
    return df
# Add temp_range and the sine/cosine wind-direction encoding to both splits.
train = generate_new_features(train)
test = generate_new_features(test)

In [37]:
# Rolling averages
def generate_rolling_features(df):
    """Add trailing rolling means of the ``temparature`` column.

    Windows of 7, 30 and 90 rows are used. ``min_periods=1`` means the
    first rows are averaged over however many rows are available, so no
    NaNs are introduced at the start of the frame. The windows run in the
    frame's current row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``temparature`` column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``temp_7day_avg``,
        ``temp_30day_avg`` and ``temp_90day_avg`` added.
    """
    window_by_column = {
        'temp_7day_avg': 7,
        'temp_30day_avg': 30,
        'temp_90day_avg': 90,
    }
    temperature = df['temparature']
    for column, size in window_by_column.items():
        df[column] = temperature.rolling(window=size, min_periods=1).mean()
    return df
# Add the 7-, 30- and 90-row rolling temperature means to both splits
# (each split is rolled independently, in its current row order).
train = generate_rolling_features(train)
test = generate_rolling_features(test)

In [38]:
# Cap outliers at the 1.5 * IQR fences (values are clipped; no rows are dropped).
def remove_outliers(df, features, reference=None):
    """Clip each listed column of ``df`` to its 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are clipped in place.
    features : iterable of str
        Column names to clip.
    reference : pandas.DataFrame, optional
        Frame the quartiles are computed from. Defaults to ``df`` itself,
        which is the original behaviour. Pass the training frame when
        clipping a test frame so both splits share the same fences.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns clipped to
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.
    """
    source = df if reference is None else reference
    # Work out every fence before clipping anything, so the quartiles are
    # always read from unmodified data.
    fences = {}
    for col in features:
        q1 = source[col].quantile(0.25)
        q3 = source[col].quantile(0.75)
        iqr = q3 - q1
        fences[col] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    for col, (lower, upper) in fences.items():
        df[col] = df[col].clip(lower, upper)
    return df
features=train.drop(["id","rainfall"],axis=1).select_dtypes(include=[np.number]).columns
# Clip test first, using fences taken from the still-unclipped training data,
# so both splits are capped at the same thresholds. Then clip train itself.
test = remove_outliers(test, features, reference=train)
train = remove_outliers(train, features)
            


In [39]:
# Standardise the numeric feature columns.
def scale_features(df, features, scaler=None):
    """Overwrite the given columns of ``df`` with their scaled values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    features : list-like of str
        Column names to scale.
    scaler : fitted scaler, optional
        Object exposing ``transform`` that has already been fit (for example
        a ``StandardScaler`` fit on the training split). When omitted, a new
        ``StandardScaler`` is fit on ``df[features]``, which is the original
        behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns replaced by the
        scaler's output.
    """
    if scaler is None:
        scaler = StandardScaler().fit(df[features])
    df[features] = scaler.transform(df[features])
    return df
features=train.drop(["id","rainfall"],axis=1).select_dtypes(include=[np.number]).columns
# Fit the scaler on train only and reuse it for test, so both splits are on
# the same scale. A scaler fit on test would shift its features relative to train.
scaler = StandardScaler().fit(train[features])
train = scale_features(train, features, scaler)
test = scale_features(test, features, scaler)

In [40]:
# Preview the first rows of the training frame after scaling.
train.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,...,rainfall,week_of_year,month,day_of_week,temp_range,sin_wind,cos_wind,temp_7day_avg,temp_30day_avg,temp_90day_avg
0,0.0,-1.701361,0.674366,-0.913809,-0.642199,-0.448967,-0.541843,0.623142,0.717107,-0.562997,...,1.0,-1.673055,-1.576593,-1.495064,-1.917212,0.819448,0.531505,-0.672507,-0.69541,-0.760443
1,1.0,-1.691853,1.046807,-1.798289,-1.350846,-1.259737,-1.111844,1.820625,1.115619,-1.376222,...,1.0,-1.673055,-1.576593,-0.994424,-2.514239,0.666234,0.746027,-1.043346,-1.079628,-1.184162
2,2.0,-1.682346,1.862629,-1.232222,-1.504067,-1.497036,-1.597165,-0.950455,-1.47477,1.182128,...,1.0,-1.673055,-1.576593,-0.493783,0.404561,0.932339,0.294159,-1.220413,-1.263083,-1.386478
3,3.0,-1.672838,-0.035045,-1.462187,-1.178472,-1.042214,-0.941734,1.820625,1.73419,-1.376222,...,1.0,-1.673055,-1.576593,0.006858,-1.983548,0.819448,0.531505,-1.223754,-1.266544,-1.390296
4,4.0,-1.663331,1.454718,-0.89612,-1.063556,-1.378387,-1.581177,-2.916398,-1.509377,0.336518,...,0.0,-1.673055,-1.576593,0.507499,1.266934,0.477351,0.931206,-1.201704,-1.243699,-1.365102


In [41]:
# Save the preprocessed train and test splits as CSV files (row index omitted).
train.to_csv('../data/preprocessed/train.csv', index=False)
test.to_csv('../data/preprocessed/test.csv', index=False)