In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection._split import _BaseKFold

class GroupedTimeSeriesSplit(_BaseKFold):
    def __init__(self, n_splits=5, group_column=None):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.n_splits = n_splits
        self.group_column = group_column

    def split(self, data, date_column=None):
        if self.group_column is None:
            raise ValueError("Group column not specified.")

        unique_groups = data[self.group_column].unique()
        n_groups = len(unique_groups)

        if n_groups < self.n_splits:
            raise ValueError("Number of groups is less than n_splits.")

        tscv = TimeSeriesSplit(n_splits=self.n_splits)

        for train_index, test_index in tscv.split(unique_groups):
            train_groups = unique_groups[train_index]
            test_groups = unique_groups[test_index]
            train_mask = data[self.group_column].isin(train_groups)
            test_mask = data[self.group_column].isin(test_groups)
            yield np.where(train_mask)[0], np.where(test_mask)[0]

# Sample data
data = pd.read_csv("C:\\Users\\Dell\\Desktop\\studies\\SIT\\placement\\iqgate\\dataset\\train.csv")
df = pd.DataFrame(data)

# Grouped Time Series Cross-Validation
# Grouped Time Series Cross-Validation
gkf = GroupedTimeSeriesSplit(n_splits=min(5, len(df)), group_column="city")

for train_index, test_index in gkf.split(df):
    print("Train:", train_index, "Test:", test_index)
    print("Train Cities:", df.iloc[train_index]["city"].unique())
    print("Test Cities:", df.iloc[test_index]["city"].unique())
    print()


Train: [   0    1    2 ... 6469 6472 6475] Test: [  11   12   13 ... 6463 6468 6474]
Train Cities: ['Athens']
Test Cities: ['Irakleion']

Train: [   0    1    2 ... 6472 6474 6475] Test: [  23   24   25 ... 6467 6470 6477]
Train Cities: ['Athens' 'Irakleion']
Test Cities: ['Patra']

Train: [   0    1    2 ... 6474 6475 6477] Test: [  36   37   38 ... 6465 6478 6479]
Train Cities: ['Athens' 'Irakleion' 'Patra']
Test Cities: ['Thessaloniki']

Train: [   0    1    2 ... 6477 6478 6479] Test: [  57   58   59 ... 6471 6473 6476]
Train Cities: ['Athens' 'Irakleion' 'Patra' 'Thessaloniki']
Test Cities: ['Larisa']

Train: [   0    1    2 ... 6477 6478 6479] Test: [6480 6481 6482 ... 7557 7558 7559]
Train Cities: ['Athens' 'Irakleion' 'Patra' 'Thessaloniki' 'Larisa']
Test Cities: [nan]



In [9]:
# Identify the problematic value in the 'capacity' column
print(df['capacity'])

# Removing and replacing the values in column with diff. measuring unit to just ml and removing ml symbol
df['capacity'] = df['capacity'].replace('500ml', 500)
df['capacity'] = df['capacity'].replace('1.5lt', 1500)
df['capacity'] = df['capacity'].replace('330ml', 330)

# Convert the 'capacity' column to float
df['capacity'] = pd.to_numeric(df['capacity'], errors='coerce')

# Verify the conversion
print(df['capacity'])


0       500ml
1       1.5lt
2       330ml
3       500ml
4       330ml
        ...  
7555      NaN
7556      NaN
7557      NaN
7558      NaN
7559      NaN
Name: capacity, Length: 7560, dtype: object
0        500.0
1       1500.0
2        330.0
3        500.0
4        330.0
         ...  
7555       NaN
7556       NaN
7557       NaN
7558       NaN
7559       NaN
Name: capacity, Length: 7560, dtype: float64


In [10]:
#replacing null values in capacity column with median of that column
median_values = df['capacity'].median()

# Replace NaN values with median
df['capacity'].fillna(median_values, inplace=True)

# Verify that NaN values are replaced
print(df)

       id      date    city       lat      long       pop    shop  \
0     0.0  31/01/12  Athens  37.97945  23.71622  672130.0  shop_1   
1     1.0  31/01/12  Athens  37.97945  23.71622  672130.0  shop_1   
2     2.0  31/01/12  Athens  37.97945  23.71622  672130.0  shop_1   
3     3.0  31/01/12  Athens  37.97945  23.71622  672130.0  shop_1   
4     4.0  31/01/12  Athens  37.97945  23.71622  672130.0  shop_1   
...   ...       ...     ...       ...       ...       ...     ...   
7555  NaN       NaN     NaN       NaN       NaN       NaN     NaN   
7556  NaN       NaN     NaN       NaN       NaN       NaN     NaN   
7557  NaN       NaN     NaN       NaN       NaN       NaN     NaN   
7558  NaN       NaN     NaN       NaN       NaN       NaN     NaN   
7559  NaN       NaN     NaN       NaN       NaN       NaN     NaN   

            brand container  capacity  price  quantity  
0     kinder-cola     glass     500.0   0.96   13280.0  
1     kinder-cola   plastic    1500.0   2.86    6727.0  


In [6]:
# removing all the remaining rows with null values as it was throwing errors
# Identify columns with NaN values
columns_with_nan = df.columns[df.isna().any()].tolist()

# Print columns with NaN values
print("Columns with NaN values:", columns_with_nan)

# Remove rows with NaN values
df = df.dropna()

# Verify that NaN values are removed
print(df.info())


Columns with NaN values: []
<class 'pandas.core.frame.DataFrame'>
Index: 6391 entries, 0 to 6479
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         6391 non-null   float64
 1   date       6391 non-null   object 
 2   city       6391 non-null   object 
 3   lat        6391 non-null   float64
 4   long       6391 non-null   float64
 5   pop        6391 non-null   float64
 6   shop       6391 non-null   object 
 7   brand      6391 non-null   object 
 8   container  6391 non-null   object 
 9   capacity   6391 non-null   float64
 10  price      6391 non-null   float64
 11  quantity   6391 non-null   float64
dtypes: float64(7), object(5)
memory usage: 649.1+ KB
None


In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Function to train and evaluate a model using GroupedTimeSeriesSplit
def train_and_evaluate_model(data, features, target, model_type='random_forest', n_splits=5, group_column=None):
    # Initialize the model
    if model_type == 'random_forest':
        model = RandomForestRegressor(random_state=42)
    elif model_type == 'decision_tree':
        model = DecisionTreeRegressor(random_state=42)
    elif model_type == 'xgb':
        model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
    else:
        raise ValueError("Invalid model_type. Choose either 'random_forest' or 'decision_tree'.")
    
    # Initialize the custom cross-validator
    gkf = GroupedTimeSeriesSplit(n_splits=n_splits, group_column=group_column)
    
    # Initialize lists to store evaluation metrics
    mse_scores = []
    
    # Iterate over the splits
    for train_index, test_index in gkf.split(data):
        # Split data into train and test sets
        train_data, test_data = data.iloc[train_index], data.iloc[test_index]
        
        # Train the model
        model.fit(train_data[features], train_data[target])
        
        # Make predictions
        predictions = model.predict(test_data[features])
        
        # Calculate Mean Squared Error
        mse = mean_squared_error(test_data[target], predictions)
        
        # Store the MSE
        mse_scores.append(mse)
    
    # Calculate the mean of MSE scores
    mean_mse = np.mean(mse_scores)
    
    return mean_mse

# Features and target variable
features = ["pop", "capacity", "price"]
target = "quantity"

# Evaluate the RandomForestRegressor model
rf_mse = train_and_evaluate_model(df, features, target, model_type='random_forest', n_splits=4, group_column="city")
print("Random Forest Mean Squared Error:", rf_mse)

# Evaluate the DecisionTreeRegressor model
dt_xgb = train_and_evaluate_model(df, features, target, model_type='decision_tree', n_splits=4, group_column="city")
print("Decision Tree Mean Squared Error:", dt_xgb)

# Evaluate the XGB model
dt_mse = train_and_evaluate_model(df, features, target, model_type='xgb', n_splits=4, group_column="city")
print("XGB Mean Squared Error:", dt_mse)


Random Forest Mean Squared Error: 253605610.09772924
Decision Tree Mean Squared Error: 299417990.9122438
XGB Mean Squared Error: 253710753.28492486
