In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Define relative path
data_path = r"C:/Users/BALA/OneDrive - University of Hertfordshire/Desktop/mutual-fund-recommender/data/raw/Mutual_Funds.csv"

# Load data
df = pd.read_csv(data_path)
print("Loaded dataset with shape:", df.shape)
df.head()


Loaded dataset with shape: (29033646, 7)


Unnamed: 0,Fund_House,Scheme_Type,Scheme_Category,Scheme_Code,Scheme_Name,Date,NAV
0,Standard Chartered Mutual Fund,Open Ended Schemes,Income,100027,Grindlays Super Saver Income Fund-GSSIF-Half Y...,29-05-2008,10.7205
1,Standard Chartered Mutual Fund,Open Ended Schemes,Income,100027,Grindlays Super Saver Income Fund-GSSIF-Half Y...,28-05-2008,10.725
2,Standard Chartered Mutual Fund,Open Ended Schemes,Income,100027,Grindlays Super Saver Income Fund-GSSIF-Half Y...,27-05-2008,10.7216
3,Standard Chartered Mutual Fund,Open Ended Schemes,Income,100027,Grindlays Super Saver Income Fund-GSSIF-Half Y...,26-05-2008,10.7206
4,Standard Chartered Mutual Fund,Open Ended Schemes,Income,100027,Grindlays Super Saver Income Fund-GSSIF-Half Y...,23-05-2008,10.7152


In [4]:
# Convert 'Date' to datetime and drop invalid ones
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y', errors='coerce')
df = df.dropna(subset=['Date'])

# Sort by Scheme_Code and Date
df = df.sort_values(['Scheme_Code', 'Date']).reset_index(drop=True)


In [5]:
# Convert NAV to numeric
df['NAV'] = pd.to_numeric(df['NAV'], errors='coerce')
df = df.dropna(subset=['NAV'])

print("After cleaning, data shape:", df.shape)


After cleaning, data shape: (29033646, 7)


In [11]:
import numpy as np

# Initialize empty column for scaled NAV
df['NAV_scaled'] = np.nan

# Group by Scheme_Code and scale NAV using transform
def scale_nav(group):
    scaler = MinMaxScaler()
    group = group.copy()
    group['NAV_scaled'] = scaler.fit_transform(group[['NAV']])
    group['Scheme_Code'] = group['Scheme_Code'].iloc[0]  # Re-assign explicitly
    return group

# Keep Scheme_Code in the group (default behavior)
df_scaled = df.groupby('Scheme_Code', group_keys=False, observed=True).apply(
    scale_nav
).reset_index(drop=True)

print(" Scaling complete. Data shape:", df_scaled.shape)


  df_scaled = df.groupby('Scheme_Code', group_keys=False, observed=True).apply(


 Scaling complete. Data shape: (29033646, 8)


In [12]:
#Save the Preprocessed CSV
output_path = "C:/Users/BALA/OneDrive - University of Hertfordshire/Desktop/mutual-fund-recommender/data/processed/preprocessed_mutual_funds.csv"

df_scaled.to_csv(output_path, index=False)

print(f" Preprocessed data saved to: {output_path}")
df_scaled.head()


 Preprocessed data saved to: C:/Users/BALA/OneDrive - University of Hertfordshire/Desktop/mutual-fund-recommender/data/processed/preprocessed_mutual_funds.csv


Unnamed: 0,Fund_House,Scheme_Type,Scheme_Category,Scheme_Code,Scheme_Name,Date,NAV,NAV_scaled
0,Standard Chartered Mutual Fund,Open Ended Schemes,Income,100027,Grindlays Super Saver Income Fund-GSSIF-Half Y...,2006-04-03,10.0774,0.0
1,Standard Chartered Mutual Fund,Open Ended Schemes,Income,100027,Grindlays Super Saver Income Fund-GSSIF-Half Y...,2006-04-04,10.0796,0.002189
2,Standard Chartered Mutual Fund,Open Ended Schemes,Income,100027,Grindlays Super Saver Income Fund-GSSIF-Half Y...,2006-04-05,10.0836,0.00617
3,Standard Chartered Mutual Fund,Open Ended Schemes,Income,100027,Grindlays Super Saver Income Fund-GSSIF-Half Y...,2006-04-07,10.0856,0.00816
4,Standard Chartered Mutual Fund,Open Ended Schemes,Income,100027,Grindlays Super Saver Income Fund-GSSIF-Half Y...,2006-04-10,10.0906,0.013136


In [13]:
print("Unique Scheme_Codes:", df_scaled['Scheme_Code'].nunique())
print("Date Range:", df_scaled['Date'].min(), "to", df_scaled['Date'].max())


Unique Scheme_Codes: 35354
Date Range: 2006-04-01 00:00:00 to 2023-07-23 00:00:00
