In [1]:
import pandas as pd
import numpy as np
import re
import unicodedata
from pathlib import Path

# ------------------------------------------------------------------
# 1. Paths
# ------------------------------------------------------------------
# NOTE(review): these are absolute, machine-specific locations; the cell only
# runs where this directory layout exists.
RAW_DIR = Path(r"D:\Darryl\Coding\s_p\data\raw")
PROCESSED_DIR = Path(r"D:\Darryl\Coding\s_p\data\processed")

# Input file and the single cleaned output written at the end of the cell.
src_file = RAW_DIR.joinpath("sebi_investment_advisors.csv")
out_clean_csv = PROCESSED_DIR.joinpath("sebi_investment_advisors_cleaned.csv")

# ------------------------------------------------------------------
# 2. Load CSV with correct header row
# ------------------------------------------------------------------
# The first two rows of the file are skipped so the next row is used as the header.
df = pd.read_csv(src_file, skiprows=2)

# Sanity check: show at most the first ten raw column names.
print("Columns loaded:", list(df.columns[:10]), "...")

# ------------------------------------------------------------------
# 3. Clean column names
# ------------------------------------------------------------------
def clean_col(c):
    """Convert a raw column header into a lowercase snake_case name.

    The value is stringified, NFKC-normalised, trimmed and lower-cased.
    Runs of whitespace, '/', '\\' and '-' become a single underscore, any
    remaining non-word character is removed, repeated underscores are
    collapsed, and leading/trailing underscores are stripped.
    """
    name = unicodedata.normalize("NFKC", str(c)).strip().lower()
    name = re.sub(r"[\s/\\\-]+", "_", name)   # separators -> "_"
    name = re.sub(r"[^\w_]", "", name)        # drop punctuation such as "." or "()"
    name = re.sub(r"_+", "_", name)           # squeeze underscore runs
    return name.strip("_")

# Rename every column in place with its cleaned, snake_case form.
df.columns = list(map(clean_col, df.columns))

# ------------------------------------------------------------------
# 4. Helper functions
# ------------------------------------------------------------------
def extract_sebi_reg(s):
    """Pull the first 'INA' + 5-or-more-digit registration number out of a value.

    The value is stringified, upper-cased and stripped of space characters
    before searching. Returns the matched text, or NaN when the input is
    missing or contains no such pattern.
    """
    if pd.isna(s):
        return np.nan
    compact = str(s).upper().replace(" ", "")
    found = re.search(r"(INA\d{5,})", compact)
    if found is None:
        return np.nan
    return found.group(1)

# Case-insensitive pattern for a single e-mail address (local@domain.tld).
EMAIL_RE = re.compile(r"[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}", re.I)
def split_emails(s):
    """Split a cell holding one or more e-mail addresses into a clean list.

    ';' and '/' are treated like ','. Each piece is trimmed, and only pieces
    that match EMAIL_RE in full are kept, lower-cased, in their original order.

    Returns an empty list for missing input. Non-string input (e.g. a number)
    is stringified first instead of raising AttributeError, mirroring how
    split_phones treats its argument.
    """
    if pd.isna(s):
        return []
    pieces = str(s).replace(";", ",").replace("/", ",").split(",")
    trimmed = (piece.strip() for piece in pieces)
    return [addr.lower() for addr in trimmed if EMAIL_RE.fullmatch(addr)]

def only_digits(s):
    """Return str(s) with every non-digit character removed."""
    text = str(s)
    return re.sub(r"\D", "", text)

def normalize_in_phone(num):
    """Normalise one phone number to a '+91'-prefixed digit string.

    Only the digits of the input are kept. A leading '91' is dropped from a
    12-digit number and a leading '0' from an 11-digit number. Whatever
    digits remain are returned with '+91' in front, regardless of length.
    NaN is returned when the input has no digits at all.
    """
    digits = only_digits(num)
    if len(digits) == 12 and digits.startswith("91"):
        digits = digits[2:]      # strip country code
    if len(digits) == 11 and digits.startswith("0"):
        digits = digits[1:]      # strip trunk prefix
    if not digits:
        return np.nan
    return "+91" + digits

def split_phones(s):
    """Split a telephone cell into a de-duplicated list of normalised numbers.

    ';', '/' and '|' are treated like ','. Every piece is passed through
    normalize_in_phone, and first-seen order is preserved when duplicates are
    dropped.

    Returns an empty list for missing input or when no piece has digits.
    """
    if pd.isna(s):
        return []
    pieces = re.sub(r"[;/|]", ",", str(s)).split(",")
    numbers = []
    for piece in pieces:
        norm = normalize_in_phone(piece)
        # normalize_in_phone returns NaN for digit-less pieces. NaN is a truthy
        # float, so a plain truthiness test would keep it and the later
        # ", ".join(...) over the list would raise TypeError. Keep strings only.
        if isinstance(norm, str):
            numbers.append(norm)
    return list(dict.fromkeys(numbers))

def parse_date(s):
    """Parse a single value as a date, preferring day-first interpretation.

    Delegates to pd.to_datetime with errors="coerce", so unparseable text
    yields NaT. Any ordinary exception raised by pandas is also mapped to NaT.
    """
    try:
        return pd.to_datetime(s, errors="coerce", dayfirst=True)
    except Exception:
        # Still best-effort, but unlike a bare `except:` this lets
        # KeyboardInterrupt / SystemExit propagate so the kernel can be stopped.
        return pd.NaT

def extract_pincode(s):
    """Return the first standalone 6-digit run found in str(s), else NaN.

    "Standalone" means delimited by word boundaries, so a 6-digit slice of a
    longer digit run does not count. Missing input gives NaN.
    """
    if pd.isna(s):
        return np.nan
    hit = re.search(r"\b\d{6}\b", str(s))
    if hit:
        return hit.group(0)
    return np.nan

# ------------------------------------------------------------------
# 5. Apply cleaners
# ------------------------------------------------------------------
# Work on a copy so the loaded frame `df` stays untouched.
clean = df.copy()

# Each derived column is only added when its source column is present.
if "registration_no" in clean.columns:
    clean["sebi_reg_no_norm"] = clean["registration_no"].map(extract_sebi_reg)

if "email_id" in clean.columns:
    emails = clean["email_id"].map(split_emails)
    # First valid address, and all valid addresses joined into one string.
    clean["email_primary"] = emails.map(lambda x: x[0] if x else np.nan)
    clean["emails_all"] = emails.map(lambda x: ", ".join(x) if x else np.nan)

if "telephone" in clean.columns:
    phones = clean["telephone"].map(split_phones)
    clean["phone_primary"] = phones.map(lambda x: x[0] if x else np.nan)
    clean["phones_all"] = phones.map(lambda x: ", ".join(x) if x else np.nan)

if "from" in clean.columns:
    clean["reg_date"] = clean["from"].map(parse_date)

if "to" in clean.columns:
    clean["expiry_date"] = clean["to"].map(parse_date)
    # Subtract midnight of today rather than "now": with a time-of-day
    # component the whole-day count would be one lower for any run after
    # midnight (e.g. an expiry dated today would come out as -1).
    clean["days_to_expiry"] = (
        clean["expiry_date"] - pd.Timestamp.today().normalize()
    ).dt.days

# Prefer an explicit pincode column; otherwise try to find one in the address.
if "pincode" in clean.columns:
    clean["pincode_norm"] = clean["pincode"].map(extract_pincode)
elif "address" in clean.columns:
    clean["pincode_norm"] = clean["address"].map(extract_pincode)

# ------------------------------------------------------------------
# 6. Fix pincode columns
# ------------------------------------------------------------------
# Drop the raw pincode column (if any) and let the normalised one take its name.
# Both steps are no-ops when the respective column is absent.
clean = (
    clean
    .drop(columns=["pincode"], errors="ignore")
    .rename(columns={"pincode_norm": "pincode"})
)

# ------------------------------------------------------------------
# 7. Save one cleaned file
# ------------------------------------------------------------------
# Create the output folder if needed, then write the cleaned frame without the index.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
clean.to_csv(out_clean_csv, index=False)
print(f"\nSaved cleaned CSV → {out_clean_csv}")

# ------------------------------------------------------------------
# 8. Preview
# ------------------------------------------------------------------
# Bare last expression instead of print(): the notebook renders the frame as a
# table rather than wrapped plain text, matching the preview in the
# research-analysts cell.
clean.head(10)


Columns loaded: ['Name', 'Registration No.', 'Contact Person', 'Address', 'Email-Id', 'Telephone', 'Fax', 'City', 'State', 'Pincode'] ...

Saved cleaned CSV → D:\Darryl\Coding\s_p\data\processed\sebi_investment_advisors_cleaned.csv
                                          name registration_no  \
0                                KAVITHA MENON    INA000000037   
1                     PRAKASH CHANDRA PRAHARAJ    INA000000045   
2           VALUEFY SOLUTIONS  PRIVATE LIMITED    INA000000060   
3                     ICICI SECURITIES LIMITED    INA000000094   
4               ASTEYA INVESTMENT MANAGERS LLP    INA000000276   
5   SPT INVESTMENT ADVISORY SERVICES PVT. LTD.    INA000000326   
6          V R WEALTH ADVISORS PRIVATE LIMITED    INA000000383   
7  BARCLAYS SECURITIES (INDIA) PRIVATE LIMITED    INA000000391   
8   PLAN AHEAD WEALTH ADVISORS PRIVATE LIMITED    INA000000409   
9                             SANDIP SABHARWAL    INA000000425   

                           contact_person

In [2]:
import pandas as pd
import numpy as np
import re
import unicodedata
from pathlib import Path

# ------------------------------------------------------------------
# 1. Paths
# ------------------------------------------------------------------
# NOTE(review): absolute, machine-specific locations; the cell only runs where
# this directory layout exists.
RAW_DIR = Path(r"D:\Darryl\Coding\s_p\data\raw")
PROCESSED_DIR = Path(r"D:\Darryl\Coding\s_p\data\processed")

# Raw research-analyst export and the cleaned file produced from it.
src_file = RAW_DIR.joinpath("sebi_research_analysts.csv")
out_clean_csv = PROCESSED_DIR.joinpath("sebi_research_analysts_cleaned.csv")

# ------------------------------------------------------------------
# 2. Load CSV with correct header row
# ------------------------------------------------------------------
# Skip the first two rows so the following row becomes the header
# (assumes the same layout as the investment-advisors file; verify).
df = pd.read_csv(src_file, skiprows=2)

# Sanity check: show at most the first ten raw column names.
print("Columns loaded:", list(df.columns[:10]), "...")

# ------------------------------------------------------------------
# 3. Clean column names
# ------------------------------------------------------------------
def clean_col(c):
    """Normalise a column header to a lowercase snake_case identifier.

    After NFKC normalisation, trimming and lower-casing, three substitutions
    are applied in order: separator runs (whitespace, '/', '\\', '-') become
    '_', remaining non-word characters are deleted, and underscore runs are
    squeezed. Leading and trailing underscores are stripped from the result.
    """
    label = unicodedata.normalize("NFKC", str(c)).strip().lower()
    substitutions = (
        (r"[\s/\\\-]+", "_"),
        (r"[^\w_]", ""),
        (r"_+", "_"),
    )
    for pattern, replacement in substitutions:
        label = re.sub(pattern, replacement, label)
    return label.strip("_")

# Rename every column in place with its cleaned, snake_case form.
df.columns = list(map(clean_col, df.columns))

# ------------------------------------------------------------------
# 4. Helper functions
# ------------------------------------------------------------------
def extract_sebi_reg(s):
    """Extract the first 'IN' + letter + 5-or-more-digit registration number.

    The value is stringified, upper-cased and has its space characters removed
    before the search. Returns the matched text, or NaN for missing input or
    when nothing matches.
    """
    if pd.isna(s):
        return np.nan
    squeezed = str(s).upper().replace(" ", "")
    hit = re.search(r"(IN[A-Z]\d{5,})", squeezed)
    return np.nan if hit is None else hit.group(1)

# Case-insensitive pattern for a single e-mail address (local@domain.tld).
EMAIL_RE = re.compile(r"[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}", re.I)
def split_emails(s):
    """Return the valid e-mail addresses contained in one cell, lower-cased.

    ';' and '/' are accepted as alternatives to ','. Pieces are trimmed and
    kept only when they match EMAIL_RE in full; input order is preserved.

    Missing input gives an empty list. Non-string input is stringified first
    (as split_phones does) rather than failing on the missing `.replace`.
    """
    if pd.isna(s):
        return []
    text = str(s)
    for separator in (";", "/"):
        text = text.replace(separator, ",")
    result = []
    for chunk in text.split(","):
        candidate = chunk.strip()
        if EMAIL_RE.fullmatch(candidate):
            result.append(candidate.lower())
    return result

def only_digits(s):
    """Stringify the input and strip out everything that is not a digit."""
    as_text = str(s)
    return re.sub(r"\D", "", as_text)

# normalize indian phones

def normalize_in_phone(num):
    """Reduce a phone value to digits and return it with a '+91' prefix.

    A 12-digit number starting with '91' loses that prefix, and an 11-digit
    number starting with '0' loses the zero. Any non-empty remainder is
    returned as '+91' + digits (no length validation). An input without
    digits yields NaN.
    """
    digits = only_digits(num)
    if digits.startswith("91") and len(digits) == 12:
        digits = digits[2:]
    if digits.startswith("0") and len(digits) == 11:
        digits = digits[1:]
    if digits:
        return "+91" + digits
    return np.nan

def split_phones(s):
    """Turn a telephone cell into an ordered, duplicate-free list of numbers.

    ';', '/' and '|' count as separators alongside ','. Each piece goes
    through normalize_in_phone, and duplicates are removed while keeping
    first-seen order.

    Missing input, or input whose pieces contain no digits, gives [].
    """
    if pd.isna(s):
        return []
    pieces = re.sub(r"[;/|]", ",", str(s)).split(",")
    normalised = (normalize_in_phone(piece) for piece in pieces)
    # Digit-less pieces come back as NaN, which is truthy; filtering on type
    # keeps NaN out of the list so ", ".join(...) downstream cannot hit a float.
    numbers = [value for value in normalised if isinstance(value, str)]
    return list(dict.fromkeys(numbers))

def parse_date(s):
    """Best-effort, day-first date parsing of a single value.

    Uses pd.to_datetime with errors="coerce", so text that is not a date
    (e.g. "Perpetual") becomes NaT. Ordinary exceptions from pandas are
    likewise turned into NaT.
    """
    try:
        return pd.to_datetime(s, errors="coerce", dayfirst=True)
    except Exception:
        # Narrower than a bare `except:` so KeyboardInterrupt / SystemExit
        # are not swallowed while mapping over a long column.
        return pd.NaT

def extract_pincode(s):
    """Find the first word-bounded run of exactly six digits in str(s).

    Returns the six digits as a string, or NaN when the input is missing or
    holds no such run. Because the value is stringified first, a float like
    400097.0 yields "400097".
    """
    if pd.isna(s):
        return np.nan
    found = re.search(r"\b\d{6}\b", str(s))
    return np.nan if not found else found.group(0)

# ------------------------------------------------------------------
# 5. Apply cleaners
# ------------------------------------------------------------------
# Work on a copy so the loaded frame `df` stays untouched.
clean = df.copy()

# Each derived column is only added when its source column is present.
if "registration_no" in clean.columns:
    clean["sebi_reg_no_norm"] = clean["registration_no"].map(extract_sebi_reg)

if "email_id" in clean.columns:
    emails = clean["email_id"].map(split_emails)
    # First valid address, and all valid addresses joined into one string.
    clean["email_primary"] = emails.map(lambda x: x[0] if x else np.nan)
    clean["emails_all"] = emails.map(lambda x: ", ".join(x) if x else np.nan)

if "telephone" in clean.columns:
    phones = clean["telephone"].map(split_phones)
    clean["phone_primary"] = phones.map(lambda x: x[0] if x else np.nan)
    clean["phones_all"] = phones.map(lambda x: ", ".join(x) if x else np.nan)

if "from" in clean.columns:
    clean["reg_date"] = clean["from"].map(parse_date)

if "to" in clean.columns:
    clean["expiry_date"] = clean["to"].map(parse_date)
    # Subtract midnight of today rather than "now": with a time-of-day
    # component the whole-day count would be one lower for any run after
    # midnight (e.g. an expiry dated today would come out as -1).
    clean["days_to_expiry"] = (
        clean["expiry_date"] - pd.Timestamp.today().normalize()
    ).dt.days

# fix pincode → no .0
# Overwrite the pincode column with the extracted 6-digit string. When there
# is no pincode column, fall back to the address, but only if it exists,
# so a file with neither column no longer raises KeyError.
if "pincode" in clean.columns:
    clean["pincode"] = clean["pincode"].map(extract_pincode)
elif "address" in clean.columns:
    clean["pincode"] = clean["address"].map(extract_pincode)

# ------------------------------------------------------------------
# 6. Save one cleaned file
# ------------------------------------------------------------------
# Ensure the output folder exists, then write the cleaned frame without the index.
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)
clean.to_csv(path_or_buf=out_clean_csv, index=False)
print(f"\nSaved cleaned CSV → {out_clean_csv}")

# ------------------------------------------------------------------
# 7. Preview
# ------------------------------------------------------------------
# Last expression of the cell: the notebook displays the first five rows.
clean.head()


Columns loaded: ['Name', 'Registration No.', 'Contact Person', 'Address', 'Email-Id', 'Telephone', 'Fax', 'City', 'State', 'Pincode'] ...

Saved cleaned CSV → D:\Darryl\Coding\s_p\data\processed\sebi_research_analysts_cleaned.csv


Unnamed: 0,name,registration_no,contact_person,address,email_id,telephone,fax,city,state,pincode,...,to,country,sebi_reg_no_norm,email_primary,emails_all,phone_primary,phones_all,reg_date,expiry_date,days_to_expiry
0,STAKEHOLDERS EMPOWERMENT SERVICES,INH000000016,MR. DEVENDRA BHANDARI,"A202, MUKTANGAN COMPLEX,, UPPER GOVIND NAGAR, ...",devendra.bhandari@sesgovernance.com,2240220322,2240220322,MUMBAI,MAHARASHTRA,400097,...,Perpetual,,INH000000016,devendra.bhandari@sesgovernance.com,devendra.bhandari@sesgovernance.com,912240220322,912240220322,2015-01-20,NaT,
1,INSTITUTIONAL INVESTOR ADVISORY SERVICES INDIA...,INH000000024,AMIT TANDON,"GROUND FLOOR, DGP HOUSE, 88-C, OLD PRABHADEVI ...",amit.tandon@iias.in,2222721575,2222721575,MUMBAI,MAHARASHTRA,400025,...,Perpetual,,INH000000024,amit.tandon@iias.in,amit.tandon@iias.in,912222721575,912222721575,2015-01-30,NaT,
2,DEEPAK KUMAR KANODIA,INH000000032,DEEPAK KUMAR KANODIA,"Flat no. 201, E wing, Sai Mannat, Sector-34, K...",deepakkrkanodia@gmail.com,022 28392567,022 28392567,NAVI MUMBAI,MAHARASHTRA,400059,...,Perpetual,,INH000000032,deepakkrkanodia@gmail.com,deepakkrkanodia@gmail.com,912228392567,912228392567,2015-02-04,NaT,
3,BOB CAPITAL MARKETS LIMITED,INH000000040,MR. GOLAK B PANDA,"3RD FLOOR, UTI TOWER, BANDRA-KURLA COMPLEX, BA...",golak.panda@bobcaps.in,2266718535,2266718535,MUMBAI,MAHARASHTRA,400051,...,Perpetual,,INH000000040,golak.panda@bobcaps.in,golak.panda@bobcaps.in,912266718535,912266718535,2015-02-04,NaT,
4,GEPL CAPITAL PRIVATE LIMITED,INH000000081,MR. SUBHASH SHARMA,"D-21, DHANRAJ MAHAL,, C.S.M. MARG, COLABA,",subhash@geplcapital.com,2266182445,2266182445,MUMBAI,MAHARASHTRA,400001,...,Perpetual,,INH000000081,subhash@geplcapital.com,subhash@geplcapital.com,912266182445,912266182445,2015-02-10,NaT,
