In [2]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

CSV_PATH = r"C:\Users\Dell\OneDrive\Desktop\infosys intern 6.0\education_policies.csv"
TRAIN_PATH = r"C:\Users\Dell\OneDrive\Desktop\infosys intern 6.0\train_policies.csv"
TEST_PATH = r"C:\Users\Dell\OneDrive\Desktop\infosys intern 6.0\test_policies.csv"


In [3]:
def generate_synthetic_policies(n=500):
    states = ["Karnataka","Maharashtra","Tamil Nadu","Uttar Pradesh","Delhi",
              "Kerala","West Bengal","Gujarat","Rajasthan","Punjab"]
    sectors = ["Primary","Secondary","Higher Education","Vocational","Early Childhood"]
    target_groups = ["Students","Teachers","Rural Students","Urban Students",
                     "Women","Disadvantaged Groups","All"]
    statuses = ["Proposed","Implemented","Under Review","Pilot"]
    years = list(range(2015, 2026))
    stakeholders_list = [
        "Ministry of Education, Local NGOs",
        "State Education Department, Private Partners",
        "Teachers' Unions, Community Leaders",
        "Central Government, Donors",
        "EdTech Companies, Universities"
    ]
    funding_ranges = [(0.5,5),(5,20),(20,100),(0.1,0.5)]
    aspects = ["learning outcomes","infrastructure","teacher quality",
               "digital access","early childhood development","vocational skills"]
    interventions = ["grants to schools","teacher training programs","digital device distribution",
                     "curriculum reform","scholarship schemes","public-private partnerships"]
    focuses = ["marginalized communities","gender equity","rural accessibility",
               "urban inclusion","STEM education","literacy and numeracy"]
    secondary_aspects = ["community participation","governance","assessment quality","safety standards"]

    records = []
    for i in range(1, n+1):
        policy_id = f"P{1000+i}"
        title = f"{random.choice(['National','State','District'])} {random.choice(sectors)} Education Reform {random.randint(1,99)}"
        sector = random.choice(sectors)
        region = random.choice(states)
        year = random.choice(years)
        target_group = random.choice(target_groups)
        status = random.choice(statuses)
        funding = round(random.uniform(*random.choice(funding_ranges)), 2)
        stakeholders = random.choice(stakeholders_list)
        impact_score = round(random.uniform(0.1, 0.99), 3)
        summary = f"This policy aims to improve {random.choice(aspects)} in {sector} through {random.choice(interventions)} in {region}."
        goals = f"Increase reach by {random.randint(5,40)}% in {random.randint(1,5)} years."
        full_text = f"{summary} Goals: {goals}"

        records.append({
            "policy_id": policy_id,
            "title": title,
            "sector": sector,
            "region": region,
            "year": year,
            "target_group": target_group,
            "status": status,
            "funding_million_usd": funding,
            "stakeholders": stakeholders,
            "impact_score": impact_score,
            "summary": summary,
            "goals": goals,
            "full_text": full_text
        })

    return pd.DataFrame(records)


In [4]:
def preprocess(df):
    df = df.copy()
    df["text_for_nlp"] = (
        df["title"].astype(str) + ". " +
        df["full_text"].astype(str) + ". Stakeholders: " +
        df["stakeholders"].astype(str)
    ).str.lower()
    return df


In [7]:
# Load existing dataset
df = pd.read_csv(r"C:\Users\Dell\OneDrive\Desktop\infosys intern 6.0\education_policies.csv")

# Preprocess & split
df = preprocess(df)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)

# Save split datasets
train_df.to_csv(TRAIN_PATH, index=False)
test_df.to_csv(TEST_PATH, index=False)

print(f"Data prepared: {len(train_df)} train, {len(test_df)} test.")


Data prepared: 400 train, 100 test.


In [8]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

MODEL_PATH = "policy_vectorizer.pkl"
MATRIX_PATH = "policy_tfidf_matrix.pkl"

# Load data
train_df = pd.read_csv(TRAIN_PATH)
full_df = pd.read_csv(CSV_PATH)

# Preprocess both
train_df = preprocess(train_df)
full_df = preprocess(full_df)

# Train TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
vectorizer.fit(train_df["text_for_nlp"])

# Transform full data
tfidf_matrix = vectorizer.transform(full_df["text_for_nlp"])

# Save model & matrix
joblib.dump(vectorizer, MODEL_PATH)
joblib.dump({"matrix": tfidf_matrix, "df": full_df}, MATRIX_PATH)

print(f"✅ Model trained and saved to {MODEL_PATH} and {MATRIX_PATH}")


✅ Model trained and saved to policy_vectorizer.pkl and policy_tfidf_matrix.pkl


In [9]:
import numpy as np

# Summary
num_docs, num_terms = tfidf_matrix.shape
sparsity = (tfidf_matrix.nnz / (num_docs * num_terms)) * 100
print(f"TF-IDF matrix shape: ({num_docs}, {num_terms}), Sparsity: {sparsity:.2f}%")

# Compute top 20 words/phrases by total TF-IDF weight
feature_names = np.array(vectorizer.get_feature_names_out())
tfidf_sums = np.array(tfidf_matrix.sum(axis=0)).flatten()
top_indices = tfidf_sums.argsort()[::-1][:20]

print("Top 20 words/phrases by TF-IDF importance:")
for term, score in zip(feature_names[top_indices], tfidf_sums[top_indices]):
    print(f"{term}: {score:.2f}")


TF-IDF matrix shape: (500, 1318), Sparsity: 7.12%
Top 20 words/phrases by TF-IDF importance:
and: 44.78
to: 40.11
education: 31.80
state: 22.28
reform: 21.28
includes: 21.27
learning: 21.09
improve: 21.03
on: 20.98
set: 20.71
strengthen: 20.62
monitoring: 19.62
community: 19.61
by: 18.69
goals: 18.55
stakeholders: 18.55
education reform: 18.55
vocational: 18.55
outcomes: 18.53
teachers: 18.34
