In [3]:
# Govt Scheme Dataset Preprocessing & Training in Jupyter Notebook
#
# Loads the government-schemes CSV, imputes missing values, label-encodes the
# `sector` and `state` columns, trains a random-forest classifier that predicts
# `scheme_name`, and prints the hold-out accuracy plus one sample prediction.

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. Load the dataset
df = pd.read_csv("indian_government_schemes_dataset_updated.csv")
print("Dataset loaded. Shape:", df.shape)
# A bare `df.head()` is only rendered when it is the last expression of a
# cell, so print it explicitly to actually show the preview.
print(df.head())

# 2. Check for null/missing values
missing = df.isnull().sum()
print("\nMissing Values:\n", missing[missing > 0])

# 3. Fill missing values (optional)
# Impute per dtype. Writing the "Not Available" placeholder into numeric
# columns such as min_age/max_age would turn them into mixed-type columns
# that the classifier cannot consume, so numeric gaps get the column median
# and only non-numeric columns get the text placeholder.
numeric_cols = df.select_dtypes(include="number").columns
text_cols = df.select_dtypes(exclude="number").columns
fill_values = {col: "Not Available" for col in text_cols}
# dropna() skips columns that are entirely missing (their median is NaN).
fill_values.update(df[numeric_cols].median().dropna().to_dict())
df.fillna(value=fill_values, inplace=True)

# 4. Convert categorical values if needed (for ML models)
# Let's encode sector and state for demo. The fitted encoders are kept so the
# same label -> integer mapping can be reused at prediction time.
le_sector = LabelEncoder()
le_state = LabelEncoder()
df['sector_encoded'] = le_sector.fit_transform(df['sector'])
df['state_encoded'] = le_state.fit_transform(df['state'])

# 5. Example ML task (optional): predict the scheme name from the age range,
# state and sector.

# Define inputs and outputs (you can change target column as needed)
X = df[['min_age', 'max_age', 'state_encoded', 'sector_encoded']]
y = df['scheme_name']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("Model training complete.")

# 7. Evaluate model (mean accuracy on the held-out rows)
accuracy = model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")

# 8. Make sample prediction
sample_state = 'Bihar'
sample_sector = 'Education'
# The encoders only know labels present in the dataset; fail with a clear
# message instead of the encoder's generic "unseen labels" error.
for encoder, label, column in (
    (le_state, sample_state, 'state'),
    (le_sector, sample_sector, 'sector'),
):
    if label not in encoder.classes_:
        raise ValueError(f"{label!r} is not a known {column} in the dataset")

sample_input = pd.DataFrame({
    'min_age': [18],
    'max_age': [25],
    'state_encoded': [le_state.transform([sample_state])[0]],
    'sector_encoded': [le_sector.transform([sample_sector])[0]]
})

predicted_scheme = model.predict(sample_input)
print("Predicted scheme for sample input:", predicted_scheme[0])


Dataset loaded. Shape: (2000, 9)

Missing Values:
 Series([], dtype: int64)
Model training complete.
Model Accuracy: 0.24
Predicted scheme for sample input: AICTE Pragati Scholarship
