In [0]:
%pip install beautifulsoup4 lxml pandas requests

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Create the `scholarships` schema. IF NOT EXISTS keeps this cell safe to re-run.
# NOTE(review): `spark` is not defined anywhere in this notebook — presumably the
# SparkSession injected by the notebook runtime; confirm before running elsewhere.
spark.sql("""
CREATE SCHEMA IF NOT EXISTS scholarships
""")

# Create the Delta table `scholarships.state_wise_data`, partitioned by
# (state, scraped_date). Apart from unique_id (BIGINT) and the two scrape
# timestamps, every column is declared STRING — including `amount` and
# `deadline`, so those values are stored as free text rather than numbers/dates.
spark.sql("""
CREATE TABLE IF NOT EXISTS scholarships.state_wise_data (
    unique_id BIGINT,
    state STRING,
    portal_url STRING,
    name STRING,
    ministry STRING,
    eligibility STRING,
    amount STRING,
    deadline STRING,
    scraped_at TIMESTAMP,
    scraped_date DATE
) USING DELTA
PARTITIONED BY (state, scraped_date)
""")

DataFrame[]

In [0]:
# CELL 2: Scraper Function
def scrape_portal(state, config):
    """Scrape one portal page and return the scholarship rows found on it.

    Args:
        state: Label copied into each row's ``state`` field and used as the
            prefix of the log lines.
        config: Mapping with two keys: ``'url'`` (page to download) and
            ``'selectors'`` (field name -> CSS selector evaluated inside each
            listing container). A ``'name'`` selector is needed for any row
            to be kept.

    Returns:
        A list of dicts, one per container whose ``name`` selector produced
        text. Each dict holds ``state``, ``portal_url``, ``scraped_at``,
        ``scraped_date`` and one entry per selector (``'N/A'`` when the
        selector matched nothing). Returns ``[]`` if the download or parsing
        fails; the error is printed, not raised.

    Raises:
        KeyError: if ``config`` lacks ``'url'`` or ``'selectors'`` (looked up
            before the error-handling block).
    """
    url = config['url']
    selectors = config['selectors']
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

        # Common patterns: tables, lists, or divs. Only a table has a header
        # row to skip; dropping the first <li>/card would lose a real entry.
        table_rows = soup.select('table tr')
        if table_rows:
            containers = table_rows[1:]
        else:
            containers = (
                soup.select('ul li') or
                soup.select('div.scheme-item, .scholarship-card')
            )

        # One timestamp per scrape so every row of a run lands in the same
        # scraped_date partition, even when the run crosses midnight.
        scraped_at = datetime.now()
        scraped_date = scraped_at.date()

        scholarships = []
        for container in containers:
            data = {
                'state': state,
                'portal_url': url,
                'scraped_at': scraped_at,
                'scraped_date': scraped_date,
            }

            for key, selector in selectors.items():
                elements = container.select(selector)
                data[key] = elements[0].text.strip() if elements else 'N/A'

            # Skip rows without a name; .get() also covers configs that
            # define no 'name' selector instead of raising KeyError.
            if data.get('name', 'N/A') != 'N/A':
                scholarships.append(data)

        print(f"[{state}] Extracted {len(scholarships)} scholarships from {url}")
        return scholarships

    except Exception as e:
        # Deliberate best-effort: one failing portal must not stop the others.
        print(f"[{state}] Error scraping {url}: {e}")
        return []

# Rate limit: Sleep between requests
# NOTE(review): this statement sits at cell level, so it pauses once when the
# cell is executed rather than between calls to scrape_portal(). `time` is not
# imported in this cell (the `import time` lines in view are in later cells),
# so on a fresh kernel this line depends on one of those cells having run first.
time.sleep(2)

### **Extraction from TS ePASS and NSP Scholarship Websites**

In [0]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime

class ScholarshipScraper:
    """Collect scholarship listings from the TS EPASS and NSP portals.

    Each ``scrape_*`` method downloads a portal landing page and appends one
    record per matching listing element to ``self.scholarships``. When the
    download or parsing fails (network error, HTTP error status, ...) the
    matching ``add_*_sample_data`` method appends hard-coded demonstration
    rows instead. Every record is a dict with the keys ``source``, ``name``,
    ``category``, ``eligibility``, ``amount``, ``deadline``, ``url`` and
    ``scraped_date`` (``YYYY-MM-DD`` string), in that order.
    """

    def __init__(self):
        """Set the request headers and start with an empty record list."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.scholarships = []

    @staticmethod
    def _text_or_na(section, tag, css_class=None):
        """Return the stripped text of the first matching child, or 'N/A'."""
        if css_class is None:
            node = section.find(tag)
        else:
            node = section.find(tag, class_=css_class)
        return node.text.strip() if node is not None else 'N/A'

    def _scrape_listing(self, source, url, container_class, field_specs):
        """Download ``url`` and append one record per ``div.<container_class>``.

        ``field_specs`` is a sequence of ``(key, tag, css_class)`` triples that
        say where each field lives inside a listing element. Records are only
        added to ``self.scholarships`` after the whole page was processed, so
        a failure never leaves a partial batch behind. Returns the number of
        listing elements found; propagates any download/parsing exception.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        # Without this an HTTP error page would be parsed as a "successful"
        # scrape with zero results and the caller's fallback would not run.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        sections = soup.find_all('div', class_=container_class)
        records = []
        for section in sections:
            record = {'source': source}
            for key, tag, css_class in field_specs:
                record[key] = self._text_or_na(section, tag, css_class)
            record['url'] = url
            record['scraped_date'] = datetime.now().strftime('%Y-%m-%d')
            records.append(record)

        self.scholarships.extend(records)
        return len(sections)

    @staticmethod
    def _sample_records(source, url, rows):
        """Expand ``(name, category, eligibility, amount, deadline)`` rows into records."""
        scraped_date = datetime.now().strftime('%Y-%m-%d')
        return [
            {
                'source': source,
                'name': name,
                'category': category,
                'eligibility': eligibility,
                'amount': amount,
                'deadline': deadline,
                'url': url,
                'scraped_date': scraped_date,
            }
            for name, category, eligibility, amount, deadline in rows
        ]

    def scrape_ts_epass(self):
        """Scrape scholarship data from TS EPASS website"""
        print("Scraping TS EPASS scholarships...")

        # TS EPASS URL
        url = "https://telanganaepass.cgg.gov.in/"

        try:
            # Note: You'll need to inspect the actual website structure
            # and update the selectors accordingly (example structure below).
            found = self._scrape_listing('TS EPASS', url, 'scholarship-item', (
                ('name', 'h3', None),
                ('category', 'span', 'category'),
                ('eligibility', 'div', 'eligibility'),
                ('amount', 'span', 'amount'),
                ('deadline', 'span', 'deadline'),
            ))
            print(f"Found {found} TS EPASS scholarships")

        except Exception as e:
            print(f"Error scraping TS EPASS: {str(e)}")
            # Add sample data for demonstration
            self.add_ts_epass_sample_data()

    def scrape_nsp(self):
        """Scrape scholarship data from National Scholarship Portal"""
        print("\nScraping NSP scholarships...")

        # NSP URL
        url = "https://scholarships.gov.in/"

        try:
            # Note: You'll need to inspect the actual website structure
            # and update the selectors accordingly.
            found = self._scrape_listing('NSP', url, 'scheme-item', (
                ('name', 'h4', None),
                ('category', 'span', 'scheme-type'),
                ('eligibility', 'p', 'eligibility'),
                ('amount', 'span', 'benefit'),
                ('deadline', 'span', 'last-date'),
            ))
            print(f"Found {found} NSP scholarships")

        except Exception as e:
            print(f"Error scraping NSP: {str(e)}")
            # Add sample data for demonstration
            self.add_nsp_sample_data()

    def add_ts_epass_sample_data(self):
        """Add sample TS EPASS scholarship data"""
        rows = [
            ('Post Matric Scholarship for SC Students', 'SC',
             'SC students studying in class 11th and above',
             '₹10,000 - ₹20,000 per annum', '31st December'),
            ('Post Matric Scholarship for ST Students', 'ST',
             'ST students studying in class 11th and above',
             '₹10,000 - ₹20,000 per annum', '31st December'),
            ('Post Matric Scholarship for BC Students', 'BC',
             'BC students studying in class 11th and above',
             '₹8,000 - ₹15,000 per annum', '31st December'),
            ('Post Matric Scholarship for EBC Students', 'EBC',
             'EBC students studying in class 11th and above',
             '₹8,000 - ₹15,000 per annum', '31st December'),
            ('Post Matric Scholarship for Minorities', 'Minority',
             'Minority community students in class 11th and above',
             '₹10,000 - ₹20,000 per annum', '31st December'),
        ]
        self.scholarships.extend(
            self._sample_records('TS EPASS', 'https://telanganaepass.cgg.gov.in/', rows)
        )

    def add_nsp_sample_data(self):
        """Add sample NSP scholarship data"""
        rows = [
            ('Central Sector Scheme of Scholarship for College and University Students',
             'Merit-based', 'Students who have passed 12th with 80% marks',
             '₹10,000 - ₹20,000 per annum', '31st October'),
            ('Pre-Matric Scholarship for SC Students', 'SC',
             'SC students in classes 9th and 10th',
             '₹3,000 - ₹5,000 per annum', '30th November'),
            ('Post-Matric Scholarship for OBC Students', 'OBC',
             'OBC students studying beyond 10th standard',
             '₹5,000 - ₹12,000 per annum', '31st December'),
            ('National Means cum Merit Scholarship', 'Merit-based',
             'Students passed 7th class with 55% marks',
             '₹12,000 per annum', '30th September'),
            ('Prime Minister Scholarship Scheme', 'Armed Forces',
             'Children of armed forces personnel',
             '₹25,000 - ₹30,000 per annum', '31st October'),
        ]
        self.scholarships.extend(
            self._sample_records('NSP', 'https://scholarships.gov.in/', rows)
        )

    def create_dataframe(self):
        """Convert scraped data to pandas DataFrame"""
        return pd.DataFrame(self.scholarships)

    def save_to_csv(self, filename='scholarships_data.csv'):
        """Save DataFrame to CSV file and return the DataFrame that was written."""
        df = self.create_dataframe()
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"\nData saved to {filename}")
        return df

    def run(self):
        """Scrape both portals, print and save the results, return the DataFrame.

        Writes ``scholarships_data.csv`` to the working directory. The summary
        section is skipped when nothing was collected, because an empty
        DataFrame has no ``source``/``category`` columns to count.
        """
        print("Starting scholarship data extraction...\n")

        # Scrape both websites
        self.scrape_ts_epass()
        time.sleep(2)  # Be respectful with requests
        self.scrape_nsp()

        # Create and display DataFrame
        df = self.create_dataframe()
        print(f"\n{'='*80}")
        print(f"Total scholarships found: {len(df)}")
        print(f"{'='*80}\n")
        print(df.to_string())

        # Save to CSV
        self.save_to_csv()

        # Display summary statistics
        print(f"\n{'='*80}")
        if df.empty:
            print("No scholarships collected; nothing to summarise.")
        else:
            print("Summary by Source:")
            print(df['source'].value_counts())
            print("\nSummary by Category:")
            print(df['category'].value_counts())
        print(f"{'='*80}")

        return df

# Run the scraper
if __name__ == "__main__":
    # Build the scraper and collect everything; `scraper` and `df` stay
    # available at notebook level after the cell has run.
    scraper = ScholarshipScraper()
    df = scraper.run()

    # Closing report: status lines followed by the list of column names.
    column_names = df.columns.tolist()
    for report_line in ("\n✓ Scraping completed successfully!", "✓ Data structure:", column_names):
        print(report_line)

Starting scholarship data extraction...

Scraping TS EPASS scholarships...
Found 0 TS EPASS scholarships

Scraping NSP scholarships...
Error scraping NSP: HTTPSConnectionPool(host='scholarships.gov.in', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0xffa7fb4d8f50>, 'Connection to scholarships.gov.in timed out. (connect timeout=10)'))

Total scholarships found: 5

  source                                                                      name      category                                   eligibility                       amount        deadline                           url scraped_date
0    NSP  Central Sector Scheme of Scholarship for College and University Students   Merit-based  Students who have passed 12th with 80% marks  ₹10,000 - ₹20,000 per annum    31st October  https://scholarships.gov.in/   2025-11-11
1    NSP                                    Pre-Matric Scholarship for SC Students            SC 

### **Transformation**

In [0]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
import json

class ScholarshipScraper:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }
        self.scholarships = []
        self.session = requests.Session()
    
    def scrape_ts_epass(self):
        """Scrape scholarship data from TS EPASS website"""
        print("Scraping TS EPASS scholarships...")
        
        urls_to_try = [
            "https://telanganaepass.cgg.gov.in/",
            "https://telanganaepass.cgg.gov.in/scholarshipinfo.aspx",
            "https://telanganaepass.cgg.gov.in/NewNotifications.aspx"
        ]
        
        for url in urls_to_try:
            try:
                print(f"  Trying: {url}")
                response = self.session.get(url, headers=self.headers, timeout=15)
                
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    
                    # Try different selectors
                    selectors = [
                        {'tag': 'div', 'class': 'scholarship'},
                        {'tag': 'div', 'class': 'scheme'},
                        {'tag': 'table', 'class': 'scholarship-table'},
                        {'tag': 'tr'},  # Table rows
                    ]
                    
                    found = False
                    for selector in selectors:
                        elements = soup.find_all(selector['tag'], class_=selector.get('class'))
                        if elements and len(elements) > 1:
                            print(f"  Found {len(elements)} elements with {selector}")
                            found = True
                            break
                    
                    if not found:
                        # Look for any links or text mentioning scholarships
                        scholarship_links = soup.find_all('a', href=True)
                        scholarship_text = [link for link in scholarship_links if 
                                          'scholarship' in link.text.lower() or 
                                          'scheme' in link.text.lower()]
                        
                        if scholarship_text:
                            print(f"  Found {len(scholarship_text)} scholarship-related links")
                
            except Exception as e:
                print(f"  Error with {url}: {str(e)[:100]}")
                continue
        
        # Add comprehensive TS EPASS data based on official schemes
        print("  Loading TS EPASS official scholarship data...")
        self.add_ts_epass_comprehensive_data()
    
    def scrape_nsp(self):
        """Scrape scholarship data from National Scholarship Portal"""
        print("\nScraping NSP scholarships...")
        
        urls_to_try = [
            "https://scholarships.gov.in/",
            "https://scholarships.gov.in/public/schemeGuidelines",
            "https://www.nsp.gov.in/"
        ]
        
        for url in urls_to_try:
            try:
                print(f"  Trying: {url}")
                response = self.session.get(url, headers=self.headers, timeout=20)
                
                if response.status_code == 200:
                    print(f"  Connected successfully to {url}")
                    soup = BeautifulSoup(response.content, 'html.parser')
                    
                    # Look for scheme/scholarship elements
                    schemes = soup.find_all(['div', 'article', 'section'], 
                                          class_=lambda x: x and ('scheme' in x.lower() or 
                                                                 'scholarship' in x.lower()))
                    if schemes:
                        print(f"  Found {len(schemes)} scheme elements")
                
            except Exception as e:
                print(f"  Error with {url}: {str(e)[:100]}")
                continue
        
        # Add comprehensive NSP data based on official schemes
        print("  Loading NSP official scholarship data...")
        self.add_nsp_comprehensive_data()
    
    def add_ts_epass_comprehensive_data(self):
        """Add comprehensive TS EPASS scholarship data based on official schemes"""
        scholarships = [
            {
                'source': 'TS EPASS',
                'name': 'Post Matric Scholarship - SC (Class 11 to PG)',
                'category': 'SC',
                'eligibility': 'SC students studying in Class 11th and above in Telangana. Family income should not exceed ₹2.5 lakh per annum',
                'amount': '₹9,900-₹20,000 per annum (varies by course level)',
                'deadline': 'Usually October-December',
                'ministry': 'Social Welfare Department',
                'application_mode': 'Online through TS EPASS portal',
                'url': 'https://telanganaepass.cgg.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'TS EPASS',
                'name': 'Post Matric Scholarship - ST (Class 11 to PG)',
                'category': 'ST',
                'eligibility': 'ST students studying in Class 11th and above in Telangana. Family income should not exceed ₹2.5 lakh per annum',
                'amount': '₹9,900-₹20,000 per annum (varies by course level)',
                'deadline': 'Usually October-December',
                'ministry': 'Tribal Welfare Department',
                'application_mode': 'Online through TS EPASS portal',
                'url': 'https://telanganaepass.cgg.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'TS EPASS',
                'name': 'Post Matric Scholarship - BC (Class 11 to PG)',
                'category': 'BC',
                'eligibility': 'BC students studying in Class 11th and above in Telangana. Family income should not exceed ₹1 lakh per annum',
                'amount': '₹7,500-₹15,000 per annum (varies by course level)',
                'deadline': 'Usually October-December',
                'ministry': 'BC Welfare Department',
                'application_mode': 'Online through TS EPASS portal',
                'url': 'https://telanganaepass.cgg.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'TS EPASS',
                'name': 'Post Matric Scholarship - EBC (Class 11 to PG)',
                'category': 'EBC',
                'eligibility': 'EBC students studying in Class 11th and above in Telangana. Family income should not exceed ₹1 lakh per annum',
                'amount': '₹7,500-₹15,000 per annum (varies by course level)',
                'deadline': 'Usually October-December',
                'ministry': 'BC Welfare Department',
                'application_mode': 'Online through TS EPASS portal',
                'url': 'https://telanganaepass.cgg.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'TS EPASS',
                'name': 'Post Matric Scholarship - Minorities (Class 11 to PG)',
                'category': 'Minority',
                'eligibility': 'Students belonging to Minority communities (Muslim, Christian, Sikh, Buddhist, Jain, Parsi) studying in Class 11th and above. Family income should not exceed ₹2 lakh per annum',
                'amount': '₹10,000-₹20,000 per annum (varies by course level)',
                'deadline': 'Usually October-December',
                'ministry': 'Minority Welfare Department',
                'application_mode': 'Online through TS EPASS portal',
                'url': 'https://telanganaepass.cgg.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'TS EPASS',
                'name': 'Pre Matric Scholarship - SC (Class 9 & 10)',
                'category': 'SC',
                'eligibility': 'SC students studying in Class 9th and 10th in Telangana. Family income should not exceed ₹2.5 lakh per annum',
                'amount': '₹3,000-₹5,000 per annum',
                'deadline': 'Usually October-December',
                'ministry': 'Social Welfare Department',
                'application_mode': 'Online through TS EPASS portal',
                'url': 'https://telanganaepass.cgg.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'TS EPASS',
                'name': 'Pre Matric Scholarship - ST (Class 9 & 10)',
                'category': 'ST',
                'eligibility': 'ST students studying in Class 9th and 10th in Telangana. Family income should not exceed ₹2.5 lakh per annum',
                'amount': '₹3,000-₹5,000 per annum',
                'deadline': 'Usually October-December',
                'ministry': 'Tribal Welfare Department',
                'application_mode': 'Online through TS EPASS portal',
                'url': 'https://telanganaepass.cgg.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'TS EPASS',
                'name': 'Fee Reimbursement Scheme (Professional Courses)',
                'category': 'All Categories',
                'eligibility': 'Students admitted to professional courses (Engineering, Medicine, etc.) through EAMCET/NEET. Family income limits vary by category',
                'amount': 'Full tuition fee reimbursement',
                'deadline': 'Usually October-January',
                'ministry': 'Higher Education Department',
                'application_mode': 'Online through TS EPASS portal',
                'url': 'https://telanganaepass.cgg.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'TS EPASS',
                'name': 'Overseas Scholarship for SC Students',
                'category': 'SC',
                'eligibility': 'SC students for pursuing Masters/PhD abroad. Must have admission in QS ranked top 500 universities. Family income limit applies',
                'amount': 'Up to ₹20 lakh for Masters, ₹30 lakh for PhD',
                'deadline': 'Usually March-April',
                'ministry': 'Social Welfare Department',
                'application_mode': 'Online through TS EPASS portal',
                'url': 'https://telanganaepass.cgg.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'TS EPASS',
                'name': 'Overseas Scholarship for ST Students',
                'category': 'ST',
                'eligibility': 'ST students for pursuing Masters/PhD abroad. Must have admission in top-ranked universities. Family income limit applies',
                'amount': 'Up to ₹20 lakh for Masters, ₹30 lakh for PhD',
                'deadline': 'Usually March-April',
                'ministry': 'Tribal Welfare Department',
                'application_mode': 'Online through TS EPASS portal',
                'url': 'https://telanganaepass.cgg.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            }
        ]
        self.scholarships.extend(scholarships)
    
    def add_nsp_comprehensive_data(self):
        """Add comprehensive NSP scholarship data based on official schemes"""
        scholarships = [
            {
                'source': 'NSP',
                'name': 'Central Sector Scheme of Scholarship for College and University Students',
                'category': 'Merit-based',
                'eligibility': 'Students who secured admission in College/University through merit in 12th standard with 80% marks. Family income should not exceed ₹4.5 lakh per annum',
                'amount': '₹10,000 per annum (for 1st three years), ₹20,000 per annum (for 4th and 5th year)',
                'deadline': 'October-November',
                'ministry': 'Ministry of Education',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'Pre-Matric Scholarship for SC Students',
                'category': 'SC',
                'eligibility': 'SC students studying in Classes 9th and 10th. Family income should not exceed ₹2.5 lakh per annum',
                'amount': 'Day scholars: ₹3,000 per annum, Hostellers: ₹5,000 per annum',
                'deadline': 'October-November',
                'ministry': 'Ministry of Social Justice and Empowerment',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'Post-Matric Scholarship for SC Students',
                'category': 'SC',
                'eligibility': 'SC students pursuing post-matriculation or post-secondary education. Family income should not exceed ₹2.5 lakh per annum',
                'amount': '₹2,000-₹12,000 per month (varies by course and residential status)',
                'deadline': 'October-November',
                'ministry': 'Ministry of Social Justice and Empowerment',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'Post-Matric Scholarship for OBC Students',
                'category': 'OBC',
                'eligibility': 'OBC students studying beyond 10th standard. Family income should not exceed ₹1 lakh per annum',
                'amount': '₹2,000-₹10,000 per month (varies by course level)',
                'deadline': 'October-December',
                'ministry': 'Ministry of Social Justice and Empowerment',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'Pre-Matric Scholarship for Minorities',
                'category': 'Minority',
                'eligibility': 'Students belonging to notified minority communities studying in Class 9th and 10th. Family income should not exceed ₹1 lakh per annum',
                'amount': '₹5,000 per annum',
                'deadline': 'September-October',
                'ministry': 'Ministry of Minority Affairs',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'Post-Matric Scholarship for Minorities',
                'category': 'Minority',
                'eligibility': 'Students belonging to notified minority communities studying in Class 11th and above. Family income should not exceed ₹2 lakh per annum',
                'amount': '₹5,000-₹10,000 per annum (varies by course level)',
                'deadline': 'September-October',
                'ministry': 'Ministry of Minority Affairs',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'National Means cum Merit Scholarship (NMMS)',
                'category': 'Merit-based',
                'eligibility': 'Students passed 7th class and studying in 8th class with minimum 55% marks. Family income should not exceed ₹3.5 lakh per annum',
                'amount': '₹12,000 per annum',
                'deadline': 'July-September (State-wise)',
                'ministry': 'Ministry of Education',
                'application_mode': 'Through State Governments',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'Prime Minister Scholarship Scheme for RPF/RPSF',
                'category': 'Armed Forces',
                'eligibility': 'Wards/Widows of RPF/RPSF personnel. Minimum 60% marks in 12th for boys, 50% for girls',
                'amount': 'Boys: ₹2,500 per month, Girls: ₹3,000 per month',
                'deadline': 'August-September',
                'ministry': 'Ministry of Railways',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'Prime Minister Scholarship Scheme (PMSS) for Central Armed Police Forces',
                'category': 'Armed Forces',
                'eligibility': 'Wards/widows of CAPF and Assam Rifles personnel. Minimum 60% in 12th standard',
                'amount': 'Boys: ₹2,500 per month, Girls: ₹3,000 per month',
                'deadline': 'August-September',
                'ministry': 'Ministry of Home Affairs',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'National Overseas Scholarship for SC Students',
                'category': 'SC',
                'eligibility': 'SC students for Masters/PhD abroad. Must secure admission in top universities. Family income should not exceed ₹8 lakh per annum',
                'amount': 'Up to ₹20 lakh for Masters, ₹23 lakh for PhD (covers tuition, living expenses, visa, travel)',
                'deadline': 'February-March',
                'ministry': 'Ministry of Social Justice and Empowerment',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'Top Class Education Scheme for SC Students',
                'category': 'SC',
                'eligibility': 'SC students admitted to full-time courses in notified institutions. Family income should not exceed ₹8 lakh per annum',
                'amount': 'Full tuition fee + living expenses (₹2,000-₹3,000 per month)',
                'deadline': 'September-October',
                'ministry': 'Ministry of Social Justice and Empowerment',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'Post Matric Scholarship for ST Students',
                'category': 'ST',
                'eligibility': 'ST students pursuing post-matriculation education. Family income should not exceed ₹2.5 lakh per annum',
                'amount': '₹2,000-₹12,000 per month (varies by course)',
                'deadline': 'October-November',
                'ministry': 'Ministry of Tribal Affairs',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'National Fellowship and Scholarship for Higher Education of ST Students',
                'category': 'ST',
                'eligibility': 'ST students admitted to M.Phil/PhD programs. Must have cleared NET/GATE',
                'amount': '₹31,000 per month for JRF, ₹35,000 per month for SRF + contingency grant',
                'deadline': 'Throughout the year',
                'ministry': 'Ministry of Tribal Affairs',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'MCM Scholarship for Students with Disabilities',
                'category': 'Persons with Disabilities',
                'eligibility': 'Students with disability of 40% or more, pursuing post-matric courses. Family income should not exceed ₹2.5 lakh per annum',
                'amount': '₹500-₹2,000 per month (varies by course level)',
                'deadline': 'October-November',
                'ministry': 'Department of Empowerment of Persons with Disabilities',
                'application_mode': 'Online through NSP portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            },
            {
                'source': 'NSP',
                'name': 'AICTE Pragati Scholarship for Girls',
                'category': 'Women',
                'eligibility': 'Girl students pursuing technical degree (Engineering/Technology). Family income should not exceed ₹8 lakh per annum. Only 1 girl child per family',
                'amount': '₹50,000 per annum (tuition fee waiver)',
                'deadline': 'September-October',
                'ministry': 'AICTE',
                'application_mode': 'Online through AICTE portal',
                'url': 'https://scholarships.gov.in/',
                'scraped_date': datetime.now().strftime('%Y-%m-%d')
            }
        ]
        self.scholarships.extend(scholarships)
    
    def create_dataframe(self):
        """Convert the collected scholarship records to a pandas DataFrame.

        Builds a frame from ``self.scholarships`` (a list of dicts) and keeps
        the known columns in a fixed reading order. Known columns that are
        absent from the records are skipped; keys that are not part of the
        order are not included in the result.

        When no records have been collected, an empty frame that still carries
        every expected column is returned, so callers that index columns such
        as ``df['source']`` do not fail with ``KeyError``.

        Returns:
            pandas.DataFrame: one row per scholarship record.
        """
        # Column order chosen for readability of the exported files
        column_order = ['source', 'name', 'category', 'eligibility', 'amount',
                        'deadline', 'ministry', 'application_mode', 'url', 'scraped_date']

        # Nothing scraped yet: keep the schema so downstream column access works
        if not self.scholarships:
            return pd.DataFrame(columns=column_order)

        df = pd.DataFrame(self.scholarships)
        # Only reorder columns that exist
        existing_columns = [col for col in column_order if col in df.columns]
        return df[existing_columns]
    
    def save_to_csv(self, filename='scholarships_data.csv'):
        """Write every collected scholarship record to a CSV file.

        Args:
            filename: Destination path. The file is UTF-8 encoded and the
                DataFrame index is not written.

        Returns:
            pandas.DataFrame: the frame that was written.
        """
        scholarships_frame = self.create_dataframe()
        csv_options = {'index': False, 'encoding': 'utf-8'}
        scholarships_frame.to_csv(filename, **csv_options)
        print("\n✓ Data saved to {}".format(filename))
        return scholarships_frame
    
    def save_to_excel(self, filename='scholarships_data.xlsx'):
        """Save the scholarship data to an Excel workbook with several sheets.

        Sheets written: 'All Scholarships' (every row), one sheet per distinct
        ``source`` value, and 'By Category' (every row sorted by ``category``).

        Source values are converted to valid worksheet titles first: characters
        that Excel forbids in titles are replaced with ``_``, titles are cut to
        31 characters, and a numeric suffix is appended when a title would
        collide with another sheet. Rows with a missing source appear only on
        the combined sheets.

        This is best-effort: any failure is reported on stdout and swallowed,
        so the method returns None in every case.

        Args:
            filename: Destination path of the .xlsx workbook.
        """
        df = self.create_dataframe()

        # Characters that are not allowed in an Excel worksheet title
        forbidden = str.maketrans({ch: '_' for ch in '[]:*?/\\'})
        max_title_len = 31
        # Lower-cased titles already taken, starting with the two fixed sheets
        used_titles = {'all scholarships', 'by category'}

        try:
            with pd.ExcelWriter(filename, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='All Scholarships', index=False)

                # Create separate sheets by source
                for source in df['source'].dropna().unique():
                    base = str(source).translate(forbidden).strip()[:max_title_len] or 'Unknown'
                    title, counter = base, 2
                    while title.lower() in used_titles:
                        suffix = f"_{counter}"
                        title = base[:max_title_len - len(suffix)] + suffix
                        counter += 1
                    used_titles.add(title.lower())

                    source_df = df[df['source'] == source]
                    source_df.to_excel(writer, sheet_name=title, index=False)

                # Create sheet by category
                df_by_category = df.sort_values('category')
                df_by_category.to_excel(writer, sheet_name='By Category', index=False)

            print(f"✓ Data saved to {filename} with multiple sheets")
        except ImportError as e:
            # A missing openpyxl engine surfaces as ImportError; only then is the tip relevant
            print(f"Could not save Excel file: {e}")
            print("Tip: Install openpyxl with: pip install openpyxl")
        except Exception as e:
            print(f"Could not save Excel file: {e}")
    
    def generate_summary(self):
        """Print summary statistics for the collected scholarships.

        Prints the total number of records followed by value counts per
        source, category and ministry. A breakdown is skipped when its column
        is absent from the data, instead of raising ``KeyError``.

        Returns:
            pandas.DataFrame: the frame obtained from ``create_dataframe``.
        """
        df = self.create_dataframe()

        print(f"\n{'='*100}")
        print("SCHOLARSHIP DATA SUMMARY")
        print(f"{'='*100}\n")

        print(f"📊 Total Scholarships Found: {len(df)}\n")

        # (column, heading) pairs, printed in this order when the column exists
        breakdowns = [
            ('source', "📍 By Source:"),
            ('category', "🏷️  By Category:"),
            ('ministry', "🏛️  By Ministry:"),
        ]
        for column, heading in breakdowns:
            if column not in df.columns:
                continue
            print(heading)
            print(df[column].value_counts().to_string())
            print()

        print(f"{'='*100}\n")

        return df
    
    def filter_scholarships(self, category=None, source=None, keyword=None, regex=False):
        """Filter scholarships by category, source and/or keyword.

        Matching is a case-insensitive substring search, and rows with a
        missing value in the searched column never match. Criteria that are
        None or empty are ignored; the remaining ones are combined with AND.

        Args:
            category: Text to look for in the ``category`` column.
            source: Text to look for in the ``source`` column.
            keyword: Text to look for in ``name`` or ``eligibility``.
            regex: When False (default) the search terms are matched
                literally, so characters such as ``(``, ``.`` or ``+`` are
                safe to pass. Set to True to treat the terms as regular
                expressions.

        Returns:
            pandas.DataFrame: the rows that satisfy every given criterion.
        """
        df = self.create_dataframe()

        # Shared options for every str.contains call below
        match_opts = {'case': False, 'na': False, 'regex': regex}

        if category:
            df = df[df['category'].str.contains(category, **match_opts)]

        if source:
            df = df[df['source'].str.contains(source, **match_opts)]

        if keyword:
            mask = (df['name'].str.contains(keyword, **match_opts) |
                    df['eligibility'].str.contains(keyword, **match_opts))
            df = df[mask]

        return df
    
    def run(self, save_excel=True):
        """Main execution method"""
        print("🚀 Starting scholarship data extraction...\n")
        
        # Scrape both websites
        self.scrape_ts_epass()
        time.sleep(2)
        self.scrape_nsp()
        
        # Generate summary
        df = self.generate_summary()
        
        # Save to files
        self.save_to_csv()
        if save_excel:
            self.save_to_excel()
        
        print("\n✅ Scraping completed successfully!")
        print(f"✅ Total {len(df)} scholarships extracted")
        print("\n📁 Files created:")
        print("   - scholarships_data.csv")
        if save_excel:
            print("   - scholarships_data.xlsx (with multiple sheets)")
        
        return df

# Main execution: scrape everything, then show a preview and a few example filters
if __name__ == "__main__":
    scraper = ScholarshipScraper()
    df = scraper.run(save_excel=True)

    # Preview of the combined data
    print(f"\n{'='*100}")
    print("SAMPLE DATA (First 5 rows):")
    print(f"{'='*100}")
    print(df.head().to_string())

    # One example per supported filter criterion
    print(f"\n{'='*100}")
    print("EXAMPLE FILTERS:")
    print(f"{'='*100}")

    # Filter on the category column
    print("\n1. SC Category Scholarships:")
    sc_scholarships = scraper.filter_scholarships(category='SC')
    print("   Found: {} scholarships".format(len(sc_scholarships)))

    # Filter on the source column
    print("\n2. TS EPASS Scholarships:")
    ts_scholarships = scraper.filter_scholarships(source='TS EPASS')
    print("   Found: {} scholarships".format(len(ts_scholarships)))

    # Filter on a keyword in the name or eligibility text
    print("\n3. Scholarships with 'Post Matric' keyword:")
    post_matric = scraper.filter_scholarships(keyword='Post Matric')
    print("   Found: {} scholarships".format(len(post_matric)))

🚀 Starting scholarship data extraction...

Scraping TS EPASS scholarships...
  Trying: https://telanganaepass.cgg.gov.in/
  Found 9 scholarship-related links
  Trying: https://telanganaepass.cgg.gov.in/scholarshipinfo.aspx
  Trying: https://telanganaepass.cgg.gov.in/NewNotifications.aspx
  Loading TS EPASS official scholarship data...

Scraping NSP scholarships...
  Trying: https://scholarships.gov.in/
  Error with https://scholarships.gov.in/: HTTPSConnectionPool(host='scholarships.gov.in', port=443): Max retries exceeded with url: / (Caused 
  Trying: https://scholarships.gov.in/public/schemeGuidelines
  Error with https://scholarships.gov.in/public/schemeGuidelines: HTTPSConnectionPool(host='scholarships.gov.in', port=443): Max retries exceeded with url: /public/sc
  Trying: https://www.nsp.gov.in/
  Error with https://www.nsp.gov.in/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
  Loading NSP official scholarship data...

SCHOLARSHIP 

### **Load**

In [0]:
# 1. Path of the scholarships CSV. Replace it if your copy lives elsewhere, e.g.
#    FILE_PATH = "/Workspace/Users/<your-user>/Big Data Project/scholarships_data.csv"
FILE_PATH = "/Volumes/workspace/default/scholarships_data/scholarships_data.csv"

# 2. Read the CSV file into a Spark DataFrame
df = spark.read.csv(
    FILE_PATH,
    header=True,        # first line holds the column names
    inferSchema=True,   # let Spark derive column types from the data
    multiLine=True,     # allow a quoted field to contain line breaks
    quote='"',
    # pandas writes an embedded quote as "" (RFC 4180); Spark's default escape
    # character is a backslash, which would misparse such fields.
    escape='"',
)


In [0]:
# Show the DataFrame that was just read from FILE_PATH
display(df)

source,name,category,eligibility,amount,deadline,ministry,application_mode,url,scraped_date
TS EPASS,Post Matric Scholarship - SC (Class 11 to PG),SC,SC students studying in Class 11th and above in Telangana. Family income should not exceed ₹2.5 lakh per annum,"₹9,900-₹20,000 per annum (varies by course level)",Usually October-December,Social Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/,2025-11-11
TS EPASS,Post Matric Scholarship - ST (Class 11 to PG),ST,ST students studying in Class 11th and above in Telangana. Family income should not exceed ₹2.5 lakh per annum,"₹9,900-₹20,000 per annum (varies by course level)",Usually October-December,Tribal Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/,2025-11-11
TS EPASS,Post Matric Scholarship - BC (Class 11 to PG),BC,BC students studying in Class 11th and above in Telangana. Family income should not exceed ₹1 lakh per annum,"₹7,500-₹15,000 per annum (varies by course level)",Usually October-December,BC Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/,2025-11-11
TS EPASS,Post Matric Scholarship - EBC (Class 11 to PG),EBC,EBC students studying in Class 11th and above in Telangana. Family income should not exceed ₹1 lakh per annum,"₹7,500-₹15,000 per annum (varies by course level)",Usually October-December,BC Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/,2025-11-11
TS EPASS,Post Matric Scholarship - Minorities (Class 11 to PG),Minority,"Students belonging to Minority communities (Muslim, Christian, Sikh, Buddhist, Jain, Parsi) studying in Class 11th and above. Family income should not exceed ₹2 lakh per annum","₹10,000-₹20,000 per annum (varies by course level)",Usually October-December,Minority Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/,2025-11-11
TS EPASS,Pre Matric Scholarship - SC (Class 9 & 10),SC,SC students studying in Class 9th and 10th in Telangana. Family income should not exceed ₹2.5 lakh per annum,"₹3,000-₹5,000 per annum",Usually October-December,Social Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/,2025-11-11
TS EPASS,Pre Matric Scholarship - ST (Class 9 & 10),ST,ST students studying in Class 9th and 10th in Telangana. Family income should not exceed ₹2.5 lakh per annum,"₹3,000-₹5,000 per annum",Usually October-December,Tribal Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/,2025-11-11
TS EPASS,Fee Reimbursement Scheme (Professional Courses),All Categories,"Students admitted to professional courses (Engineering, Medicine, etc.) through EAMCET/NEET. Family income limits vary by category",Full tuition fee reimbursement,Usually October-January,Higher Education Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/,2025-11-11
TS EPASS,Overseas Scholarship for SC Students,SC,SC students for pursuing Masters/PhD abroad. Must have admission in QS ranked top 500 universities. Family income limit applies,"Up to ₹20 lakh for Masters, ₹30 lakh for PhD",Usually March-April,Social Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/,2025-11-11
TS EPASS,Overseas Scholarship for ST Students,ST,ST students for pursuing Masters/PhD abroad. Must have admission in top-ranked universities. Family income limit applies,"Up to ₹20 lakh for Masters, ₹30 lakh for PhD",Usually March-April,Tribal Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/,2025-11-11


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_extract, regexp_replace, lit, when
# Aliased so that Python's built-in round() is not shadowed in the notebook
from pyspark.sql.functions import round as spark_round

# NOTE: Update this path if your file is located elsewhere
# (e.g., mounted ADLS, S3, or a specific DBFS folder).
FILE_PATH = "/Volumes/workspace/default/scholarships_data/scholarships_data.csv"

# 1. Read the CSV file into a Spark DataFrame
df = spark.read.csv(
    FILE_PATH,
    header=True,
    inferSchema=True,
    # Allows a quoted field to span several lines.
    multiLine=True,
    # Fields such as 'amount' are quoted because they contain commas. pandas
    # writes an embedded quote as "" while Spark's default escape character is
    # a backslash, so the escape character is set to the quote itself.
    quote='"',
    escape='"',
)

# 2. Extract and standardize the Scholarship Amount
# Only amounts written as a range "₹min-₹max" are parsed; when a bound is not
# found (single amounts, amounts in lakh, free text) its column is NULL.
df_transformed = df.withColumn(
    "min_amount_str",
    regexp_extract(col("amount"), "₹([\\d,]+)-", 1)
).withColumn(
    "max_amount_str",
    regexp_extract(col("amount"), "-₹([\\d,]+)", 1)
).withColumn(
    "Min Scholarship Amount (₹)",
    when(col("min_amount_str") != "",
         regexp_replace(col("min_amount_str"), ",", "").cast("integer")
    ).otherwise(lit(None))
).withColumn(
    "Max Scholarship Amount (₹)",
    when(col("max_amount_str") != "",
         regexp_replace(col("max_amount_str"), ",", "").cast("integer")
    ).otherwise(lit(None))
)

# 3. Extract and standardize the Maximum Family Income Limit
# The regex finds the number (e.g., 2.5 or 1) before 'lakh'
df_transformed = df_transformed.withColumn(
    "income_lakhs_str",
    regexp_extract(col("eligibility"), "₹([\\d.]+)\\s*lakh", 1)
).withColumn(
    "income_lakhs",
    when(col("income_lakhs_str") != "",
         col("income_lakhs_str").cast("double")
    ).otherwise(lit(None))
).withColumn(
    "Max Family Income Limit (₹)",
    # Round before the integer cast: the cast truncates, and a binary floating
    # point product can land just below the intended whole number of rupees.
    when(col("income_lakhs").isNotNull(),
         spark_round(col("income_lakhs") * lit(100000)).cast("integer")
    ).otherwise(lit(None))
)

# 4. Select and Rename the final set of columns
df_final = df_transformed.select(
    col("source"),
    col("name"),
    col("category"),
    col("Min Scholarship Amount (₹)"),
    col("Max Scholarship Amount (₹)"),
    col("Max Family Income Limit (₹)"),
    col("deadline").alias("Application Deadline (approx)"),
    col("ministry").alias("Implementing Ministry"),
    col("application_mode"),
    col("url")
)

# 5. Display the transformed data
display(df_final)

# Optional: Save the transformed DataFrame to a new location in Databricks
# df_final.write.mode("overwrite").csv("/FileStore/transformed_data/scholarship_details.csv")


source,name,category,Min Scholarship Amount (₹),Max Scholarship Amount (₹),Max Family Income Limit (₹),Application Deadline (approx),Implementing Ministry,application_mode,url
TS EPASS,Post Matric Scholarship - SC (Class 11 to PG),SC,9900.0,20000.0,250000.0,Usually October-December,Social Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/
TS EPASS,Post Matric Scholarship - ST (Class 11 to PG),ST,9900.0,20000.0,250000.0,Usually October-December,Tribal Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/
TS EPASS,Post Matric Scholarship - BC (Class 11 to PG),BC,7500.0,15000.0,100000.0,Usually October-December,BC Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/
TS EPASS,Post Matric Scholarship - EBC (Class 11 to PG),EBC,7500.0,15000.0,100000.0,Usually October-December,BC Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/
TS EPASS,Post Matric Scholarship - Minorities (Class 11 to PG),Minority,10000.0,20000.0,200000.0,Usually October-December,Minority Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/
TS EPASS,Pre Matric Scholarship - SC (Class 9 & 10),SC,3000.0,5000.0,250000.0,Usually October-December,Social Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/
TS EPASS,Pre Matric Scholarship - ST (Class 9 & 10),ST,3000.0,5000.0,250000.0,Usually October-December,Tribal Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/
TS EPASS,Fee Reimbursement Scheme (Professional Courses),All Categories,,,,Usually October-January,Higher Education Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/
TS EPASS,Overseas Scholarship for SC Students,SC,,,,Usually March-April,Social Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/
TS EPASS,Overseas Scholarship for ST Students,ST,,,,Usually March-April,Tribal Welfare Department,Online through TS EPASS portal,https://telanganaepass.cgg.gov.in/


In [0]:
# STEP 4: Filter function — based on user input
from pyspark.sql.functions import col

# Example: Filter by community or income
user_community = "SC"          # replace dynamically from user input or widget
max_income = 200000            # filter based on eligibility income
user_state = "Telangana"       # optional state filter

# The loaded data has no Community / Income_Limit / State columns. The matching
# columns of df_final are `category` and `Max Family Income Limit (₹)`.
criteria = (
    (col("category") == user_community) &
    (col("Max Family Income Limit (₹)") <= max_income)
)

# The state filter is optional: it is applied only if the data carries a state column.
state_column = next((name for name in df_final.columns if name.lower() == "state"), None)
if state_column is not None:
    criteria = criteria & (col(state_column) == user_state)

filtered_df = df_final.filter(criteria)

In [0]:
from pyspark.sql.functions import col, lower, trim

# --- INTERACTIVE FILTER STARTS HERE ---

# Remove any old widgets
# NOTE(review): Databricks advises against creating widgets in the same cell that
# removes them; if the widgets do not show up, move removeAll() to its own cell.
dbutils.widgets.removeAll()

# Create widgets: a dropdown with every df_final column and a free-text search box
columns = df_final.columns
dbutils.widgets.dropdown("column_name", columns[0], columns, "Select Column")
dbutils.widgets.text("search_value", "", "Enter Value to Search")

# Read user inputs
column_name = dbutils.widgets.get("column_name")
search_value = dbutils.widgets.get("search_value").strip()

print(f"🔍 Searching for '{search_value}' in column: {column_name}")

# Use backticks for columns with spaces/symbols
column_ref = f"`{column_name}`"

# Apply filter safely: case-insensitive substring match on the column's string form.
# contains() treats the input literally, so '%' and '_' typed by the user are
# not interpreted as LIKE wildcards.
if search_value:
    filtered_df = df_final.filter(
        lower(trim(col(column_ref).cast("string"))).contains(search_value.lower())
    )
else:
    filtered_df = df_final  # show all if no search value entered

# Display filtered results
display(filtered_df)


🔍 Searching for 'source' in column: category


source,name,category,Min Scholarship Amount (₹),Max Scholarship Amount (₹),Max Family Income Limit (₹),Application Deadline (approx),Implementing Ministry,application_mode,url
