In [1]:
!pip install requests beautifulsoup4 pandas



In [2]:
import requests
import re
import json
import time
import pandas as pd
from bs4 import BeautifulSoup


In [3]:
E_LEARNING_PLATFORMS = {
    "moodle": ["moodle", "/login/index.php"],
    "blackboard": ["blackboard", "Blackboard Learn"],
    "canvas": ["canvas", "by Instructure"],
    "google classroom": ["classroom.google.com"],
    "sakai": ["sakai"],
    "virtual learning": ["virtual learning", "e-learning", "lms"]
}

KENET_URL = "https://www.kenet.or.ke/content/list-kenet-members"


In [4]:
!pip install certifi



In [5]:
import certifi

def scrape_kenet():
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(KENET_URL, headers=headers, timeout=10, verify=certifi.where())
    
    if response.status_code != 200:
        print("Failed to fetch KENET page.")
        return []
    
    soup = BeautifulSoup(response.text, "html.parser")
    
    institutions = []
    for link in soup.find_all("a", href=True):
        href = link["href"]
        name = link.text.strip()
        
        if ".ke" in href:  # Ensures it's a valid Kenyan website
            institutions.append({"name": name, "website": href})
    
    return institutions

institutions = scrape_kenet()
print(f"Found {len(institutions)} institutions.")


Found 203 institutions.


In [6]:
def extract_emails(text):
    return list(set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)))

def extract_phone_numbers(text):
    return list(set(re.findall(r"\+254\d{9}|\b0\d{9}\b", text)))

def detect_elearning_platform(soup):
    page_text = soup.get_text().lower()
    for platform, keywords in E_LEARNING_PLATFORMS.items():
        if any(keyword.lower() in page_text for keyword in keywords):
            return platform.capitalize()
    return "Unknown"

def scrape_institution_details(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return None, None, None, None, None
        
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text()

        # Extract emails & phone numbers
        emails = extract_emails(text)
        phones = extract_phone_numbers(text)

        # Find e-learning URL
        elearning_url = None
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if any(keyword in href.lower() for keyword in E_LEARNING_PLATFORMS):
                elearning_url = href
                break

        # If e-learning URL found, scrape its details
        elearning_platform = "Unknown"
        elearning_emails, elearning_phones = [], []
        
        if elearning_url:
            try:
                elearning_response = requests.get(elearning_url, headers=headers, timeout=10)
                if elearning_response.status_code == 200:
                    elearning_soup = BeautifulSoup(elearning_response.text, "html.parser")
                    elearning_platform = detect_elearning_platform(elearning_soup)
                    elearning_text = elearning_soup.get_text()
                    elearning_emails = extract_emails(elearning_text)
                    elearning_phones = extract_phone_numbers(elearning_text)
            except:
                pass

        return emails, phones, elearning_url, elearning_platform, (elearning_emails, elearning_phones)

    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None, None, None, None, (None, None)


In [7]:
final_data = []

for institution in institutions:
    name = institution["name"]
    website = institution["website"]

    print(f"Scraping: {name} ({website})")
    emails, phones, elearning_url, elearning_platform, (elearning_emails, elearning_phones) = scrape_institution_details(website)

    final_data.append({
        "Institution Name": name,
        "Website URL": website,
        "E-learning URL": elearning_url if elearning_url else "Not Found",
        "E-learning Platform": elearning_platform,
        "Main Website Emails": emails if emails else ["N/A"],
        "Main Website Phones": phones if phones else ["N/A"],
        "E-learning Emails": elearning_emails if elearning_emails else ["N/A"],
        "E-learning Phones": elearning_phones if elearning_phones else ["N/A"]
    })
    
    time.sleep(2)  # Avoid rate limits


Scraping: Gallery (https://gallery.kenet.or.ke/)
Scraping: KENET GPU Cluster (https://www.kenet.or.ke/kenet-gpu-cluster)
Scraping: Security Awareness (https://www.kenet.or.ke/cyber-security)
Scraping: CERT (https://cert.kenet.or.ke/)
Scraping: E-readiness Project (http://ereadiness.kenet.or.ke/)
Scraping: Marsabit Project (http://marsabit.kenet.or.ke)
Scraping: Schools Connectivity Initiative (http://schools.kenet.or.ke)
Scraping: Remote Teaching (https://www.kenet.or.ke/remote-teaching)
Scraping: JOBS (https://recruitment.kenet.or.ke/2023_jobs)
Scraping: Alupe University (https://au.ac.ke/)
Failed to scrape https://au.ac.ke/: HTTPSConnectionPool(host='au.ac.ke', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))
Scraping: Chuka University (https://www.chuka.ac.ke/)
Failed to scrape https://www.chuka.ac.ke/: HTTPSConnectionPool(h

TypeError: cannot unpack non-iterable NoneType object

In [8]:
with open("kenyan_elearning_institution_platforms.json", "w") as f:
    json.dump(final_data, f, indent=4)

df = pd.DataFrame(final_data)
df.to_csv("kenyan_elearning_institution_platforms.csv", index=False)

print("Scraping complete! Data saved to JSON and CSV.")


Scraping complete! Data saved to JSON and CSV.
