## Initial import of libraries

In [1]:
# Import OS lib
import os

# Prepare for OCR to get information from the initial images
import easyocr

# Prepare for crawling
from googlesearch import search

# Prepare language data
import csv

# Webscraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Lookup tables for each survey category. Only `languages` is filled in this
# cell; the others start out empty.
languages = {}
async_tools = {}
ai_tools = {}
cloud_tools = {}
development_environment = {}
frameworks = {}
web_frameworks = {}
sync_tools = {}
tools = {}

# newline="" lets the csv module handle line endings (and newlines embedded in
# quoted fields) itself; the explicit encoding keeps decoding independent of
# the platform default.
# NOTE(review): assumes languages.csv is UTF-8 encoded - confirm.
with open("languages.csv", "r", newline="", encoding="utf-8") as file:
    language_dict = csv.DictReader(file)

    for row in language_dict:
        # Keep only the rows whose "type" column is "pl", indexed by their
        # "pldb_id" column.
        if row["type"] == "pl":
            key_id = row["pldb_id"]
            languages[key_id] = row

In [4]:
# Names of the regular files (sub-directories are skipped) found in ./images/
files = [
    entry
    for entry in os.listdir('./images/')
    if os.path.isfile(os.path.join('./images/', entry))
]

print(files)

['popular-languages.png', 'popular-web-framework.png', 'popular-sync-tools.png', 'popular-development-environment.png', 'popular-async-tools.png', 'popular-ai-tools.png', 'popular-frameworks.png', 'popular-cloud-tools.png', 'popular-tools.png']


In [7]:
temp_list_of_languages = []

def process_languages(text: str):
    """Record one OCR token from the languages chart in temp_list_of_languages.

    Tokens that contain '%' or consist of more than one whitespace-separated
    word are ignored, as is the literal '2024'. Known abbreviations are
    expanded to a full name; every other token is appended unchanged.
    """
    word_count = len(text.split())
    if '%' in text or word_count > 1:
        return

    if text == '2024':
        return

    # Abbreviations as they come out of the OCR step, mapped to the name to
    # store ('C+-' is presumably how the OCR reads "C++").
    aliases = {
        'JS': 'JavaScript',
        'C+-': 'Cpp',
        'PY': 'Python',
        'TS': 'TypeScript',
        'C#': 'CSharp',
    }
    temp_list_of_languages.append(aliases.get(text, text))

In [10]:
def process_web_frameworks(text: str):
    """Record one OCR token from a non-language chart in temp_list_of_languages.

    Tokens containing '%' are dropped silently. Every other token is printed;
    it is then appended to temp_list_of_languages unless it is one of the
    fixed caption strings listed below.
    """
    if '%' in text:
        return

    print(text)

    # Caption / footer strings that appear on the charts and are not entries.
    chart_boilerplate = (
        'Source: surveystackoverflow.co/2024',
        'Developer',
        'Survey',
        'Data licensed under Open Database License (ODbL)',
        '2024',
    )
    if text not in chart_boilerplate:
        temp_list_of_languages.append(text)

In [None]:
reader = easyocr.Reader(["en"])

files = [
    entry
    for entry in os.listdir('./images/')
    if os.path.isfile(os.path.join('./images/', entry))
]

# Which filter function handles the OCR output of which chart image.
# Images that are not listed here are still read but their text is ignored.
ocr_handlers = {"popular-languages.png": process_languages}
ocr_handlers.update(dict.fromkeys(
    [
        "popular-web-framework.png",
        "popular-sync-tools.png",
        "popular-development-environment.png",
        "popular-async-tools.png",
        "popular-ai-tools.png",
        "popular-frameworks.png",
        "popular-cloud-tools.png",
        "popular-tools.png",
    ],
    process_web_frameworks,
))

for file in files:
    result = reader.readtext("./images/" + file)

    handler = ocr_handlers.get(file)
    if handler is None:
        continue

    # Each detection unpacks to (bbox, text, prob); only the text is used.
    for (bbox, text, prob) in result:
        handler(text)

In [62]:
# Keep, in order, the first ten collected names whose lower-cased form is a
# key of `languages` - names without such a key (e.g. SQL) are left out.
languages_for_analysis = [
    name
    for name in temp_list_of_languages
    if name.lower() in languages
][:10]

print(languages_for_analysis)

['JavaScript', 'Python', 'TypeScript', 'Java', 'CSharp', 'Cpp', 'PHP', 'PowerShell', 'Rust', 'Kotlin']


In [23]:
def scrape_reddit_data(url: str):
    """Load *url* in a fresh Chrome session and return its post paragraphs.

    Waits up to 20 seconds for an element matching the post-content selector,
    then parses the rendered page and collects the stripped text of every
    ``<p>`` inside the matching elements.

    Args:
        url: Address of the page to open.

    Returns:
        A list of paragraph strings, or ``None`` when the wait times out or
        the page cannot be parsed.
    """
    # NOTE(review): this attribute selector matches only an element whose id
    # is exactly "-post-rtjson-content" - confirm that is what the page uses.
    post_selector = 'div[id="-post-rtjson-content"]'

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        try:
            # Wait for the content to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, post_selector))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            paragraphs = soup.select(post_selector + ' p')

            return [p.get_text(strip=True) for p in paragraphs]
        except Exception:
            # Best effort: a page that cannot be scraped is reported as None.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            return None
    finally:
        # quit() ends the whole browser session, also when get() itself fails.
        driver.quit()

In [53]:
def scrape_hackernews_data(url: str):
    """Load *url* in a fresh Chrome session and return its comment texts.

    Waits up to 20 seconds for an element matching the comment selector, then
    parses the rendered page and collects the stripped text of every matching
    element.

    Args:
        url: Address of the page to open.

    Returns:
        A list of comment strings, or ``None`` when the wait times out or the
        page cannot be parsed.
    """
    comment_selector = 'div.comment div.commtext.c00'

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        try:
            # Wait for comments to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, comment_selector))
            )

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            paragraphs = soup.select(comment_selector)

            return [p.get_text(strip=True) for p in paragraphs]
        except Exception:
            # Best effort: a page that cannot be scraped is reported as None.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            return None
    finally:
        # quit() ends the whole browser session, also when get() itself fails.
        driver.quit()

In [55]:
# Perform the search on Reddit and Hacker News to get a wide range of
# opinions from developers.

opinion_dict = {}

# Each source pairs the domain used in Google's `site:` operator with the
# scraper that understands that site's markup.
opinion_sources = (
    ("reddit.com", scrape_reddit_data),
    ("news.ycombinator.com", scrape_hackernews_data),
)

for lang in languages_for_analysis:
    for site, scrape in opinion_sources:
        # Looking for developer sentiment on the given technology.
        # `site:<domain>` restricts the results to that domain; the previous
        # ":site reddit" form was treated as ordinary keywords.
        query = f"Opinion on {lang} site:{site}"
        search_results = search(query, num_results=5)

        for result in search_results:
            data = scrape(result)

            # Skip the data that can't be extracted
            if data is None:
                continue

            opinion_dict.setdefault(lang, []).extend(data)

977
