## Import the required libraries

In [56]:
# Import OS lib
import os

# Prepare for OCR to get information from the initial images
import easyocr

# Prepare for crawling
from googlesearch import search

# Prepare language data
import csv

# Webscraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Build `languages`: one entry per CSV row whose "type" column is "pl"
# (presumably "programming language" - confirm against the dataset docs),
# keyed by that row's "pldb_id" value and holding the full row dict.
#
# newline="" is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly; the explicit encoding keeps the result
# independent of the platform's default locale encoding.
# NOTE(review): assumes languages.csv is UTF-8 encoded - confirm.
with open("languages.csv", "r", newline="", encoding="utf-8") as file:
    languages = {
        row["pldb_id"]: row
        for row in csv.DictReader(file)
        if row["type"] == "pl"
    }

In [60]:
# Exploratory pass: run OCR over every file in the image directory and print
# each recognised text fragment.
reader = easyocr.Reader(["en"])

image_dir = './images/'

# os.listdir returns names in arbitrary order; sorting makes the printed
# output reproducible from one run (or machine) to the next.
files = sorted(
    f for f in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, f))
)

for file in files:
    result = reader.readtext(os.path.join(image_dir, file))

    # Each OCR hit is unpacked as (bounding box, text, confidence); only the
    # text is needed here.
    for (bbox, text, prob) in result:
        print(text)

Most popular technologies
All Respondents
Programming, scripting, and markup languages
JS
62.3%
HTML/CSS
52.9%
PY
51%
SQL
51%
TS
38.5%
Bash/Shell
33.9%
Java
30.3%
C#
27.1%
C+-
23%
20.3%
PHP
18.2%
PowerShell
13.8%
13.5%
Rust
12.6%
Kotlin
9.4%
6.2%
Dart
6%
Assembly
5.4%
Ruby
5.2%
Swift
4.7%
R
4.3%
Visual Basic
4.2%
MATLAB
4%
VBA
3.7%
Groovy
3.3%
Scala
2.6%
Perl
2.5%
GDScript
2.3%
Objective-C
2.1%
Elixir
2.1%
Haskell
2%
Delphi
1.8%
MicroPython
1.6%
Lisp
1.5%
Clojure
1.2%
Julia
1.1%
Zig
1.1%
Fortran
1.1%
Solidity
1.1%
Ada
0.9%
Erlang
0.9%
F#
0.9%
Apex
0.8%
Prolog
0.8%
OCaml
0.8%
Cobol
0.7%
Crystal
0.4%
Nim
0.4%
Zephyr
0.3%
2024
Source: surveystackoverflow.co/2024
Developer
Survey
Data licensed under Open Database License (ODbL)
Lua
Most popular technologies
All Respondents
Web frameworks and technologies
Node:js
40.8%
React
39.5%
jQuery
21.4%
Next js
17.9%
Express
17.8%
Angular
17.1%
ASPNET CORE
16.9%
Vuejs
15.4%
ASPNET
12.9%
Flask
12.9%
Spring Boot
12.7%
Django
12%
WordPress
11.8%
FastAPI

In [3]:
# Run OCR on the language-popularity chart.
reader = easyocr.Reader(["en"])
result = reader.readtext("./images/popular-languages.png")

# OCR label -> name to record instead. Labels not listed here are kept as-is.
ocr_label_replacements = {
    'JS': 'JavaScript',
    'C+-': 'Cpp',
    'PY': 'Python',
    'TS': 'TypeScript',
    'C#': 'CSharp',
}

temp_list_of_languages = []

# Keep only single-token, non-percentage fragments, which drops the share
# figures and multi-word captions/headings.
for (bbox, text, prob) in result:
    is_percentage = '%' in text
    is_multi_word = len(text.split()) > 1
    if is_percentage or is_multi_word:
        continue

    temp_list_of_languages.append(ocr_label_replacements.get(text, text))

In [4]:
# Keep the OCR names whose lower-cased form is a key of `languages`, then take
# the first 10 of them. Names with no matching key (SQL, for example) drop out.
recognised_languages = [
    name for name in temp_list_of_languages
    if name.lower() in languages
]
languages_for_analysis = recognised_languages[:10]

print(languages_for_analysis)

['JavaScript', 'Python', 'TypeScript', 'Java', 'CSharp', 'Cpp', 'PHP', 'PowerShell', 'Rust', 'Kotlin']


In [23]:
def scrape_reddit_data(url: str):
    """Open `url` in Chrome and collect the paragraph texts of the post body.

    Args:
        url: Address of the page to load.

    Returns:
        A list with the stripped text of every <p> found under the post-body
        selector, or None when the page could not be loaded, the element did
        not appear within 20 seconds, or parsing failed.
    """
    # NOTE(review): [id="..."] is an exact-match attribute selector; if the
    # real element ids carry a prefix before "-post-rtjson-content" this will
    # never match and an ends-with selector ([id$="..."]) is needed - confirm
    # against the live page markup.
    post_selector = 'div[id="-post-rtjson-content"]'

    driver = webdriver.Chrome()
    try:
        # Navigation is inside the try so that a failed load also releases
        # the browser and yields None instead of leaking the session.
        driver.get(url)

        # Block until the post body is present in the DOM (20 s timeout).
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, post_selector))
        )

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        paragraphs = soup.select(post_selector + ' p')

        return [p.get_text(strip=True) for p in paragraphs]
    except Exception:
        # Best-effort scraping: any failure means "no data for this URL".
        # Unlike a bare except, this lets KeyboardInterrupt through so a long
        # crawl can still be stopped by hand.
        return None
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and leave the driver process behind.
        driver.quit()

In [53]:
def scrape_hackernews_data(url: str):
    """Open `url` in Chrome and collect the text of each matched comment.

    Args:
        url: Address of the page to load.

    Returns:
        A list with the stripped text of every element matching
        'div.comment div.commtext.c00', or None when the page could not be
        loaded, no such element appeared within 20 seconds, or parsing failed.
    """
    comment_selector = 'div.comment div.commtext.c00'

    driver = webdriver.Chrome()
    try:
        # Navigation is inside the try so that a failed load also releases
        # the browser and yields None instead of leaking the session.
        driver.get(url)

        # Block until at least one comment is present in the DOM (20 s timeout).
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, comment_selector))
        )

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        comment_nodes = soup.select(comment_selector)

        return [node.get_text(strip=True) for node in comment_nodes]
    except Exception:
        # Best-effort scraping: any failure means "no data for this URL".
        # Unlike a bare except, this lets KeyboardInterrupt through so a long
        # crawl can still be stopped by hand.
        return None
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and leave the driver process behind.
        driver.quit()

In [55]:
# Perform the search on two sources:
# - Reddit
# - Hacker News

# Maps each language name to the list of text snippets scraped for it.
opinion_dict = {}

# (domain for Google's `site:` operator, scraper for pages on that domain).
# The operator must be written as `site:<domain>`; a bare ":site reddit" is
# treated as ordinary keywords and does not restrict the results.
opinion_sources = [
    ("reddit.com", scrape_reddit_data),
    ("news.ycombinator.com", scrape_hackernews_data),
]

# Get a wide range of opinions from developers
for lang in languages_for_analysis:
    for site, scrape in opinion_sources:
        # Looking for developer sentiment on the given technology
        query = "Opinion on " + lang + " site:" + site
        search_results = search(query, num_results=5)

        for result in search_results:
            data = scrape(result)

            # Skip the data that can't be extracted
            if data is None:
                continue

            # Accumulate into a list owned by opinion_dict rather than
            # aliasing the first list a scraper happened to return.
            opinion_dict.setdefault(lang, []).extend(data)

977
