## Initial Import of my Library

In [1]:
# Import OS lib
import os

# Prepare for OCR to get information from the initial images
import easyocr

# Prepare for crawling
from googlesearch import search

# Prepare language data
import csv

# Webscraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
languages = {}
async_tools = {}
ai_tools = {}
cloud_tools = {}
development_environment = {}
frameworks = {}
web_frameworks = {}
sync_tools = {}
tools = {}

with open("languages.csv", "r") as file:
    language_dict = csv.DictReader(file)

    for row in language_dict:
        if row["type"] == "pl":
            key_id = row["pldb_id"]
            languages[key_id] = row

In [3]:
files = [f for f in os.listdir('./images/') 
         if os.path.isfile(os.path.join('./images/', f))]

print(files)

['popular-languages.png', 'popular-web-framework.png', 'popular-sync-tools.png', 'popular-development-environment.png', 'popular-async-tools.png', 'popular-ai-tools.png', 'popular-cloud-tools.png', 'popular-tools.png']


In [4]:
def remove_unnesesarry_info(text:str):
        if '%' in text:
            return None
            
        match text:
            case 'Source: surveystackoverflow.co/2024' : return None
            case 'Source: survey stackoverflow.co/2024' : return None
            case text if 'Most' in text: return None
            case text if 'popular' in text : return None
            case text if 'Respondents' in text : return None
            case 'Web frameworks and technologies': return None
            case 'Developer': return None
            case 'Survey' : return None
            case 'Integrated development environment' : return None
            case 'Data licensed under Open Database License (ODbL)' : return None
            case '2024' : return None

        print(text)
        return text

In [5]:
temp_list_of_languages = []

def process_languages(text):
    if text is None or'%' in text or len(text.split()) > 1:
        return

    match text:
        case 'JS' : temp_list_of_languages.append('JavaScript')
        case 'C+-': temp_list_of_languages.append('Cpp')
        case 'PY' : temp_list_of_languages.append('Python')
        case 'TS' : temp_list_of_languages.append('TypeScript')
        case 'C#' : temp_list_of_languages.append('CSharp')
        case '2024' : return
        case _ : temp_list_of_languages.append(text)

In [6]:
temp_list_of_web_frameworks = []

def process_web_frameworks(text):
    if text is None or'%' in text:
        return

    match text:
        case 'Node:js' : temp_list_of_web_frameworks.append('Node.js')
        case 'Next js': temp_list_of_web_frameworks.append('Next.js')
        case 'Vuejs': temp_list_of_web_frameworks.append('Vue.js')
        case '2024' : return
        case _ : temp_list_of_web_frameworks.append(text)

In [7]:
temp_list_of_sync_tools = []

def process_sync_tools(text):
    if text is None or'%' in text:
        return

    match text:
        case 'Synchronous tools' : return
        case _ : temp_list_of_sync_tools.append(text)

In [8]:
temp_list_of_ides = []

def process_ides(text):
    if text is None or'%' in text:
        return

    match text:
        case 'Notepad+ +' : temp_list_of_ides.append('Notepad++')
        case 'Jupyter NotebooklJupyterLab' : temp_list_of_ides.append('JupyterLab')
        case _ : temp_list_of_ides.append(text)

In [9]:
temp_list_of_async_tools = []

def process_async_tools(text):
    if text is None or'%' in text:
        return

    match text:
        case ')' : return
        case 'Asynchronous tools' : return
        case _ : temp_list_of_async_tools.append(text)

In [10]:
temp_list_of_ai_tools = []

def process_ai_tools(text):
    if text is None or'%' in text:
        return

    match text:
        case 'Al' : temp_list_of_ai_tools.append('Bing AI')
        case 'Al Search and Developer Tools' : return
        case 'Perplexity Al' : temp_list_of_ai_tools.append('Perplexity AI')
        case _ : temp_list_of_ai_tools.append(text)

In [11]:
temp_list_of_cloud_tools = []

def process_cloud_tools(text):
    if text is None or'%' in text:
        return

    match text:
        case 'Cloud platforms' : return
        case _ : temp_list_of_cloud_tools.append(text)

In [19]:
temp_list_of_tools = []

def process_tools(text):
    if text is None or'%' in text:
        return

    match text:
        case 'Other tools' : return
        case _ : temp_list_of_tools.append(text)

In [17]:
reader = easyocr.Reader(["en"])

files = [f for f in os.listdir('./images/') 
         if os.path.isfile(os.path.join('./images/', f))]

for file in files:
    result = reader.readtext("./images/" + file)
    
    # Fix data given from the image - only extract the programming languages
    for (bbox, text, prob) in result:
        match file:
            # case "popular-languages.png": process_languages(remove_unnesesarry_info(text))
            # case "popular-web-framework.png": process_web_frameworks(remove_unnesesarry_info(text))
            # case "popular-sync-tools.png": process_sync_tools(remove_unnesesarry_info(text))
            # case "popular-development-environment.png": process_ides(remove_unnesesarry_info(text))
            # case "popular-async-tools.png": process_async_tools(remove_unnesesarry_info(text))
            # case "popular-ai-tools.png": process_ai_tools(remove_unnesesarry_info(text))
            # case "popular-cloud-tools.png": process_cloud_tools(remove_unnesesarry_info(text))
            case "popular-tools.png": process_tools(remove_unnesesarry_info(text))

# Clear memory
del reader

Other tools
Docker
npm
Pip
Homebrew
Make
Vite
Kubernetes
Yarn
Webpack
NuGet
Maven
Visual Studio Solution
Gradle
MSBuild
Terraform
APT
pnpm
Composer
Chocolatey
Ansible
Pacman
Unity 3D
Podman
Godot
Ninja
Bun
Google Test
Unreal Engine
Ant
Nix
Dagger
Puppet
Pulumi
Chef


In [14]:
# Get the top 10 languages - this excludes things like SQL
languages_for_analysis = [lang for lang in temp_list_of_languages if lang.lower() in languages][:10]
web_frameworks_for_analysis = [w_frame for w_frame in temp_list_of_web_frameworks][:10]
sync_tools_for_analysis = [sync_tool for sync_tool in temp_list_of_sync_tools][:10]
ides_for_analysis = [ide for ide in temp_list_of_ides][:10]
async_tools_for_analysis = [async_tool for async_tool in temp_list_of_async_tools][:10]
ai_tools_for_analysis = [ai_tool for ai_tool in temp_list_of_ai_tools][:10]
cloud_tools_for_analysis = [cloud_tool for cloud_tool in temp_list_of_cloud_tools][:10]
tools_for_analysis = [tool for tool in temp_list_of_tools][:10]

print(languages_for_analysis)
print(web_frameworks_for_analysis)
print(sync_tools_for_analysis)
print(async_tools_for_analysis)
print(ides_for_analysis)
print(ai_tools_for_analysis)
print(cloud_tools_for_analysis)
print(tools_for_analysis)

[]
[]
[]
[]
[]
[]
['Amazon Web Services', 'Microsoft Azure', 'Google Cloud', 'Cloudflare', 'Firebase', 'Vercel', 'Digital Ocean', 'Heroku', 'Netlify', 'VMware']


In [20]:
def scrape_reddit_data(url: str):
    driver = webdriver.Chrome()
    driver.get(url)

    # Wait for comments to load
    try:
        comments = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[id="-post-rtjson-content"]')))
    
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        paragraphs = soup.select('div[id="-post-rtjson-content"] p')
    
        texts = [p.get_text(strip=True) for p in paragraphs]

        driver.close()
    
        return texts
    except:
        driver.close()
        return None

In [21]:
def scrape_hackernews_data(url: str):
    driver = webdriver.Chrome()
    driver.get(url)

    # Wait for comments to load
    try:
        
        comments = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.comment div.commtext.c00')))
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        paragraphs = soup.select('div.comment div.commtext.c00')     
        
        texts = [p.get_text(strip=True) for p in paragraphs]

        driver.close()
    
        return texts
    except:
        driver.close()
        return None

In [22]:
# Perform the search
# Reddit
# Hacknews i.e. Hackernews

opinion_dict = {}

# Get a wide range of opinions from developers
for lang in languages_for_analysis:
    # # Looking for developer sentiment on the given technology
    query =  "Opinion on " + lang + " :site reddit"
    search_results = search(query, num_results=5)

    for result in search_results:
        data = scrape_reddit_data(result)

        # Skip the data that can't be extracted
        if data is None:
            continue

        if lang not in opinion_dict:
            opinion_dict[lang] = data
        else:
            opinion_dict[lang] += data

    query =  "Opinion on " + lang + " :site hacknews"
    search_results = search(query, num_results=5)

    for result in search_results:
        data = scrape_hackernews_data(result)
        # Skip the data that can't be extracted
        if data is None:
            continue

        if lang not in opinion_dict:
            opinion_dict[lang] = data
        else:
            opinion_dict[lang] += data

KeyboardInterrupt: 