In [1]:
import pandas as pd
import re

# Keywords that disqualify a posting: the batch loop drops every row whose
# description or title contains any of these (case-insensitive substring).
unwanted_words = [
    # software development roles
    "developer",
    "entwickler",
    "softwareentwickler",
    "programmierer",
    "devops",
    # administration, QA and other non-support roles
    "administrator",
    "systemadministrator",
    "qa",
    "test",
    "marketing",
    "engineer",
]

# Support-related keywords; each one becomes a boolean ``Has_<KEYWORD>`` column
# that records whether the keyword occurs in the lower-cased description.
skills_flags = [
    # support / helpdesk vocabulary
    'support',
    'it-support',
    'helpdesk',
    'kundensupport',
    'user support',
    'troubleshooting',
    'kundenservice',
    'ticket',
    'problembehandlung',
    # platforms and infrastructure
    'windows',
    'linux',
    'hardware',
    'software',
    'netzwerk',
    'network',
    # service channels and customer contact
    'service desk',
    'anwendungsbetreuung',
    'telefon',
    'kunden',
    'remote',
    # support tiers and customer segments
    'first level',
    'second level',
    'b2b',
    'b2c',
    'service',
]

def seniority_detect(text):
    """Classify a text into a seniority bucket by keyword lookup.

    The value is stringified and lower-cased, then every bucket's keywords
    are tried as plain substrings (so 'lead' also hits "leading", and
    'leiter' hits "teamleiter"). Buckets are checked in the order listed
    below and the first bucket with a hit wins.

    Args:
        text: Any value; it is passed through ``str()`` before matching.

    Returns:
        'Werkstudent', 'Praktikum', 'Junior', 'Senior', 'Manager' or
        'Entry' for the first matching bucket, otherwise 'Other'.
    """
    # Order matters: earlier buckets take precedence over later ones.
    keyword_table = (
        ('Werkstudent', ('werkstudent', 'working student')),
        ('Praktikum', ('praktikum', 'internship', 'praktikant')),
        ('Junior', ('junior',)),
        ('Senior', ('senior', 'lead', 'principal')),
        ('Manager', ('manager', 'leitung', 'leiter', 'head')),
        ('Entry', ('entry', 'anfänger', 'einsteiger')),
    )
    haystack = str(text).lower()
    for label, keywords in keyword_table:
        for keyword in keywords:
            if keyword in haystack:
                return label
    return 'Other'

def detect_language(text):
    """Guess the language of a text from a few marker words.

    The value is stringified and lower-cased. Each English and German marker
    counts once if it occurs anywhere in the text as a substring. A language
    is reported only when it has at least two hits and strictly more hits
    than the other language.

    Args:
        text: Any value; it is passed through ``str()`` before matching.

    Returns:
        'en', 'de', or 'mixed' when neither language clearly dominates.
    """
    english_markers = ('responsibilities', 'requirements', 'apply', 'english')
    german_markers = ('anforderungen', 'deutsch', 'bewerben', 'verantwortung', 'aufgaben')

    lowered = str(text).lower()
    english_hits = len([marker for marker in english_markers if marker in lowered])
    german_hits = len([marker for marker in german_markers if marker in lowered])

    if english_hits >= 2 and english_hits > german_hits:
        return 'en'
    if german_hits >= 2 and german_hits > english_hits:
        return 'de'
    return 'mixed'

# Parenthesised gender/diversity tags as they appear in job titles, together
# with any whitespace directly in front of them. Besides the fixed spellings
# (gn, all genders, alle, div, m, w, d) this accepts any slash-separated
# combination of two to four of the letters m/w/f/d/x in any order, with
# optional spaces around the slashes: (m/w/d), (w/m/d), (f/m/d), (m/w/x),
# (m/w), (m / w / d), ...
_GENDER_TAG_RE = re.compile(
    r'\s*\((?:[mwfdx](?:\s*/\s*[mwfdx]){1,3}|gn|all genders|alle|div|m|w|d)\)',
    flags=re.IGNORECASE,
)
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def clean_title(title):
    """Strip gender tags from a job title and normalise its whitespace.

    Args:
        title: The raw title. Anything that is not a ``str`` (for example a
            NaN from a missing CSV cell) is treated as missing.

    Returns:
        The title with every parenthesised gender tag removed, runs of
        whitespace collapsed to a single space and leading/trailing
        whitespace stripped; ``''`` when ``title`` is not a string.
    """
    if not isinstance(title, str):
        return ''
    without_tags = _GENDER_TAG_RE.sub('', title)
    return _WHITESPACE_RUN_RE.sub(' ', without_tags).strip()

# Number of CSV rows read and processed per batch.
chunksize = 500

# Define the final column list UP FRONT: every batch is aligned to exactly
# these columns, in this order, before it is written out.
all_cols = [
    'Title',
    'Company',
    'Location',
    'Link',
    'Keyword',
    'City',
    'Description',
    'Remote',
    'Seniority',
    'Lang',
    # One boolean flag column per skill keyword, e.g. 'service desk' -> 'Has_SERVICE_DESK'.
    *('Has_' + skill.replace(' ', '_').upper() for skill in skills_flags),
]

# The unwanted-keyword alternation is identical for every batch, so it is
# built once here instead of twice per batch.
unwanted_pattern = '|'.join(unwanted_words)

# Non-null links that have already been written to the output file.
# drop_duplicates() only sees one batch at a time, so this set is what removes
# duplicates whose occurrences fall into different batches.
seen_links = set()

with pd.read_csv('raw_techsupport_germany.csv', chunksize=chunksize) as reader:
    for i, chunk in enumerate(reader):
        # Temporary lower-cased description used for all keyword lookups below.
        chunk['description_lower'] = chunk['Description'].fillna('').str.lower()

        # Drop postings whose description or title mentions an unwanted keyword.
        # .copy() detaches the surviving rows from the original batch so the
        # column assignments below act on a real frame, not on a slice.
        unwanted_in_description = chunk['description_lower'].str.contains(
            unwanted_pattern, case=False, na=False
        )
        unwanted_in_title = chunk['Title'].str.contains(unwanted_pattern, case=False, na=False)
        chunk = chunk[~(unwanted_in_description | unwanted_in_title)].copy()

        chunk['Remote'] = chunk['description_lower'].apply(
            lambda x: any(word in x for word in ['remote', 'home office', 'hybrid', 'remotearbeit', 'homeoffice'])
        )

        # One boolean column per skill keyword (plain substring test).
        for skill in skills_flags:
            col_name = f'Has_{skill.replace(" ", "_").upper()}'
            chunk[col_name] = chunk['description_lower'].apply(lambda x: skill in x)

        chunk['Seniority'] = chunk['description_lower'].apply(seniority_detect)
        chunk['Lang'] = chunk['description_lower'].apply(detect_language)
        chunk = chunk.drop(columns=['description_lower'])

        # De-duplicate by link: first within this batch, then against links
        # written by earlier batches. Rows without a link are never recorded in
        # seen_links, so they are only de-duplicated within their own batch.
        chunk = chunk.drop_duplicates(subset=['Link'])
        chunk = chunk[~chunk['Link'].isin(seen_links)].copy()

        chunk['Title'] = chunk['Title'].apply(clean_title)
        chunk['Seniority'] = chunk['Seniority'].str.lower().str.strip()
        chunk['Lang'] = chunk['Lang'].str.lower().str.strip()
        chunk['Company'] = chunk['Company'].str.strip().str.title()

        # Normalise every flag column to real booleans.
        bool_cols = [col for col in chunk.columns if col.startswith('Has_') or col in ['Remote']]
        for col in bool_cols:
            chunk[col] = chunk[col].apply(lambda x: str(x).strip().lower() in ['true', '1', 'yes'])

        chunk = chunk.dropna(subset=['Title', 'Company'], how='all')
        # !!! Do not add dropna(axis=1) here — it would change the column structure!

        # Align the columns to the template: add missing ones as empty, then
        # select exactly all_cols in template order.
        for col in all_cols:
            if col not in chunk.columns:
                chunk[col] = None
        chunk = chunk[all_cols]

        # Clean the titles (second safety net): keep titles with more than five
        # Latin/Cyrillic letters, and drop titles that start with '*', contain
        # a slash or backslash, or consist only of non-word characters.
        chunk = chunk[chunk['Title'].str.count(r'[A-Za-zА-Яа-я]') > 5]
        chunk = chunk[~chunk['Title'].str.contains(r'^\*+|/|\\|^[\W_]+$', regex=True)].copy()
        chunk['Title'] = chunk['Title'].str.title().str.strip()

        # Remember only links that are actually written, so a later posting is
        # not suppressed by an earlier one that the title filters removed.
        seen_links.update(chunk['Link'].dropna())

        # Save the batch: the first one creates the file with a header, the
        # rest are appended without one.
        if i == 0:
            chunk.to_csv('techsupport_germany_clean.csv', index=False, mode='w')
        else:
            chunk.to_csv('techsupport_germany_clean.csv', index=False, header=False, mode='a')

print("✅ techsupport_germany_clean.csv готов. Данные ровные, CSV — как новая тачка после детейлинга.")


✅ techsupport_germany_clean.csv готов. Данные ровные, CSV — как новая тачка после детейлинга.
