In [None]:
!pip install git-lfs
!pip install pandas pyreadr
import pandas as pd
import pyreadr
import re
import requests
import os
import shutil
import zipfile

In [None]:
# Identification des fichiers csv
def extract_strings_from_webpage(url, timeout=30):
    """Return every double-quoted string found in the text of a web page.

    The page is fetched with an HTTP GET and its text is scanned with a
    regular expression that captures whatever sits between two ``"``
    characters.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which can hang the
            notebook on a stalled connection.

    Returns:
        A list with the captured strings, in the order they appear in the
        page. An empty list is returned (after printing a message) when the
        request fails or the server answers with a status other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Transport failures get the same soft handling as HTTP errors below.
        print(f"Failed to fetch the webpage. Error: {error}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
        return []

    return re.findall(r'"([^"]*)"', response.text)

# Page that is scanned for links to the downloadable archives.
webpage_url = "https://unehistoireduconflitpolitique.fr/telecharger.html"
extracted_strings = extract_strings_from_webpage(webpage_url)

# Keep only the strings that name a zipped CSV archive.
# NOTE(review): "csp.zip" presumably covers a misspelled link on the page - confirm.
download_links = [
    candidate
    for candidate in extracted_strings
    if candidate.endswith(("csv.zip", "csp.zip"))
]

In [None]:
# Download the archives
os.makedirs('data_zip', exist_ok=True)

for link in download_links:
    try:
        # Each archive is stored under the last path component of its link.
        file_name = os.path.join('data_zip', os.path.basename(link))
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(link, timeout=60)
        # Reject HTTP errors (404, 500, ...) before touching the disk; otherwise
        # the error page would be saved under a .zip name and fail to open as
        # an archive during extraction.
        response.raise_for_status()
        with open(file_name, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {file_name}")
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining links.
        print(f"Error downloading {link}: {e}")

print("Download completed")

In [None]:
# Extract the election results
# One output folder per archive-name prefix.
election_prefixes = ['pres', 'leg', 'ref']
for prefix in election_prefixes:
    os.makedirs(os.path.join('data_csv/elections', prefix), exist_ok=True)

files = os.listdir('data_zip')
for prefix in election_prefixes:
    for file in files:
        if file.startswith(prefix) and file.endswith('.zip'):
            zip_file_path = os.path.join('data_zip', file)
            prefix_dir = os.path.join('data_csv/elections', prefix)
            try:
                with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                    for member in zip_ref.infolist():
                        if member.filename.lower().endswith('.csv'):
                            # Only the base name is kept, so the archive's internal
                            # folder layout is flattened into prefix_dir.
                            target_path = os.path.join(prefix_dir, os.path.basename(member.filename))
                            with zip_ref.open(member) as source, open(target_path, 'wb') as dest:
                                shutil.copyfileobj(source, dest)
            except (zipfile.BadZipFile, OSError) as e:
                # A damaged archive is reported and skipped (and left in data_zip)
                # instead of aborting the extraction of all remaining archives.
                print(f"Error extracting {file}: {e}")
                continue
            print(f"Extracted CSV files from {file}")
            # The archive is deleted once its CSV files have been copied out.
            os.remove(zip_file_path)
print("Extraction completed.")

In [None]:
# Clean up the election results
# Delete every file whose name starts with "._" anywhere under data_csv.
# NOTE(review): these are presumably macOS resource-fork stubs shipped in the archives - confirm.
for current_dir, _subdirs, entries in os.walk('data_csv'):
    for entry in entries:
        if not entry.startswith("._"):
            continue
        file_path = os.path.join(current_dir, entry)
        try:
            os.remove(file_path)
            print(f"Deleted file: {file_path}")
        except Exception as e:
            print(f"Error deleting file {file_path}: {e}")

print("Deletion of extra files completed")

# Swap the short archive prefixes for full folder names.
elections_dir = "data_csv/elections"
for short_name, long_name in (
    ("pres", "presidentielles"),
    ("leg", "legislatives"),
    ("ref", "referendums"),
):
    os.rename(f"{elections_dir}/{short_name}", f"{elections_dir}/{long_name}")

print("Folder renaming completed")

In [None]:
# Extract the controls
controls_dir = 'data_csv/controles'
os.makedirs(controls_dir, exist_ok=True)

zip_files = [name for name in os.listdir('data_zip') if name.endswith('.zip')]
for zip_file in zip_files:
    # Archives whose name starts with an election prefix are not unpacked here.
    if not zip_file.startswith(('pres', 'leg', 'ref')):
        zip_path = os.path.join('data_zip', zip_file)
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Leave out every member stored under a "__MACOSX" path.
                file_list = [
                    member_name
                    for member_name in zip_ref.namelist()
                    if not member_name.startswith('__MACOSX')
                ]
                zip_ref.extractall(controls_dir, members=file_list)
                print(f"Files extracted from {zip_file}:")
                for extracted_file in file_list:
                    print(extracted_file)
        except Exception as e:
            # Report the failing archive and move on to the next one.
            print(f"Error extracting {zip_file}: {e}")

print("Extraction complete.")

In [None]:
# Clean up the controls
# Strip the trailing "_csv" from the name of every sub-folder that carries it.
controls_root = 'data_csv/controles'
folders = [
    entry
    for entry in os.listdir(controls_root)
    if os.path.isdir(os.path.join(controls_root, entry))
]
for folder in folders:
    if not folder.endswith('_csv'):
        continue
    new_folder_name = folder[:-len('_csv')]
    os.rename(
        os.path.join(controls_root, folder),
        os.path.join(controls_root, new_folder_name),
    )
    print(f"Renamed: {folder} -> {new_folder_name}")

print("Folder renaming complete.")

In [None]:
# Delete the data_zip folder together with everything still inside it.
shutil.rmtree(path='data_zip')
print('Downloaded data removed.')

In [None]:
# Conversion au format R
def csv_to_rda(input_csv_path, output_rda_path):
    """Convert one CSV file to a gzip-compressed R data file, then delete the CSV.

    Args:
        input_csv_path: Path of the CSV file to read; it is decoded as latin1.
        output_rda_path: Path of the R data file to write.

    The source CSV is removed only after the output has been written, and a
    confirmation line is printed at the end.
    """
    frame = pd.read_csv(input_csv_path, encoding='latin1', low_memory=False)
    pyreadr.write_rdata(output_rda_path, frame, compress='gzip')

    # The CSV is no longer needed once its R counterpart exists.
    os.remove(input_csv_path)
    print(f"Converted file: {input_csv_path}")

def convert_csv_files(input_folder, output_folder):
    """Convert every CSV file under ``input_folder`` to an ``.rda`` file.

    The folder tree is walked recursively and the sub-folder layout is
    reproduced under ``output_folder``: ``<input>/a/b.csv`` becomes
    ``<output>/a/b.rda``. Each file is handed to ``csv_to_rda``.

    Args:
        input_folder: Root folder that is searched for CSV files.
        output_folder: Root folder that receives the converted files; missing
            sub-folders are created on demand.
    """
    for root, dirs, files in os.walk(input_folder):
        for file in files:
            # Match the extension case-insensitively so that files named
            # "*.CSV" are converted as well instead of being silently skipped.
            if not file.lower().endswith(".csv"):
                continue
            input_csv_path = os.path.join(root, file)
            relative_path = os.path.relpath(input_csv_path, input_folder)
            output_rda_path = os.path.join(output_folder, os.path.splitext(relative_path)[0] + ".rda")
            output_dir = os.path.dirname(output_rda_path)
            # dirname is empty when the output lands in the current directory,
            # and os.makedirs('') would raise.
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            csv_to_rda(input_csv_path, output_rda_path)

if __name__ == "__main__":
    # Convert every CSV found under data_csv into an .rda file under data_rda.
    input_folder, output_folder = "data_csv", "data_rda"
    convert_csv_files(input_folder, output_folder)

In [None]:
# Delete the data_csv folder together with everything still inside it.
shutil.rmtree(path='data_csv')
print('Extracted data removed.')