In [6]:
import streamlit as st
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import re
from io import BytesIO
from PIL import Image

# Charger et redimensionner le logo
logo = Image.open("360_capital_vc_logo.jpeg")
logo = logo.resize((64, 64))


# Configuration de la page
st.set_page_config(
    page_title="Nettoyage Données Crunchbase",
    page_icon=logo,
    layout="wide"
)

def get_domain(url):
    """Extrait le domaine d'une URL et le formate"""
    if pd.isna(url):
        return None
    try:
        domain = urlparse(url).netloc
        domain = re.sub(r'^www\d*\.', '', domain).split(':')[0]
        return domain.lower()
    except:
        return None

def clean_crunchbase_data(df):
    """
    Nettoie les données de levées de fonds Crunchbase
    
    Args:
        df: DataFrame avec les colonnes Crunchbase
        
    Returns:
        DataFrame nettoyé avec les colonnes formatées
    """
    # Créer une copie pour ne pas modifier l'original
    df_clean = df.copy()
    
    # 1. Filtrer les types de financement non désirés
    funding_types_to_remove = [
        'Corporate Round',
        'Grant',
        'Post-IPO Debt',
        'Equity Crowdfunding',
        'Debt Financing',
        'Convertible Note',
        'Series C'
    ]
    
    initial_count = len(df_clean)
    df_clean = df_clean[~df_clean['Funding Type'].isin(funding_types_to_remove)]
    filtered_count = initial_count - len(df_clean)
    
    # 2. Convertir les montants USD en devise originale
    mask_usd = df_clean['Money Raised Currency'] == 'USD'
    mask_has_both = pd.notna(df_clean['Money Raised']) & pd.notna(df_clean['Money Raised (in USD)'])
    
    # Calculer le taux de change moyen pour les lignes non-USD
    rates = df_clean[~mask_usd & mask_has_both].apply(
        lambda row: row['Money Raised (in USD)'] / row['Money Raised'] 
        if row['Money Raised'] != 0 else np.nan,
        axis=1
    )
    avg_rate = rates.median() if len(rates) > 0 else 1.0
    
    # Appliquer la conversion inverse pour les montants USD
    df_clean.loc[mask_usd & pd.isna(df_clean['Money Raised']) & pd.notna(df_clean['Money Raised (in USD)']), 'Money Raised'] = \
        df_clean.loc[mask_usd & pd.isna(df_clean['Money Raised']) & pd.notna(df_clean['Money Raised (in USD)']), 'Money Raised (in USD)'] / avg_rate
    
    # 3. Appliquer le formatage des URLs avec get_domain
    df_clean['Website_formatted'] = df_clean['Organization Website'].apply(get_domain)
    
    # 3bis Changer le format des montants 

    df_clean['Money Raised'] = df_clean['Money Raised'].apply(lambda x: f"€M {x:,.0f}" if pd.notna(x) else x)  

    # 4. Créer le nouveau DataFrame avec les colonnes demandées
    df_final = pd.DataFrame({
        'Company Name': df_clean['Organization Name'],
        'Website 2': '',
        'Website': df_clean['Website_formatted'],
        'Description': df_clean['Organization Description'],
        'Secteur': df_clean['Organization Industries'],
        'Date annonce levée': '',
        'Montant': df_clean['Money Raised'],
        'Investisseurs': df_clean['Investor Names']
    })
    
    # Réinitialiser l'index
    df_final = df_final.reset_index(drop=True)
    
    return df_final, filtered_count


# Interface principale
st.title("Nettoyage de Données Crunchbase")
st.markdown("---")

st.markdown("""
### Instructions
        1. Téléchargez votre fichier CSV exporté depuis Crunchbase.
        2. Cliquez sur "Nettoyer les données" pour lancer le processus de nettoyage.
        3. Téléchargez les données nettoyées au format CSV ou Excel.
""")

st.markdown("---")

# Upload du fichier
uploaded_file = st.file_uploader(
    "Chargez votre fichier CSV Crunchbase",
    type=['csv'],
    help="Le fichier doit contenir les colonnes standard de Crunchbase"
)

if uploaded_file is not None:
    try:
        # Lecture du fichier
        df = pd.read_csv(uploaded_file)
        
        st.success(f"✅ Fichier chargé : {len(df)} lignes détectées")
        
        # Afficher un aperçu des données originales
        with st.expander("Aperçu des données originales"):
            st.dataframe(df.head(10), use_container_width=True)
        
        # Bouton de nettoyage
        if st.button("Nettoyer les données", type="primary", use_container_width=True):
            with st.spinner("Nettoyage en cours..."):
                # Nettoyage
                df_clean, filtered_count = clean_crunchbase_data(df)
                
                # Stocker dans session state
                st.session_state['df_clean'] = df_clean
                st.session_state['filtered_count'] = filtered_count
        
        # Afficher les résultats si disponibles
        if 'df_clean' in st.session_state:
            df_clean = st.session_state['df_clean']
            filtered_count = st.session_state['filtered_count']
            
            st.markdown("---")
            st.success("Nettoyage terminé !")
            
            # Statistiques
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Lignes initiales", len(df))
            with col2:
                st.metric("Lignes filtrées", filtered_count)
            with col3:
                st.metric("Lignes finales", len(df_clean))
            
            # Aperçu des données nettoyées
            st.subheader("Données nettoyées")
            st.dataframe(df_clean, use_container_width=True)
            
            # Boutons de téléchargement
            st.markdown("---")
            st.subheader("Télécharger les résultats")
            
            col1, col2 = st.columns(2)
            
            with col1:
                # CSV
                csv = df_clean.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label="Télécharger en CSV",
                    data=csv,
                    file_name="crunchbase_cleaned.csv",
                    mime="text/csv",
                    use_container_width=True
                )
            
    
    except Exception as e:
        st.error(f"❌ Erreur lors du traitement du fichier : {str(e)}")
        st.info("Vérifiez que votre fichier contient bien toutes les colonnes requises.")

else:
    st.info("Charger un fichier CSV")

# Footer
st.markdown("---")
st.markdown(
    """
    <div style='text-align: center; color: gray;'>
    Outil de nettoyage de données Crunchbase 360 Capital 
    </div>
    """,
    unsafe_allow_html=True
)

2025-10-13 17:27:43.410 
  command:

    streamlit run /Users/justinkim/Documents/GitHub/360capital/.venv/lib/python3.9/site-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [128]:
def custom(str : str):
    n = len(str)
    l = []
    for i in range(n):
        l.append(str[i])
        print(str[i])
    l.reverse()
    return((''.join(l)).strip())

custom('hello')


h
e
l
l
o


'olleh'

In [143]:
# check palyndrome

def palyiin(str):
    if str[::-1] == str:
        return True
    else : 
        return False

str = 'alaoaazeazea'
print(str[::-1])

aezaezaaoala


Find indices of two numbers that add up to a specific target in an array.

First we create a dictionary to store numbers and their indices as you iterate through the array. For each number, check if its complement (target minus the number) exists in the dictionary. If it does, return their indices.

In [147]:
def sum(a, target):
    test = a[0]
    index = []
    for i, value in enumerate(a):
        if test + value == target:
            index.append([a.index(test), i])
        else :
            test = value
    return(index)

print(sum([2, 7, 3, 15], 10))

[[1, 2]]


# Récupération Crunchbase API pour récupérer le CSV initial

# Récupération des informations Affinity pour calculer le fundraising ratio

In [8]:
curl "https://api.affinity.co/api_endpoint" -u :$MXTzh9IZ0vry24Yd0qzSM0WHgxF7pzQHoKTzhyELlhw

SyntaxError: invalid syntax (2578553415.py, line 1)

In [2]:
import pandas as pd

# LSN pré filtre

In [115]:
df = pd.read_csv("/Users/justinkim/Documents/GitHub/360capital/datas/extract_LSN - Feuille 1 (8).csv")

In [116]:
df

Unnamed: 0,CompanyName,Website,Linkedin,Description,Industry,DurationInRole,Status,Notes,Search account,Website.1,EmplyeeCount,Fullname,schoolname,schoolname2,schoolname3,CompanyName.1,CompanyName2,CompanyName3,Owner,Created
0,Embedia.io,,https://www.linkedin.com/in/ACwAAAOzNV8BdZKg_r...,,"Safouen SELMI, PMP®,AFSP®,RSASP®",Embedded Software Products,,,,,,,,,,,,2025-10-21,,
1,agepi.io,,https://www.linkedin.com/in/ACwAACMHbEEBo3t5uH...,,Severine MONCEL,Embedded Software Products,,,,,,,,,,,,2025-10-21,,
2,R500,,https://www.linkedin.com/in/ACwAAACszMMBfkPSgS...,,Dr. Olaf Hermans,Embedded Software Products,,,,,,,,,,,,2025-10-21,,
3,The Embedded Kit,,https://www.linkedin.com/in/ACwAACSESPkBxT4ZnR...,,Oriane Bruant,Embedded Software Products,,,,,,,,,,,,2025-10-21,,
4,Nelson,,https://www.linkedin.com/in/ACwAADTjHgYBGCtzay...,,Théo Soudan,Embedded Software Products,,,,,,,,,,,,2025-10-21,,
5,NEAT Protect,none.neatprotect,https://www.linkedin.com/in/ACwAAA6SBBABq5xlCx...,Neat is the leading embedded insurance Insurte...,Fabien Cazes,Insurance,,87.0,,,,,,,,,,2025-10-15,,
6,Hector AI,none.hectorai,https://www.linkedin.com/in/ACwAADz6PZsBaDIA3J...,Hector AI augmente votre pratique contentieuse...,Matthieu Kaeppelin,Software Development,,0.0,,,,,,,,,,2025-10-16,,
7,iyvo,none.iyvo,https://www.linkedin.com/in/ACwAAAgoZZ4BxzTirE...,Iyvo – La plateforme sans commission qui réuni...,Henvino DIFFO,Software Development,,7.0,,,,,,,,,,2025-10-18,,
8,Agentice,none.agentice,https://www.linkedin.com/in/ACwAAAaAkQwB-cRkAp...,,Axel Viard,Software Development,,3.0,,,,,,,,,,2025-10-18,,
9,OwnTrack,none.owntrack,https://www.linkedin.com/in/ACwAAADzHWoBebjV09...,"Fini les documents éparpillés, les process imp...",Karine Marini,Software Development,,3.0,,,,,,,,,,2025-10-18,,


In [7]:
import pandas as pd
from mistralai import Mistral

In [117]:


def classify_company_status(df, client, model):
    """
    Classifie les entreprises et met 'X' dans la colonne Status si elles ne correspondent 
    pas aux critères (France/Italie, ou Europe + climate tech, pas de consulting).
    
    Args:
        df: DataFrame contenant les données
        client: Client Mistral initialisé
        model: Nom du modèle Mistral à utiliser
    
    Returns:
        DataFrame avec la colonne Status mise à jour
    """
    
    def should_exclude(row):
        """
        Détermine si une entreprise doit être exclue (Status = X)
        """
        if pd.isna(row.get('Description')) or str(row.get('Description')).strip() == '':
            return None
        
        description = str(row['Description'])
        prompt = f"""

Analyze the company’s description below and determine whether to ‘EXCLURE’ or ‘GARDER’ based on the following criteria:

‘GARDER’ if:

The company is based in France or Italy.
The company operates in Europe and focuses on climate tech (renewable energy, decarbonization, etc.).
The company uses AI to address problems.
‘EXCLURE’ if:

The company is involved in consulting.
The company is non-profit or an association.
Respond only with ‘EXCLURE’ or ‘GARDER’.
If uncertain, choose ‘GARDER’.

Description: {description}


"""
        
        try:
            chat_response = client.chat.complete(
                model=model,
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ]
            )
            
            response = chat_response.choices[0].message.content.strip().upper()
            

            if "EXCLURE" in response:
                return 'X'
            else:
                return None  
                
        except Exception as e:
            print(f"Erreur lors de la classification: {e}")
            return None
    

    print("Classification en cours...")

    df_copy = df.copy()
    for idx, row in df_copy.iterrows():
        result = should_exclude(row)
        if result == 'X':
            df_copy.at[idx, 'Status'] = 'X'
            if idx % 10 == 0: 
                print(f"Traité {idx + 1}/{len(df_copy)} lignes")
    
    print("Classification terminée!")
    return df_copy


# Config
api_key = "tLYewB74Gq1R7krnmU2fYaRVoHCx8wfl"
model = "mistral-small-latest"

client = Mistral(
    server_url="https://api.05d3a00300de.dc.mistral.ai",
    api_key=api_key
)

df_classified = classify_company_status(df, client, model)
df_classified.to_csv('companies_classified.csv', index=False)

Classification en cours...


  df_copy.at[idx, 'Status'] = 'X'


Traité 41/57 lignes
Traité 51/57 lignes
Classification terminée!


In [118]:
col = ['CompanyName', 'Status']
df_classified[col].dropna()

Unnamed: 0,CompanyName,Status
7,iyvo,X
13,Sequoia Business Community,X
15,tikelt_uk,X
17,DeepKap Partners,X
23,KARELYTICS LLC,X
24,MTZ strategy.,X
27,Activate Games France,X
28,Kaliz,X
29,Primomanda,X
31,Agence Regard,X


In [55]:
def Fizzbuzz(a):
    if a%3 ==0 :
        return "Fizz"
    elif a%5 ==0 :
        return "Buzz"
    elif a%5 ==0 and a%3 ==0 :
        return "FizzBuzz"
    else :
        return a
        
Fizzbuzz(3)

'Fizz'