In [6]:
import streamlit as st
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import re
from io import BytesIO
from PIL import Image

# Charger et redimensionner le logo
logo = Image.open("360_capital_vc_logo.jpeg")
logo = logo.resize((64, 64))


# Configuration de la page
st.set_page_config(
    page_title="Nettoyage Données Crunchbase",
    page_icon=logo,
    layout="wide"
)

def get_domain(url):
    """Extrait le domaine d'une URL et le formate"""
    if pd.isna(url):
        return None
    try:
        domain = urlparse(url).netloc
        domain = re.sub(r'^www\d*\.', '', domain).split(':')[0]
        return domain.lower()
    except:
        return None

def clean_crunchbase_data(df):
    """
    Nettoie les données de levées de fonds Crunchbase
    
    Args:
        df: DataFrame avec les colonnes Crunchbase
        
    Returns:
        DataFrame nettoyé avec les colonnes formatées
    """
    # Créer une copie pour ne pas modifier l'original
    df_clean = df.copy()
    
    # 1. Filtrer les types de financement non désirés
    funding_types_to_remove = [
        'Corporate Round',
        'Grant',
        'Post-IPO Debt',
        'Equity Crowdfunding',
        'Debt Financing',
        'Convertible Note',
        'Series C'
    ]
    
    initial_count = len(df_clean)
    df_clean = df_clean[~df_clean['Funding Type'].isin(funding_types_to_remove)]
    filtered_count = initial_count - len(df_clean)
    
    # 2. Convertir les montants USD en devise originale
    mask_usd = df_clean['Money Raised Currency'] == 'USD'
    mask_has_both = pd.notna(df_clean['Money Raised']) & pd.notna(df_clean['Money Raised (in USD)'])
    
    # Calculer le taux de change moyen pour les lignes non-USD
    rates = df_clean[~mask_usd & mask_has_both].apply(
        lambda row: row['Money Raised (in USD)'] / row['Money Raised'] 
        if row['Money Raised'] != 0 else np.nan,
        axis=1
    )
    avg_rate = rates.median() if len(rates) > 0 else 1.0
    
    # Appliquer la conversion inverse pour les montants USD
    df_clean.loc[mask_usd & pd.isna(df_clean['Money Raised']) & pd.notna(df_clean['Money Raised (in USD)']), 'Money Raised'] = \
        df_clean.loc[mask_usd & pd.isna(df_clean['Money Raised']) & pd.notna(df_clean['Money Raised (in USD)']), 'Money Raised (in USD)'] / avg_rate
    
    # 3. Appliquer le formatage des URLs avec get_domain
    df_clean['Website_formatted'] = df_clean['Organization Website'].apply(get_domain)
    
    # 3bis Changer le format des montants 

    df_clean['Money Raised'] = df_clean['Money Raised'].apply(lambda x: f"€M {x:,.0f}" if pd.notna(x) else x)  

    # 4. Créer le nouveau DataFrame avec les colonnes demandées
    df_final = pd.DataFrame({
        'Company Name': df_clean['Organization Name'],
        'Website 2': '',
        'Website': df_clean['Website_formatted'],
        'Description': df_clean['Organization Description'],
        'Secteur': df_clean['Organization Industries'],
        'Date annonce levée': '',
        'Montant': df_clean['Money Raised'],
        'Investisseurs': df_clean['Investor Names']
    })
    
    # Réinitialiser l'index
    df_final = df_final.reset_index(drop=True)
    
    return df_final, filtered_count


# Interface principale
st.title("Nettoyage de Données Crunchbase")
st.markdown("---")

st.markdown("""
### Instructions
        1. Téléchargez votre fichier CSV exporté depuis Crunchbase.
        2. Cliquez sur "Nettoyer les données" pour lancer le processus de nettoyage.
        3. Téléchargez les données nettoyées au format CSV ou Excel.
""")

st.markdown("---")

# Upload du fichier
uploaded_file = st.file_uploader(
    "Chargez votre fichier CSV Crunchbase",
    type=['csv'],
    help="Le fichier doit contenir les colonnes standard de Crunchbase"
)

if uploaded_file is not None:
    try:
        # Lecture du fichier
        df = pd.read_csv(uploaded_file)
        
        st.success(f"✅ Fichier chargé : {len(df)} lignes détectées")
        
        # Afficher un aperçu des données originales
        with st.expander("Aperçu des données originales"):
            st.dataframe(df.head(10), use_container_width=True)
        
        # Bouton de nettoyage
        if st.button("Nettoyer les données", type="primary", use_container_width=True):
            with st.spinner("Nettoyage en cours..."):
                # Nettoyage
                df_clean, filtered_count = clean_crunchbase_data(df)
                
                # Stocker dans session state
                st.session_state['df_clean'] = df_clean
                st.session_state['filtered_count'] = filtered_count
        
        # Afficher les résultats si disponibles
        if 'df_clean' in st.session_state:
            df_clean = st.session_state['df_clean']
            filtered_count = st.session_state['filtered_count']
            
            st.markdown("---")
            st.success("Nettoyage terminé !")
            
            # Statistiques
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Lignes initiales", len(df))
            with col2:
                st.metric("Lignes filtrées", filtered_count)
            with col3:
                st.metric("Lignes finales", len(df_clean))
            
            # Aperçu des données nettoyées
            st.subheader("Données nettoyées")
            st.dataframe(df_clean, use_container_width=True)
            
            # Boutons de téléchargement
            st.markdown("---")
            st.subheader("Télécharger les résultats")
            
            col1, col2 = st.columns(2)
            
            with col1:
                # CSV
                csv = df_clean.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label="Télécharger en CSV",
                    data=csv,
                    file_name="crunchbase_cleaned.csv",
                    mime="text/csv",
                    use_container_width=True
                )
            
    
    except Exception as e:
        st.error(f"❌ Erreur lors du traitement du fichier : {str(e)}")
        st.info("Vérifiez que votre fichier contient bien toutes les colonnes requises.")

else:
    st.info("Charger un fichier CSV")

# Footer
st.markdown("---")
st.markdown(
    """
    <div style='text-align: center; color: gray;'>
    Outil de nettoyage de données Crunchbase 360 Capital 
    </div>
    """,
    unsafe_allow_html=True
)

2025-10-13 17:27:43.410 
  command:

    streamlit run /Users/justinkim/Documents/GitHub/360capital/.venv/lib/python3.9/site-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [128]:
def custom(str : str):
    n = len(str)
    l = []
    for i in range(n):
        l.append(str[i])
        print(str[i])
    l.reverse()
    return((''.join(l)).strip())

custom('hello')


h
e
l
l
o


'olleh'

In [143]:
# check palyndrome

def palyiin(str):
    if str[::-1] == str:
        return True
    else : 
        return False

str = 'alaoaazeazea'
print(str[::-1])

aezaezaaoala


In [21]:
from collections import Counter

l = [1,1,1]

print(Counter(l))

# Pour un dataframe

df.value_counts()

Counter({1: 3})


Series([], Name: count, dtype: int64)

Find indices of two numbers that add up to a specific target in an array.

First we create a dictionary to store numbers and their indices as you iterate through the array. For each number, check if its complement (target minus the number) exists in the dictionary. If it does, return their indices.

In [147]:
def sum(a, target):
    test = a[0]
    index = []
    for i, value in enumerate(a):
        if test + value == target:
            index.append([a.index(test), i])
        else :
            test = value
    return(index)

print(sum([2, 7, 3, 15], 10))

[[1, 2]]


In [23]:
# faire la sum de deux array numpy 
import numpy as np
arr1 = np.array([1, 2])
arr2 = np.array([4, 5])
result = np.add(arr1, arr2)

# extract diag 
import numpy as np
matrix = np.array([[1, 2, 3], [4, 5, 6]])
print(np.diagonal(matrix))

[1 5]


Create a Class to Represent a Person with Basic Attributes.

__init__(self, name, age) initializes the Person object with a name and age.
birthday(self) increases the person's age by 1.
__str__(self) provides a human-readable string representation of the Person object.

In [27]:
class Person:
    def __init__(self, age, name):
        self.age = age
        self.name = name
    
    def birthday(self):
        self.age += 1

    def str(self):
        return(f"Name : {self.name} and Age : {self.age}")
    
perso = Person(12, "Alice")
print(perso.str())
perso.birthday()
print(perso.str())

Name : Alice and Age : 12
Name : Alice and Age : 13


Implement a sliding window to find the maximum sum of a subarray of a given size k.

In [6]:
def subarray(arr, k):
    max = 0
    for i in range(len(arr)):
        if max < sum(arr[i:i+k]):
            max = sum(arr[i:i+k])

    return(max)

print(subarray([2, 1, 5, 1, 3, 2], 3))

9


 Calculate the confidence interval for a given dataset (assume normal distribution).

In [9]:
!pip install scipy

Collecting scipy
  Using cached scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (60 kB)
Using cached scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl (30.3 MB)
Installing collected packages: scipy
Successfully installed scipy-1.13.1


In [10]:
import numpy as np
from scipy.stats import norm

def confidence_interval(data, confidence=0.95):
    mean, std = np.mean(data), np.std(data, ddof=1)
    z = norm.ppf((1 + confidence) / 2)
    margin_of_error = z * (std / np.sqrt(len(data)))
    return mean - margin_of_error, mean + margin_of_error


print(confidence_interval([1, 2, 3, 4, 5]))

(1.614096175650322, 4.385903824349678)


 Implement the Chi-squared test for independence on a contingency table.

Calculate the Chi-squared statistic by comparing observed and expected frequencies in the contingency table.

In [11]:
import numpy as np
from scipy.stats import chi2_contingency

def chi_squared_test(contingency_table):
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    return chi2, p


table = [[10, 20], [20, 40]]
print(chi_squared_test(table))

(0.0, 1.0)


Write a function to handle missing data using multiple imputation.

we use Simple Imputer to replace missing values with the mean or another strategy. Below is the code:

In [13]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl (11.1 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed joblib-1.5.2 scikit-learn-1.6.1 threadpoolctl-3.6.0


In [16]:
from sklearn.impute import SimpleImputer
import numpy as np

def impute_missing_data(data):
    imputer = SimpleImputer(strategy='most_frequent')
    return imputer.fit_transform(data)

data = np.array([[1, 2], [np.nan, 3], [7, 6]])
print(impute_missing_data(data))

[[1. 2.]
 [1. 3.]
 [7. 6.]]


 Group a dataset by a column and calculate the rolling average for another column.

Use pandas.groupby and rolling to calculate rolling averages.

In [19]:
import pandas as pd

In [None]:
import pandas as pd

data = {
    'Group': ['A', 'A', 'B', 'B', 'C'],
    'Value': [10, 20, 30, 40, 50],
    'Score': [1.5, 2.5, 3.5, 4.5, 5.5]
}
df = pd.DataFrame(data)
print(df)

result = df.groupby('Group').agg({
    'Score': lambda x: x.max() - x.min().min() # la colonne doit exister 
}).reset_index() # à toujours rajouter
print(result)

  Group  Value  Score
0     A     10    1.5
1     A     20    2.5
2     B     30    3.5
3     B     40    4.5
4     C     50    5.5
  Group  Score
0     A    1.0
1     B    1.0
2     C    0.0


In [47]:
df.iloc[:2]


Unnamed: 0,Group,Value,Averages
0,A,10,10.0
1,A,20,15.0


In [49]:
df['Averages'] = df.groupby('Group')['Value'].rolling(window=2, min_periods=1).mean().reset_index(drop=True)
df

Unnamed: 0,Group,Value,Averages
0,A,10,10.0
1,A,20,15.0
2,B,30,30.0
3,B,40,35.0


Create a pivot table from raw transactional data.

Use pandas.pivot_table to summarize data into a pivot table.

In [58]:
import pandas as pd

def create_pivot_table(df, index, columns, values, aggfunc):
    return pd.pivot_table(df, index=index, columns=columns, values=values, aggfunc=aggfunc)

data = {'Category': ['A', 'A', 'B'], 'Type': ['X', 'Y', 'X'], 'Value': [10, 20, 30]}
df = pd.DataFrame(data)


pivot_table = create_pivot_table(df, index='Category', columns='Type', values='Value', aggfunc='sum')
print(pivot_table)

Type         X     Y
Category            
A         10.0  20.0
B         30.0   NaN


In [62]:
import pandas as pd

data = { 'First_name': ['Liam', 'Emma', 'Noah', 'Olivia', 'Ava'],
         'Last_name': ['Smith', 'Brown', 'Davis', 'Wilson', 'Taylor'],
         'Age': [42, 52, 36, 21, 23],
         'City': ['New York', 'Paris', 'Berlin', 'Madrid', 'Rome'] }
df = pd.DataFrame(data)
print(df)

new_vals = {"Liam": "MBA", "Emma": "PhD", "Noah": "LLB", "Olivia": "B.Tech", "Ava": "MD"}
df["Qualification"] = df["First_name"].map(new_vals)
print(df)

new_vals = {"Liam": "Lucas", "Noah": "Nathan", "Olivia": "Olive"}
df_replaced = df.replace({"First_name": new_vals})
print(df_replaced)

new_vals = {0: "Lukas", 2: "Nicolas", 3: "Sophia"}
df["First_name"].update(pd.Series(new_vals))
print(df)

  First_name Last_name  Age      City
0       Liam     Smith   42  New York
1       Emma     Brown   52     Paris
2       Noah     Davis   36    Berlin
3     Olivia    Wilson   21    Madrid
4        Ava    Taylor   23      Rome
  First_name Last_name  Age      City Qualification
0       Liam     Smith   42  New York           MBA
1       Emma     Brown   52     Paris           PhD
2       Noah     Davis   36    Berlin           LLB
3     Olivia    Wilson   21    Madrid        B.Tech
4        Ava    Taylor   23      Rome            MD
  First_name Last_name  Age      City Qualification
0      Lucas     Smith   42  New York           MBA
1       Emma     Brown   52     Paris           PhD
2     Nathan     Davis   36    Berlin           LLB
3      Olive    Wilson   21    Madrid        B.Tech
4        Ava    Taylor   23      Rome            MD
  First_name Last_name  Age      City Qualification
0      Lukas     Smith   42  New York           MBA
1       Emma     Brown   52     Paris       

In [77]:
# import pandas module
import pandas as pd

# making dataframe
df = pd.read_csv("https://media.geeksforgeeks.org/wp-content/uploads/nba.csv")

# it was print the first 5-rows
print(df.head())

# reshape the dataframe using stack() method
df_stacked = df.stack()

print(df_stacked[0].keys())
df_stacked[0].items()

            Name            Team  Number Position   Age Height  Weight  \
0  Avery Bradley  Boston Celtics     0.0       PG  25.0    6-2   180.0   
1    Jae Crowder  Boston Celtics    99.0       SF  25.0    6-6   235.0   
2   John Holland  Boston Celtics    30.0       SG  27.0    6-5   205.0   
3    R.J. Hunter  Boston Celtics    28.0       SG  22.0    6-5   185.0   
4  Jonas Jerebko  Boston Celtics     8.0       PF  29.0   6-10   231.0   

             College     Salary  
0              Texas  7730337.0  
1          Marquette  6796117.0  
2  Boston University        NaN  
3      Georgia State  1148640.0  
4                NaN  5000000.0  
Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')


<zip at 0x16a122f00>

In [83]:
for key, element in enumerate(df):
    print(key, element)
# unstack() method
df_unstacked = df_stacked.unstack()
print(df_unstacked.head(10))

0 Name
1 Team
2 Number
3 Position
4 Age
5 Height
6 Weight
7 College
8 Salary
            Name            Team Number Position   Age Height Weight  \
0  Avery Bradley  Boston Celtics    0.0       PG  25.0    6-2  180.0   
1    Jae Crowder  Boston Celtics   99.0       SF  25.0    6-6  235.0   
2   John Holland  Boston Celtics   30.0       SG  27.0    6-5  205.0   
3    R.J. Hunter  Boston Celtics   28.0       SG  22.0    6-5  185.0   
4  Jonas Jerebko  Boston Celtics    8.0       PF  29.0   6-10  231.0   
5   Amir Johnson  Boston Celtics   90.0       PF  29.0    6-9  240.0   
6  Jordan Mickey  Boston Celtics   55.0       PF  21.0    6-8  235.0   
7   Kelly Olynyk  Boston Celtics   41.0        C  25.0    7-0  238.0   
8   Terry Rozier  Boston Celtics   12.0       PG  22.0    6-2  190.0   
9   Marcus Smart  Boston Celtics   36.0       PG  22.0    6-4  220.0   

             College      Salary  
0              Texas   7730337.0  
1          Marquette   6796117.0  
2  Boston University     

In [86]:

df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [103]:
df_copy = df.copy()

df_copy = df_copy.query('Age < 25')
df_copy

indices_to_drop = df_copy[df_copy['Weight'] < 185].index # récupérer les index
df.drop(indices_to_drop, inplace=True)
print(df.head(10))

            Name            Team  Number Position   Age Height  Weight  \
0  Avery Bradley  Boston Celtics     0.0       PG  25.0    6-2   180.0   
1    Jae Crowder  Boston Celtics    99.0       SF  25.0    6-6   235.0   
2   John Holland  Boston Celtics    30.0       SG  27.0    6-5   205.0   
3    R.J. Hunter  Boston Celtics    28.0       SG  22.0    6-5   185.0   
4  Jonas Jerebko  Boston Celtics     8.0       PF  29.0   6-10   231.0   
5   Amir Johnson  Boston Celtics    90.0       PF  29.0    6-9   240.0   
6  Jordan Mickey  Boston Celtics    55.0       PF  21.0    6-8   235.0   
7   Kelly Olynyk  Boston Celtics    41.0        C  25.0    7-0   238.0   
8   Terry Rozier  Boston Celtics    12.0       PG  22.0    6-2   190.0   
9   Marcus Smart  Boston Celtics    36.0       PG  22.0    6-4   220.0   

             College      Salary  
0              Texas   7730337.0  
1          Marquette   6796117.0  
2  Boston University         NaN  
3      Georgia State   1148640.0  
4         

In [97]:
df.iloc[1,:]

Name           Jae Crowder
Team        Boston Celtics
Number                99.0
Position                SF
Age                   25.0
Height                 6-6
Weight               235.0
College          Marquette
Salary           6796117.0
Name: 1, dtype: object

In [113]:
df.loc[0:1] # ici les index sont des nombres mais la différence c'est qu'on peut utiliser les noms

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0


In [116]:
df.query('Position != "PG"')

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
...,...,...,...,...,...,...,...,...,...
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [114]:
df.loc[df['Position'] != 'PG']

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
...,...,...,...,...,...,...,...,...,...
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [88]:
df.shape[0], df.shape[1]

(458, 9)

In [100]:
row = []
for i in range(df.shape[0]):
    row.append(list(df.iloc[i,:]))

print(row[0])

['Avery Bradley', 'Boston Celtics', 0.0, 'PG', 25.0, '6-2', 180.0, 'Texas', 7730337.0]


insérer des élements à n'importe quel endroit 

In [118]:
new_row2 = ['Jae Cr','Boston Celtics',	99.0,	'SF'	,25.0,	6-6,	235.0	,'Marquette',	6796117.0]

# Copy original DataFrame
df2 = df.copy()

# Insert row at position 1
df2.loc[1.5] = new_row2
df2 = df2.sort_index().reset_index(drop=True)
df2

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,Jae Cr,Boston Celtics,99.0,SF,25.0,0,235.0,Marquette,6796117.0
3,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
4,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
...,...,...,...,...,...,...,...,...,...
446,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
447,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
448,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
449,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [123]:
result = df.sort_values(by=['Age', 'Weight'], ascending=True)  
result

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
226,Rashad Vaughn,Milwaukee Bucks,20.0,SG,19.0,6-6,202.0,UNLV,1733040.0
122,Devin Booker,Phoenix Suns,1.0,SG,19.0,6-6,206.0,Kentucky,2127840.0
445,Dante Exum,Utah Jazz,11.0,PG,20.0,6-6,190.0,,3777720.0
116,D'Angelo Russell,Los Angeles Lakers,1.0,PG,20.0,6-5,195.0,Ohio State,5103120.0
401,Tyus Jones,Minnesota Timberwolves,1.0,PG,20.0,6-2,195.0,Duke,1282080.0
...,...,...,...,...,...,...,...,...,...
261,Vince Carter,Memphis Grizzlies,15.0,SG,39.0,6-6,220.0,North Carolina,4088019.0
304,Andre Miller,San Antonio Spurs,24.0,PG,40.0,6-3,200.0,Utah,250750.0
400,Kevin Garnett,Minnesota Timberwolves,21.0,PF,40.0,6-11,240.0,,8500000.0
298,Tim Duncan,San Antonio Spurs,21.0,C,40.0,6-11,250.0,Wake Forest,5250000.0


In [128]:
df['Rank'] = df['Salary'].rank(method='average', ascending=False) # pour rank par rapport à une features 
df.sort_values(by='Rank').head(5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Rank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000.0,1.0
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500.0,2.0
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000.0,3.0
251,Dwight Howard,Houston Rockets,12.0,C,30.0,6-11,265.0,,22359364.0,4.0
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730.0,5.0


# PySpark

In [129]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-4.0.1.tar.gz (434.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.2/434.2 MB[0m [31m53.3 MB/s[0m  [33m0:00:07[0m:00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting py4j==0.10.9.9 (from pyspark)
  Downloading py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-4.0.1-py2.py3-none-any.whl size=434813860 sha256=b5c5785be07adfcdddeae573b9a2b18bb5347becc5ed9da341396bacd0d7e69f
  Stored in directory: /Users/justinkim/Library/Caches/pip/wheels/10/e6/6b/c50eb601fa827dd56a5272db5d5db360e559e527a80a665b1d
Successfully built pyspark
Installing collected packag

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, when, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# 1. Initialisation de la session Spark
spark = SparkSession.builder \
    .appName("ExempleDataFrameSpark") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# 2. Définition d'un schéma pour le DataFrame (optionnel, pour un contrôle précis)
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("nom", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("salaire", DoubleType(), True),
    StructField("ville", StringType(), True)
])

# 3. Création d'un DataFrame à partir de données brutes (si pas de fichier CSV)
data = [
    (1, "Alice", 25, 50000.0, "Paris"),
    (2, "Bob", 30, 60000.0, "Lyon"),
    (3, "Charlie", 35, 75000.0, "Marseille"),
    (4, "David", 28, 55000.0, "Paris"),
    (5, "Emma", 40, 80000.0, None)
]

df = spark.createDataFrame(data, schema)

# Alternativement, charger un DataFrame depuis un fichier CSV
# df = spark.read.csv("path/to/data.csv", header=True, schema=schema)

# 4. Afficher le schéma du DataFrame
print("Schéma du DataFrame :")
df.printSchema()

# 5. Afficher les premières lignes
print("Aperçu des données :")
df.show(5, truncate=False)

# 6. Exemple de transformations
# a. Filtrer les lignes où l'âge est supérieur à 30
df_filtre = df.filter(col("age") > 30)
print("Personnes de plus de 30 ans :")
df_filtre.show()

# b. Ajouter une nouvelle colonne basée sur une condition
df = df.withColumn("categorie_age", 
                   when(col("age") < 30, "Jeune")
                   .when(col("age") <= 35, "Adulte")
                   .otherwise("Senior"))
print("DataFrame avec nouvelle colonne :")
df.show()

# c. Grouper et agréger : calculer le salaire moyen par ville
df_agg = df.groupBy("ville").agg(
    avg("salaire").alias("salaire_moyen"),
    count("id").alias("nombre_personnes")
)
print("Salaire moyen et nombre de personnes par ville :")
df_agg.show()

# 7. Gestion des valeurs manquantes
# Remplacer les valeurs nulles dans la colonne 'ville' par 'Inconnu'
df = df.na.fill({"ville": "Inconnu"})
print("DataFrame après remplacement des valeurs nulles :")
df.show()

# 8. Jointure avec un autre DataFrame
# Création d'un DataFrame pour les départements
data_dep = [(1, "Paris", "Île-de-France"), (2, "Lyon", "Auvergne-Rhône-Alpes"), (3, "Inconnu", "Inconnu")]
schema_dep = StructType([
    StructField("id_dep", IntegerType(), False),
    StructField("ville", StringType(), True),
    StructField("region", StringType(), True)
])
df_dep = spark.createDataFrame(data_dep, schema_dep)

# Jointure sur la colonne 'ville'
df_joined = df.join(df_dep, "ville", "left")
print("DataFrame après jointure :")
df_joined.show()

# 9. Sauvegarde des résultats
# Sauvegarder le DataFrame transformé en format Parquet
df.write.mode("overwrite").parquet("output/transformed_data")

# Sauvegarder les résultats agrégés en CSV
df_agg.write.mode("overwrite").csv("output/aggregated_data", header=True)

# 10. Utilisation de SQL avec Spark
# Créer une vue temporaire pour exécuter des requêtes SQL
df.createOrReplaceTempView("personnes")
result_sql = spark.sql("""
    SELECT ville, AVG(salaire) as salaire_moyen, COUNT(*) as nombre
    FROM personnes
    GROUP BY ville
    HAVING COUNT(*) > 1
""")
print("Résultat de la requête SQL :")
result_sql.show()

# 11. Arrêter la session Spark
spark.stop()

# CNN 

In [None]:
import tensorflow as tf

from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt

In [None]:
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()

# Normalize pixel values to be between 0 and 1
train_images, test_images = train_images / 255.0, test_images / 255.0

class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']

plt.figure(figsize=(10,10))
for i in range(25):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[i])
    # The CIFAR labels happen to be arrays, 
    # which is why you need the extra index
    plt.xlabel(class_names[train_labels[i][0]])
plt.show()

In [None]:
model = models.Sequential()

In [338]:
import pandas as pd
import numpy as np

# data of 2018 drivers world championship
dict1 = {'Driver': ['Hamilton', 'Vettel', 'Raikkonen',
                    'Verstappen', 'Bottas', 'Ricciardo',
                    'Hulkenberg', 'Perez', 'Magnussen',
                    'Sainz', 'Alonso', 'Ocon', 'Leclerc',
                    'Grosjean', 'Gasly', 'Vandoorne',
                    'Ericsson', 'Stroll', 'Hartley', 'Sirotkin'],

         'Points': [408, 320, 251, 249, 247, 170, 69, 62, 56,
                    53, 50, 49, 39, 37, 29, 12, 9, 6, 4, 1],

         'Age': [33, 31, 39, 21, 29, 29, 31, 28, 26, 24, 37,
                 22, 21, 32, 22, 26, 28, 20, 29, 23]}

# creating dataframe using DataFrame constructor
df = pd.DataFrame(dict1)
print(df.head(10))

       Driver  Points  Age
0    Hamilton     408   33
1      Vettel     320   31
2   Raikkonen     251   39
3  Verstappen     249   21
4      Bottas     247   29
5   Ricciardo     170   29
6  Hulkenberg      69   31
7       Perez      62   28
8   Magnussen      56   26
9       Sainz      53   24


In [258]:
df['Points'] 
df.Points

0     408
1     320
2     251
3     249
4     247
5     170
6      69
7      62
8      56
9      53
10     50
11     49
12     39
13     37
14     29
15     12
16      9
17      6
18      4
19      1
Name: Points, dtype: int64

In [347]:
df.nsmallest(5, ['Age'])

Unnamed: 0,Driver,Points,Age,Nul,Statut,stat,Score Normalisé par age,Autre point
17,Stroll,6,20,0,Yes,A le droit,0.3,19.4
3,Verstappen,249,21,0,Yes,A le droit,8.517241,-3.9
12,Leclerc,39,21,0,Yes,A le droit,1.351351,17.1
11,Ocon,49,22,0,Yes,A le droit,1.857143,17.1
14,Gasly,29,22,0,Yes,A le droit,1.15625,19.1


In [None]:
# Select une colonne 
df.Points
df['Points']

# Select une row
df.iloc[1]
# Select plusieurs rows
df.iloc[0:3]

# Select randomly rows
sampled_df = df.sample(frac=0.5).reset_index().drop('index', axis=1)

# Select row that contains a certain string 

#print(df[df["Driver"].str.contains("ami")]) # on peut mettre un OR contient result = df[df["Team"].str.contains("Boston") | df["College"].str.contains("MIT")] 

# Create a list, a dict from rows

#print(df.value_counts())
#print(df.values.tolist())
#print(df.to_numpy().tolist())
#print(df.to_dict(orient='records')) # list of dict
#print([list(row) for row in df.itertuples(index=False)]) # pour les grandes base de données

# Drop une colonne 
#df = df.drop(df.iloc[:, 1:3], axis=1)
#df = df.drop('Points', axis=1)
#df.pop('col')
# Drop columns with more than 50% missing values
#threshold = len(df) * 0.5
#df = df.dropna(thresh=threshold, axis=1)

# Créer une colonne 
l_a_rajouter = [0 for i in range(len(df))]
df['Nul'] = l_a_rajouter

# Pour itérer sur une colonne et en créer une nouvelle avec un statut, High salary par exemple
statut = []
for age in df['Age']:
    if age>18:
        statut.append('Yes')
df['Statut'] = statut

bins = [0,18,40]
lab = ['Non', 'A le droit']
df['stat'] = pd.cut(df['Age'], bins = bins, labels = lab) # possible de le faire avce binning

# pour itérer sur le nom des colonnes 

#for col in df.columns:
    #print(col)


# Récupérer le nom des colonnes

#print(df.columns.to_list())
#print(sorted(df.columns.values))
#print(df.keys())

# récupérer les values unique d'une colonne 
print(sorted(df['Age'].unique()))
print(df['Age'].nunique())
print(df['Age'].value_counts())
print(df['Age'].value_counts().max())
df.groupby('Age').size()
pd.crosstab(index=df['Age'], columns='count')


# Modifier index avec une colonne
d = df.copy()
d.index = d.pop("Driver")
#df.set_index('Age')
#print(d)

# Get l'index du max dans un dataframe

i_min , min = 0, df['Age'][0]
for i in range(len(df)):
    if df['Age'][i]<min:
        min = df['Age'][i]
        i_min = i
print(i_min, min)

df.min()
df[df.Points == df.Points.min()]

df[['Age']].idxmax() # PLUS SIMPLE POUR LINDEX DU MAX
df.nlargest(5, ['Age']) # Pour avoir accès au n value les plus grandes
df.nsmallest(5, ['Age'])

# Rename les colonnes 

df_co = df.copy()
df_co.rename(columns={'Driver' : 'A', 'Points' : 'B' , 'Age': 'C',  'Nul': 'D',  'Statut': 'E',  'stat': 'F'}, inplace=True)

l = [i for i in range(len(df_co.columns.to_list()))]
df_co.columns = l

#df_co.add_prefix('new_') 

# Duplicates 
df['Age'].drop_duplicates()
unique_set = set(df['Age'])
print(unique_set)

# Créer une nouvelle colonne fonction des autres
max = df['Points'].max() 
df["Score Normalisé par age"] = df.apply(lambda x : x['Points']/x['Age'], axis = 1  )
sorted(df["Score Normalisé par age"].to_numpy(), reverse=True)
df["Score Normalisé par age"] = sorted(df["Score Normalisé par age"].to_numpy(), reverse=True)

df['Autre point'] = df['Age'] - (0.1 * df['Points'])



[20, 21, 22, 23, 24, 26, 28, 29, 31, 32, 33, 37, 39]
13
Age
29    3
31    2
21    2
28    2
26    2
22    2
33    1
39    1
24    1
37    1
32    1
20    1
23    1
Name: count, dtype: int64
3
17 20


AttributeError: 'DataFrame' object has no attribute 'Points'

In [311]:
# Create the dataframe
df = pd.DataFrame({'Date':['10/2/2011', '11/2/2011', '12/2/2011', '13/2/2011'],
                   'Product':['Umbrella', 'Mattress', 'Badminton', 'Shuttle'],
                   'Last Price':[1200, 1500, 1600, 352],
                   'Updated Price':[1250, 1450, 1550, 400],
                   'Discount':[10, 10, 10, 10]})
df

Final_cost = []
for element in df['Updated Price']:
    if element != 'NaN' :
        Final_cost.append(element*(0.9))
df['Final Price'] = Final_cost
df

# Check si la colonne est présente 
if {'Updated Price', 'Discount'}.issubset(df.columns):
    df['Final cost'] = df['Updated Price'] - (df['Updated Price']*0.1)

elif {'Last Price', 'Discount'}.issubset(df.columns):
    df['Final cost'] = df['Last Price'] - (df['Last Price']*0.1)

# Create the dataframe
df = pd.DataFrame({'Date':['10/2/2011', '11/2/2011', '12/2/2011', '13/2/2011'],
                   'Product':['Umbrella', 'Mattress', 'Badminton', 'Shuttle'],
                   'Last_Price':[1200, 1500, 1600, 352],
                   'Updated_Price':[1250, 1450, 1550, 400],
                   'Discount':[10, 10, 10, 10]})

# Create the indexes
df.index =[f'Item {i}' for i in range(len(df))]

# recherche dans les index qui satisfait une condition 
df
df.query('Updated_Price > 1250').index.to_list()

['Item 1', 'Item 2']

In [None]:
# Pour remplacer des valeurs dans une colonne 
# Define an incomplete dictionary
df = pd.DataFrame({'Date':['10/2/2011', '11/2/2011', '12/2/2011', '13/2/2011'],
                    'Event':['Music', 'Poetry', 'Theatre', 'Comedy'],
                    'Cost':[10000, 5000, 15000, 2000]})
d = {'Music': 'M', 'Poetry': 'P'}

# Apply map() and handle missing values
df['Event'] = df['Event'].map(d).fillna('Unknown')

# split des tring colonnes
df = pd.DataFrame({
    'Name': ['John Larter', 'Robert Junior', 'Jonny Depp'],
    'Age': [32, 34, 36]
})



            Name  Age
0    John Larter   32
1  Robert Junior   34
2     Jonny Depp   36


# Récupération Crunchbase API pour récupérer le CSV initial

# Récupération des informations Affinity pour calculer le fundraising ratio

In [8]:
curl "https://api.affinity.co/api_endpoint" -u :$MXTzh9IZ0vry24Yd0qzSM0WHgxF7pzQHoKTzhyELlhw

SyntaxError: invalid syntax (2578553415.py, line 1)

In [1]:
import pandas as pd

# LSN pré filtre

In [141]:
import pandas as pd
from mistralai import Mistral

df = pd.read_csv("/Users/justinkim/Documents/GitHub/360capital/datas/extract_LSN - Feuille 1 (16).csv")


def classify_company_status(df, client, model):
    """
    Classifie les entreprises et met 'X' dans la colonne Status si elles ne correspondent 
    pas aux critères (France/Italie, ou Europe + climate tech, pas de consulting).
    
    Args:
        df: DataFrame contenant les données
        client: Client Mistral initialisé
        model: Nom du modèle Mistral à utiliser
    
    Returns:
        DataFrame avec la colonne Status mise à jour
    """
    
    def should_exclude(row):
        """
        Détermine si une entreprise doit être exclue (Status = X)
        """
        if pd.isna(row.get('Description')) or str(row.get('Description')).strip() == '':
            return None
        
        description = str(row['Description'])
        prompt = f"""

Analyze the company’s description below and determine whether to ‘EXCLURE’ or ‘GARDER’ based on the following criteria:

‘GARDER’ if:

The company is based in France or Italy.
The company operates in Europe and focuses on climate tech (renewable energy, decarbonization, etc.).
The company uses AI to address problems, or technology such as deeptech.
‘EXCLURE’ if:

The company is involved in consulting.
The company is non-profit or an association.
Respond only with ‘EXCLURE’ or ‘GARDER’.
If uncertain, choose ‘GARDER’.

Description: {description}


"""
        
        try:
            chat_response = client.chat.complete(
                model=model,
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ]
            )
            
            response = chat_response.choices[0].message.content.strip().upper()
            

            if "EXCLURE" in response:
                return 'X'
            else:
                return None  
                
        except Exception as e:
            print(f"Erreur lors de la classification: {e}")
            return None
    

    print("Classification en cours...")

    df_copy = df.copy()
    for idx, row in df_copy.iterrows():
        result = should_exclude(row)
        if result == 'X':
            df_copy.at[idx, 'Status'] = 'X'
            if idx % 10 == 0: 
                print(f"Traité {idx + 1}/{len(df_copy)} lignes")
    
    print("Classification terminée!")
    return df_copy


# Config
api_key = "tLYewB74Gq1R7krnmU2fYaRVoHCx8wfl"
model = "mistral-small-latest"

client = Mistral(
    server_url="https://api.05d3a00300de.dc.mistral.ai",
    api_key=api_key
)

df_classified = classify_company_status(df, client, model)
df_classified.to_csv('companies_classified.csv', index=False)

Classification en cours...


  df_copy.at[idx, 'Status'] = 'X'


Traité 11/110 lignes
Traité 21/110 lignes
Traité 31/110 lignes
Traité 41/110 lignes
Traité 71/110 lignes
Traité 91/110 lignes
Traité 101/110 lignes
Classification terminée!


In [19]:
col = ['CompanyName', 'Status']
len(df_classified[col].dropna())

213

In [55]:
def Fizzbuzz(a):
    if a%3 ==0 :
        return "Fizz"
    elif a%5 ==0 :
        return "Buzz"
    elif a%5 ==0 and a%3 ==0 :
        return "FizzBuzz"
    else :
        return a
        
Fizzbuzz(3)

'Fizz'