In [1]:
import pandas as pd
import requests
from io import StringIO
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from shapely.geometry import Point
import folium

url = "https://raw.githubusercontent.com/Amina212004/MINI_PROJET_ML/refs/heads/data_branch/data/meteorites_final.csv"
token = "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"  

headers = {"Authorization": f"token {token}"}
r = requests.get(url, headers=headers)
r.raise_for_status()  

df = pd.read_csv(StringIO(r.text))
df.head()


Unnamed: 0,name,year_period,year,recclass,continent,country,mass_cleaned,mass_bin,recclass_clean,fall,reclat,reclong
0,Aachen,19th Century,1880.0,L5,Europe,Belgium,21.0,10-100g,L5,Fell,50.775,6.08333
1,Aarhus,20th Century,1951.0,H6,Europe,Denmark,720.0,100-1kg,H6,Fell,56.18333,10.23333
2,Abee,20th Century,1952.0,EH4,North America,Canada,50370.0,>10kg,H4,Fell,54.21667,-113.0
3,Acapulco,20th Century,1976.0,Acapulcoite,North America,Mexico,1914.0,1-10kg,OTHER,Fell,16.88333,-99.9
4,Achiras,20th Century,1902.0,L6,South America,Argentina,780.0,100-1kg,L6,Fell,-33.16667,-64.95


In [15]:
import pandas as pd
import numpy as np
import folium
from mlxtend.frequent_patterns import apriori, association_rules

# -----------------------------
# Apriori
# -----------------------------
df_apriori = pd.get_dummies(df[['year_period','mass_bin','continent','country','recclass_clean']])
frequent_itemsets = apriori(df_apriori, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.3)

# -----------------------------
# Filtrer règles selon critères choisis
# -----------------------------
# -----------------------------
# Filtrer règles selon critères choisis
# -----------------------------
def filter_rules(rules, years=None, mass_bins=None, continents=None):
    user_criteria = []

    # Années (utiliser df['year'] pour intervalle)
    if years:
        year_vals = df['year'].dropna().unique()
        for y in years:
            if isinstance(y, (list, tuple)):  # intervalle d'années
                user_criteria += [f'year_period_{df.loc[df["year"] == yy, "year_period"].values[0]}'
                                  for yy in year_vals if y[0] <= yy <= y[1]]
            else:  # année exacte
                matching_periods = df.loc[df['year'] == y, 'year_period'].unique()
                user_criteria += [f'year_period_{yp}' for yp in matching_periods]

    # Masses
    if mass_bins:
        mass_vals = df['mass_bin'].unique()
        for m in mass_bins:
            if isinstance(m, (list, tuple)):
                for mb in mass_vals:
                    try:
                        low, high = [float(x.replace('g','').replace('kg','000')) for x in mb.split('-')]
                        if low >= m[0] and high <= m[1]:
                            user_criteria.append(f'mass_bin_{mb}')
                    except:
                        continue
            else:
                user_criteria.append(f'mass_bin_{m}')

    # Continents
    if continents:
        user_criteria += [f'continent_{c}' for c in continents]

    if not user_criteria:
        return rules

    # Intersection stricte
    filtered = rules[rules['antecedents'].apply(lambda x: all(item in x for item in user_criteria))]
    return filtered


# -----------------------------
# Type le plus probable avec score combiné
# -----------------------------
def get_most_probable_type(filtered_rules, df_subset):
    type_scores = {}
    if not filtered_rules.empty:
        for _, row in filtered_rules.iterrows():
            for item in row['consequents']:
                if 'recclass_clean_' in item:
                    type_name = item.replace('recclass_clean_', '')
                    type_scores[type_name] = type_scores.get(type_name, 0) + row['confidence'] * row['support']

    if type_scores:
        top_type = max(type_scores, key=type_scores.get)
        prob = type_scores[top_type] / sum(type_scores.values())
    else:
        top_type = df_subset['recclass_clean'].value_counts().idxmax()
        prob = df_subset['recclass_clean'].value_counts(normalize=True).max()

    return top_type, prob, type_scores

# -----------------------------
# Prédire valeurs manquantes selon type
# -----------------------------
def predict_missing_criteria(df, top_type, user_years=None, user_mass=None, user_continents=None):
    df_type = df[df['recclass_clean'] == top_type]

    year_pred = df_type['year_period'].mode()[0] if not user_years else user_years
    mass_pred = df_type['mass_bin'].mode()[0] if not user_mass else user_mass
    continent_pred = df_type['continent'].mode()[0] if not user_continents else user_continents

    return year_pred, mass_pred, continent_pred

# -----------------------------
# Infos et pays selon intersection des critères
# -----------------------------
def get_type_info(df, top_type, user_years=None, user_mass=None, user_continents=None):
    df_type = df[df['recclass_clean'] == top_type]

    # Intersection des critères choisis
    df_filtered = df_type.copy()
    if user_years:
        years_flat = []
        for y in user_years:
            if isinstance(y, (list, tuple)):
                years_flat += list(range(y[0], y[1]+1))
            else:
                years_flat.append(y)
        df_filtered = df_filtered[df_filtered['year'].isin(years_flat)]

    if user_mass:
        valid_idx = []
        for m in user_mass:
            if isinstance(m, (list, tuple)):
                valid_idx += df_filtered[df_filtered['mass_cleaned'].between(m[0], m[1])].index.tolist()
            else:
                valid_idx += df_filtered[df_filtered['mass_bin'] == m].index.tolist()
        df_filtered = df_filtered.loc[valid_idx]

    if user_continents:
        df_filtered = df_filtered[df_filtered['continent'].isin(user_continents)]

    names = df_filtered['name'].tolist()
    countries = df_filtered['country'].unique().tolist()
    sample_years = df_filtered['year'].tolist()
    mass_bin = df_filtered['mass_bin'].mode()[0] if not df_filtered.empty else None
    return names, countries, sample_years, mass_bin

# -----------------------------
# Carte interactive
# -----------------------------
def plot_meteorites(df_top, map_file='map.html'):
    m = folium.Map(location=[0,0], zoom_start=2)
    for _, row in df_top.iterrows():
        folium.CircleMarker(
            location=[row['reclat'], row['reclong']],
            radius=3,
            popup=f"{row['name']} ({row['mass_cleaned']} g, {row['country']})",
            color='blue',
            fill=True
        ).add_to(m)
    m.save(map_file)
    return m

# -----------------------------
# Exemples utilisateur
# -----------------------------
user_selections = [
    {"years": [1950], "mass": ['10-100g'], "continents": ['Europe']},  # exact
    {"years": [2000], "mass": None, "continents": ['Asia']},           # masse à prédire
    {"years": [(1900,1950)], "mass": None, "continents": None},        # intervalle année
]

for i, sel in enumerate(user_selections, 1):
    filtered_rules = filter_rules(rules, sel['years'], sel['mass'], sel['continents'])
    top_type, prob, scores = get_most_probable_type(filtered_rules, df)

    year_pred, mass_pred, cont_pred = predict_missing_criteria(df, top_type, sel['years'], sel['mass'], sel['continents'])
    names, countries, sample_years, mass_bin = get_type_info(df, top_type, sel['years'], sel['mass'], sel['continents'])

    print(f"\n===== Exemple {i} =====")
    print(f"Type probable : {top_type} (≈{prob:.2f})")
    if sel['years']:
        print(f"Année choisie : {sel['years']}")
    else:
        print(f"Période prédite : {year_pred}")
        print(f"Exemples d'années : {sample_years[:10]}")
    if sel['mass']:
        print(f"Masse choisie : {sel['mass']}")
    else:
        print(f"Masse prédite : {mass_pred}")
    if sel['continents']:
        print(f"Continent choisi : {sel['continents']}")
    print(f"Pays : {countries[:10]}")  # limiter l'affichage pour lisibilité
    print(f"Exemples de météorites : {names[:10]}")





===== Exemple 1 =====
Type probable : L6 (≈0.24)
Année choisie : [1950]
Masse choisie : ['10-100g']
Continent choisi : ['Europe']
Pays : []
Exemples de météorites : []

===== Exemple 2 =====
Type probable : L6 (≈0.24)
Année choisie : [2000]
Masse prédite : 10-100g
Continent choisi : ['Asia']
Pays : ['Oman']
Exemples de météorites : ['Dhofar 005', 'Dhofar 1052', 'Dhofar 1056', 'Dhofar 1057', 'Dhofar 1061', 'Dhofar 1062', 'Dhofar 1064', 'Dhofar 107', 'Dhofar 1071', 'Dhofar 1072']

===== Exemple 3 =====
Type probable : L6 (≈0.24)
Année choisie : [(1900, 1950)]
Masse prédite : 10-100g
Pays : ['Argentina', 'Niger', 'Jordan', 'United Kingdom', 'United States of America', 'India', 'Canada', 'Turkey', 'Russia', 'South Africa']
Exemples de météorites : ['Achiras', 'Aguada', 'Aïr', 'Akaba', 'Appley Bridge', 'Ashdon', 'Athens', 'Atoka', 'Aztec', 'Baldwyn']
