### Imports et chargement des données

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the three CSV files in one pass; the order of the paths below matches
# the order of the names being unpacked on the left-hand side.
stars_df, repos_df, users_df = map(
    pd.read_csv,
    (
        "Dataset/user_repo_stars_filtered_min_10.csv",  # (user, repository) star interactions
        "Dataset/github-dataset.csv",                   # repository metadata
        "Dataset/users_stars.csv",                      # user metadata
    ),
)

# Report the raw sizes before any cleaning is applied.
print(f"Taille initiale : {len(stars_df)} interactions, {len(repos_df)} repos, {len(users_df)} users.")

Taille initiale : 13407 interactions, 1009 repos, 22240 users.


### Affichage des données

In [2]:
# Preview the repository metadata: the bare expression on the last line makes
# the notebook render repos_df as the cell output.
print("Repositories : ")
repos_df

Repositories : 


Unnamed: 0,repositories,stars_count,forks_count,issues_count,pull_requests,contributors,language
0,octocat/Hello-World,0,0,612,316,2,
1,EddieHubCommunity/support,271,150,536,6,71,
2,ethereum/aleth,0,0,313,27,154,C++
3,localstack/localstack,0,0,290,30,434,Python
4,shobhit97/open-gpstracker,0,0,172,0,3,Java
...,...,...,...,...,...,...,...
1004,Tyriar/canvas-astar.dart,2,1,1,0,0,Dart
1005,ankitkumar70777/github-slideshow,0,0,1,0,8,HTML
1006,aitikgupta/interactive_cpu_scheduler,0,5,1,1,7,Python
1007,gwmccubbin/voting_dapp,11,5,1,0,0,JavaScript


In [3]:
# Preview the user metadata: the bare expression on the last line makes the
# notebook render users_df as the cell output.
print("Utilisateurs : ")
users_df

Utilisateurs : 


Unnamed: 0,login,name,company,location,public_repos,followers,following,created_at,bio,twitter_username,blog,languages_used,total_stars,total_forks
0,schacon,Scott Chacon,@gitbutlerapp,"Berlin, Germany",223,14024,27,2008-01-27T17:19:28Z,,chacon,http://scottchacon.com,"['AGS Script', 'Arduino', 'JavaScript', 'Pytho...",9522,4363
1,adelcambre,Andy Delcambre,@stripe,"Boulder, CO",100,185,5,2008-02-13T18:43:41Z,,,,"['Starlark', 'Shell', 'C++', 'CoffeeScript', '...",99,40
2,usergenic,Brendan Baldwin,,"Eagle, ID",48,82,27,2008-02-21T17:47:48Z,I make software for people.,,http://usergenic.com,"['VimL', 'TypeScript', 'Shell', 'JavaScript', ...",49,10
3,fdb,Frederik De Bleser,@nodebox,Belgium,211,234,135,2008-04-25T13:02:12Z,"Teacher / researcher at Sint Lucas Antwerpen, ...",enigmeta,https://www.enigmeta.com,"['JavaScript', 'Python', 'Max', 'Jupyter Noteb...",518,96
4,darinel,,,,82,12,18,2008-10-17T12:57:47Z,Web developer,,,"['C++', 'CoffeeScript', 'CSS', 'JavaScript', '...",24,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22235,cxml2,CXML2,,,101,222,794,2021-08-23T10:06:10Z,,,,[],71,1
22236,Cher172,„Åä„Åò„Åé,,„Ç≥„Éõ„É™„É≥„ÉàÂ≥∂,10,110,193,2022-02-14T03:27:55Z,„Åä„Åò„Åé/Ojigi/üáØüáµ/Âä†Â∑•Âº∑ü•ïü•ïü•ï TLË¶ã„Çå„Å¶„Åæ„Åõ„Çìüôè Ââµ‰Ωú #ojigikun Tik...,ojigi_chandayo,https://www.linkedin.com/in/%E3%81%8A%E3%81%98...,[],24,0
22237,KrauseFx,Felix Krause,,"Vienna, Austria",143,5538,16,2011-06-23T08:24:50Z,Founder of ContextSDK\r\n\r\nProfessional iOS ...,KrauseFx,https://krausefx.com,"['TypeScript', 'Objective-C', 'Java', 'Shell',...",12635,1505
22238,AnkitChouhanDws,,,,7,11,2,2018-04-13T07:59:44Z,,,,"['JavaScript', 'Python']",2,1


In [4]:
# Preview the (user, repository) star interactions: the bare expression on the
# last line makes the notebook render stars_df as the cell output.
print("Interactions (Repositories/Utilisateurs) : ")
stars_df

Interactions (Repositories/Utilisateurs) : 


Unnamed: 0,user,repository
0,ff6347,octocat/Hello-World
1,gmittal,octocat/Hello-World
2,angusshire,octocat/Hello-World
3,Jinxiansen,octocat/Hello-World
4,canering,octocat/Hello-World
...,...,...
13402,Chan9390,defuse/email-spoofing
13403,picatz,defuse/email-spoofing
13404,ur5us,defuse/email-spoofing
13405,MilleniumSpark,marcel-dempers/k8s-build-your-own


### Préparation des données

In [5]:
# Normalise the interaction keys and remove duplicate (user, repository) pairs.
# Rows with a missing user or repository are dropped *before* the string cast:
# `astype(str)` would otherwise turn a missing value into the literal text
# "nan" and create a phantom user / repository.
# The chain builds a new frame instead of mutating the loaded one in place, so
# re-running this cell always starts from whatever `stars_df` currently holds
# without touching objects displayed by earlier cells.
stars_df = (
    stars_df
    .dropna(subset=["user", "repository"])
    .assign(
        user=lambda df: df["user"].astype(str).str.strip(),
        repository=lambda df: df["repository"].astype(str).str.strip(),
    )
    .drop_duplicates(subset=["user", "repository"])
)

# Repository metadata restricted to the columns used later, with the key column
# renamed to "repository" so it matches the interactions table. When a
# repository is listed more than once, only its first row is kept.
# NOTE(review): repository names coming from repos_df are not stripped, unlike
# the ones in stars_df -- confirm the metadata file has no padded names.
merged_repos = (
    repos_df[["repositories", "stars_count", "forks_count", "language"]]
    .rename(columns={"repositories": "repository"})
    .drop_duplicates(subset=["repository"], keep="first")
)

# Keep only interactions whose repository also appears in the metadata.
stars_df = stars_df[stars_df["repository"].isin(merged_repos["repository"])]

print(f"Taille finale : {len(stars_df)} interactions, {len(merged_repos)} repos, {len(users_df)} users.")

Taille finale : 13407 interactions, 932 repos, 22240 users.


### Division TRAIN/TEST (70 / 30)

In [6]:
# Per-user hold-out split: each user's interactions are divided separately,
# with test_size=0.3 (roughly 70 % train / 30 % test per user).

train_list, test_list = [], []

for user, user_df in stars_df.groupby("user"):
    if len(user_df) >= 2:
        # Enough interactions to hold some out; the fixed seed keeps the
        # split identical from one run to the next.
        train, test = train_test_split(user_df, test_size=0.3, random_state=42)
        train_list.append(train)
        test_list.append(test)
    else:
        # A single interaction cannot be split, so it stays in train.
        train_list.append(user_df)

train_df = pd.concat(train_list)
test_df = pd.concat(test_list)

print(f"Train: {len(train_df)} interactions, Test: {len(test_df)} interactions")

Train: 9040 interactions, Test: 4367 interactions


### Modèle basé sur la popularité

In [7]:
# Rank repositories from most to least starred using the metadata column
# "stars_count"; ignore_index renumbers the rows 0..n-1 so that a row's
# position is its popularity rank.
popular_repos = merged_repos.sort_values("stars_count", ascending=False, ignore_index=True)
print(popular_repos)

                                  repository  stars_count  forks_count  \
0         iamshaunjp/Complete-React-Tutorial          995            0   
1          adrianhajdin/project_graphql_blog          977          233   
2    adrianhajdin/project_medical_pager_chat          968          336   
3                brettkromkamp/contextualise          960           43   
4                sukritishah15/DS-Algo-Point          956          289   
..                                       ...          ...          ...   
927                    localstack/localstack            0            0   
928                      rrousselGit/freezed            0          169   
929                       google/accompanist            0          389   
930                     WhiteHouse/petitions            0          353   
931              mattgallagher/AudioStreamer            0          566   

        language  
0            NaN  
1     JavaScript  
2     JavaScript  
3         Python  
4           Java

### Fonction de recommandation 

In [8]:
def recommend_popular(user_id, top_k=10, *, interactions=None, ranking=None):
    """Return the top-k most popular repositories the user has not starred yet.

    Args:
        user_id: Value matched against the "user" column of `interactions`.
        top_k: Maximum number of repositories to return.
        interactions: Optional DataFrame with "user" and "repository" columns
            listing what each user already starred. Defaults to the notebook's
            global `train_df`.
        ranking: Optional DataFrame with a "repository" column, already sorted
            from most to least popular. Defaults to the notebook's global
            `popular_repos`.

    Returns:
        list: Repository names in ranking order, at most `top_k` long. A user
        absent from `interactions` simply gets the head of the ranking.
    """
    # Fall back to the notebook-level frames only when the caller did not pass
    # explicit ones; this keeps existing `recommend_popular(user, top_k=k)`
    # calls working while letting the function be exercised on any data.
    if interactions is None:
        interactions = train_df
    if ranking is None:
        ranking = popular_repos

    # Repositories the user has already starred
    already_starred = set(interactions.loc[interactions["user"] == user_id, "repository"])

    # Ranked repositories minus the ones already starred, truncated to top_k
    candidates = ranking.loc[~ranking["repository"].isin(already_starred), "repository"]
    return candidates.head(top_k).tolist()

### Evaluation

In [9]:
# Compute precision, recall and F-measure over the test set
def precision_recall_f1_at_k(train_df, test_df, top_k=10):
    """Average Precision@K, Recall@K and F1@K of `recommend_popular` over the test users.

    Args:
        train_df: Training interactions. Not read by this function body;
            `recommend_popular` is called with only the user and `top_k`.
        test_df: Held-out interactions with "user" and "repository" columns.
        top_k: Number of recommendations requested for each user.

    Returns:
        dict with the keys "Precision@K", "Recall@K" and "F1@K", each holding
        the mean of the per-user scores. Users for whom nothing is recommended
        are left out of the averages.
    """
    precision_scores, recall_scores, f1_scores = [], [], []

    # One group of held-out rows per user, visited in order of first appearance.
    for user, held_out in test_df.groupby("user", sort=False)["repository"]:
        true_items = set(held_out)  # ground truth
        recommended = set(recommend_popular(user, top_k=top_k))

        # Score the user only when both sides are non-empty.
        if true_items and recommended:
            tp = len(true_items & recommended)  # true positives
            precision = tp / len(recommended)
            recall = tp / len(true_items)
            total = precision + recall
            f1 = 2 * precision * recall / total if total > 0 else 0

            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

    return {
        "Precision@K": np.mean(precision_scores),
        "Recall@K": np.mean(recall_scores),
        "F1@K": np.mean(f1_scores),
    }


# Score the popularity model at K=10 and print the three averages, one per line.
metrics = precision_recall_f1_at_k(train_df, test_df, top_k=10)

print(
    "\nEvaluation Results (Popularity Model)",
    f"Precision@10: {metrics['Precision@K']:.4f}",
    f"Recall@10:    {metrics['Recall@K']:.4f}",
    f"F1@10:        {metrics['F1@K']:.4f}",
    sep="\n",
)


Evaluation Results (Popularity Model)
Precision@10: 0.0181
Recall@10:    0.0352
F1@10:        0.0229


### Evaluation pour un user particulier

In [10]:
# Pick one user to inspect (by position, or set the login manually)
some_user = stars_df["user"].iat[40]  # or "Adlinke"
print(f"Evaluating user: {some_user}")

# User profile: metadata row(s) whose login matches the chosen user
user_profile = users_df[users_df["login"] == some_user]

if user_profile.empty:
    print("No user metadata found")
else:
    print("\nUser profile:")
    print(user_profile.to_string(index=False))

# The user's interactions on each side of the train/test split
user_train = train_df[train_df["user"] == some_user]
user_test = test_df[test_df["user"] == some_user]

print(f"\nTrain interactions: {len(user_train)} repositories")
print(f"\nTest interactions:  {len(user_test)} repositories")

if len(user_train):
    print("\nTrain repositories:")
    print(user_train["repository"].to_list())

if len(user_test):
    print("\nTest repositories:")
    print(user_test["repository"].to_list())

# Recommendation list for this user
recommendations = recommend_popular(some_user, top_k=10)
print("\nTop-10 Recommended repositories:")
print(recommendations)

# Evaluation: compare the recommendations with the user's held-out repositories
true_items = set(user_test["repository"])

if true_items:
    recommended = set(recommendations)
    tp = len(true_items & recommended)

    if tp > 0:
        print(f"\nCorrectly recommended repositories: {list(true_items & recommended)}")

    precision = tp / len(recommended) if recommended else 0
    recall = tp / len(true_items)  # true_items is non-empty in this branch
    f1 = 0 if (precision + recall) == 0 else 2 * precision * recall / (precision + recall)

    print("\nUser-specific evaluation results:")
    print(f"Precision@10: {precision:.4f}")
    print(f"Recall@10:    {recall:.4f}")
    print(f"F1@10:        {f1:.4f}")
else:
    print("No test items found")

Evaluating user: Adlinke

User profile:
  login                         name                             company location  public_repos  followers  following           created_at                                                                                                                                                                        bio twitter_username blog                                                                          languages_used  total_stars  total_forks
Adlinke Cap‚Äôn Donald Trump & AdLinke AdLinke Media & Trump Organisation       USA           867         18        467 2022-02-05T17:19:07Z Software Developer. Business Manager! \r\n‚Ä¢ `#SovereignPurge`\r\n‚Ä¢ `#NoDomesticSetup`\r\n‚Ä¢ `#AdLinkeProtocol`\r\n‚Ä¢ `#InfrastructureCorrection`\r\n‚Ä¢ `#JurisdictionDeclared              NaN  NaN ['Forth', 'TypeScript', 'Java', 'Swift', 'Astro', 'JavaScript', 'Python', 'C#', 'HTML']           72           18

Train interactions: 16 repositories

Test interactions:

In [11]:
# Bloc d'√©valuation compl√®te du mod√®le de popularit√©

import numpy as np
import pandas as pd
from collections import defaultdict

def evaluate_model_comprehensive(train_df, test_df, recommend_function, k_values=(5, 10, 20)):
    """
    Evaluate a recommender with several metrics at several cut-offs K.

    Every user with at least one test interaction counts toward every metric.
    A user for whom `recommend_function` raises, or returns nothing, scores 0
    for precision, recall, F1 and hit rate, so all four averages share the
    denominator reported in the users-evaluated column.

    Args:
        train_df: Training interactions. Not read by this function; the
            recommender is called with only the user and `top_k`.
        test_df: Held-out interactions with "user" and "repository" columns.
        recommend_function: Callable `(user_id, top_k=...)` returning an
            iterable of recommended repositories.
        k_values: Iterable of cut-offs K to evaluate.

    Returns:
        DataFrame with one row per K holding K, Precision@K, Recall@K, F1@K,
        Hit_Rate@K and the number of users evaluated. The three means are NaN
        when there is no user to evaluate.
    """
    import warnings

    users = test_df["user"].unique()
    print(f"√âvaluation sur {len(users)} utilisateurs...\n")

    # Build each user's ground-truth set once (order of first appearance)
    # instead of re-filtering the whole test frame for every user and every K.
    truth_by_user = {
        user: set(repos)
        for user, repos in test_df.groupby("user", sort=False)["repository"]
    }

    def _mean(values):
        # np.mean([]) emits a RuntimeWarning; return NaN quietly instead.
        return float(np.mean(values)) if values else float("nan")

    results = []

    for k in k_values:
        precisions, recalls, f1s = [], [], []
        hits = 0  # users with at least one correct recommendation
        failures = 0  # users for whom the recommender raised
        first_error = None

        for user, true_items in truth_by_user.items():
            try:
                recommended = set(recommend_function(user, top_k=k))
            except Exception as exc:
                # Best effort: keep evaluating, but record the failure and
                # score the user as a miss. Only `Exception` is caught so that
                # KeyboardInterrupt / SystemExit still stop the run.
                failures += 1
                if first_error is None:
                    first_error = exc
                recommended = set()

            tp = len(true_items & recommended)
            if tp > 0:
                hits += 1

            precision = tp / len(recommended) if recommended else 0.0
            recall = tp / len(true_items)  # groups are never empty
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

            precisions.append(precision)
            recalls.append(recall)
            f1s.append(f1)

        total_users = len(precisions)

        if failures:
            warnings.warn(
                f"recommend_function failed for {failures} of {total_users} users "
                f"at K={k}; they were scored as misses (first error: {first_error!r})",
                RuntimeWarning,
                stacklevel=2,
            )

        hit_rate = hits / total_users if total_users > 0 else 0

        results.append({
            'K': k,
            'Precision@K': _mean(precisions),
            'Recall@K': _mean(recalls),
            'F1@K': _mean(f1s),
            'Hit_Rate@K': hit_rate,
            'Utilisateurs √©valu√©s': total_users
        })

    return pd.DataFrame(results)


# Run the evaluation of the popularity recommender at K = 5, 10, 20 and 50
print("="*70)
print("√âVALUATION DU MOD√àLE DE POPULARIT√â")
print("="*70)

evaluation_results = evaluate_model_comprehensive(
    train_df, 
    test_df, 
    recommend_popular,
    k_values=[5, 10, 20, 50]
)

# Display the results: one paragraph per K, each metric shown both as a
# fraction and as a percentage
print("\n" + "="*70)
print("R√âSULTATS D'√âVALUATION")
print("="*70 + "\n")

for _, row in evaluation_results.iterrows():
    # int(...) renders K and the user count without a decimal part
    print(f"K = {int(row['K'])}")
    print(f"  Precision@{int(row['K'])}: {row['Precision@K']:.4f} ({row['Precision@K']*100:.2f}%)")
    print(f"  Recall@{int(row['K'])}: {row['Recall@K']:.4f} ({row['Recall@K']*100:.2f}%)")
    print(f"  F1@{int(row['K'])}: {row['F1@K']:.4f} ({row['F1@K']*100:.2f}%)")
    print(f"  Hit Rate@{int(row['K'])}: {row['Hit_Rate@K']:.4f} ({row['Hit_Rate@K']*100:.2f}%)")
    print(f"  Utilisateurs √©valu√©s: {int(row['Utilisateurs √©valu√©s'])}")
    print()

# Complementary analysis
print("="*70)
print("ANALYSE COMPL√âMENTAIRE")
print("="*70 + "\n")

# Distribution of the number of held-out interactions per user
print("Distribution des interactions dans le test set:")
test_interactions_per_user = test_df.groupby("user").size()
print(f"  Moyenne: {test_interactions_per_user.mean():.2f} repos/user")
print(f"  M√©diane: {test_interactions_per_user.median():.0f} repos/user")
print(f"  Min: {test_interactions_per_user.min()}")
print(f"  Max: {test_interactions_per_user.max()}")
print(f"  √âcart-type: {test_interactions_per_user.std():.2f}\n")

# Coverage of the recommendations: how many distinct repositories appear in
# the top-10 lists of a sample of users
unique_recommended = set()
failed_users = []  # (user, exception) pairs for recommendations that failed
for user in test_df["user"].unique()[:100]:  # sample: first 100 test users only
    try:
        recs = recommend_popular(user, top_k=10)
        unique_recommended.update(recs)
    except Exception as exc:
        # Best effort: keep sampling, but remember the failure instead of
        # hiding it. Only `Exception` is caught so that interrupting the
        # kernel still stops the loop.
        failed_users.append((user, exc))

if failed_users:
    print(
        f"  [warning] recommend_popular failed for {len(failed_users)} sampled user(s); "
        f"first error: {failed_users[0][1]!r}"
    )

print(f"Diversit√© des recommandations:")
print(f"  Nombre de d√©p√¥ts uniques recommand√©s (√©chantillon): {len(unique_recommended)}")
print(f"  Nombre total de d√©p√¥ts disponibles: {len(merged_repos)}")
print(f"  Taux de couverture: {len(unique_recommended)/len(merged_repos)*100:.2f}%\n")

# Simple text-based visualisation: one horizontal bar per (metric, K) pair
print("="*70)
print("COMPARAISON VISUELLE DES M√âTRIQUES")
print("="*70 + "\n")

for metric in ['Precision@K', 'Recall@K', 'F1@K', 'Hit_Rate@K']:
    print(f"\n{metric}:")
    for _, row in evaluation_results.iterrows():
        k = int(row['K'])
        value = row[metric]
        bar_length = int(value * 100)  # one filled cell per percentage point (truncated, not rounded)
        bar = "‚ñà" * bar_length + "‚ñë" * (100 - bar_length)
        # Only the first 50 of the 100 cells are printed, so any value >= 0.5
        # renders as a completely filled bar.
        print(f"  K={k:2d}: {bar[:50]} {value:.4f}")

# Static interpretation notes: the text below is fixed and is printed
# regardless of the values computed above.
print("\n" + "="*70)
print("INTERPR√âTATION")
print("="*70)
print("""
Interpr√©tation des r√©sultats:
- Precision faible (<5%) : Le mod√®le recommande beaucoup de d√©p√¥ts non pertinents
- Recall faible (<10%) : Le mod√®le manque la plupart des d√©p√¥ts r√©ellement pertinents
- F1 faible : Performance globale m√©diocre
- Hit Rate : % d'utilisateurs ayant au moins 1 bonne recommandation

Un mod√®le de popularit√© pure a g√©n√©ralement des performances limit√©es car il ignore
les pr√©f√©rences individuelles. Des am√©liorations possibles:
1. Filtrage par langage de programmation
2. Mod√®le collaboratif (filtrage collaboratif)
3. Mod√®le hybride (popularit√© + contenu + collaboratif)
""")

√âVALUATION DU MOD√àLE DE POPULARIT√â
√âvaluation sur 901 utilisateurs...


R√âSULTATS D'√âVALUATION

K = 5
  Precision@5: 0.0155 (1.55%)
  Recall@5: 0.0154 (1.54%)
  F1@5: 0.0148 (1.48%)
  Hit Rate@5: 0.0766 (7.66%)
  Utilisateurs √©valu√©s: 901

K = 10
  Precision@10: 0.0181 (1.81%)
  Recall@10: 0.0352 (3.52%)
  F1@10: 0.0229 (2.29%)
  Hit Rate@10: 0.1654 (16.54%)
  Utilisateurs √©valu√©s: 901

K = 20
  Precision@20: 0.0173 (1.73%)
  Recall@20: 0.0682 (6.82%)
  F1@20: 0.0267 (2.67%)
  Hit Rate@20: 0.2997 (29.97%)
  Utilisateurs √©valu√©s: 901

K = 50
  Precision@50: 0.0133 (1.33%)
  Recall@50: 0.1305 (13.05%)
  F1@50: 0.0236 (2.36%)
  Hit Rate@50: 0.4562 (45.62%)
  Utilisateurs √©valu√©s: 901

ANALYSE COMPL√âMENTAIRE

Distribution des interactions dans le test set:
  Moyenne: 4.85 repos/user
  M√©diane: 4 repos/user
  Min: 3
  Max: 27
  √âcart-type: 2.28

Diversit√© des recommandations:
  Nombre de d√©p√¥ts uniques recommand√©s (√©chantillon): 14
  Nombre total de d√©p√¥ts disponible