# Modle Building 
## Predictive modleing of Git Hub Pulls and Issues

### Packages

In [27]:

import pandas as pd
import numpy as np
import re
import os
from dotenv import load_dotenv
from github import Github
from tqdm import tqdm
from datetime import datetime, timedelta
import pytz
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import scipy.stats as stats
import xgboost as xgb
import tensorflow as tf

#model building
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor, XGBClassifier, plot_importance
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_curve, mean_absolute_error, accuracy_score
import shap


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from scipy.stats import uniform, randint




In [28]:
org_name = "Rdatatable"
repo_name = "data.table"

reponame_noperiod = repo_name.replace(".", "")
reponame_noperiod = reponame_noperiod.replace("_", "")
reponame_noperiod = reponame_noperiod.lower()

In [29]:
secret = "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"
g = Github(secret)
org = g.get_organization(org_name)
repo = org.get_repo(repo_name)
g.rate_limiting

(2614, 5000)

### Load Files from Data Extraction

In [30]:
#load cli_interactions_pulls
#load cli_interactions 
file_path = f'cli_interactions_pulls.xlsx'
if os.path.exists(file_path):
    cli_interactions_pulls = pd.read_excel(file_path, engine='openpyxl')
    print("CLI Interactions Pulls sucsefully loaded")
else:
    cli_interactions_pulls = pd.DataFrame()
    print("Error: CLI Interactions Pulls file does not exist ", file_path)

file_path = f'Files/cli_issue_interactions_Small.xlsx'

if os.path.exists(file_path):
    cli_interactions = pd.read_excel(file_path, engine='openpyxl')
    print("CLI Interactions file sucsefully loaded")
else:   
    cli_interactions = pd.DataFrame()
    print("Error: CLI Interactions file does not exist ", file_path)    

CLI Interactions Pulls sucsefully loaded
CLI Interactions file sucsefully loaded


### Bot Filter

In [31]:
def filter_bots(df, column='created_by'):
    bot_patterns = r'(?:\\[bot\\]| bot|dependabot|github-actions|renovate|github|circleci|travis|appveyor|azure|jenkins|ci/cd|snyk|codecov|hdepend|vercel|netlify|heroku|greenkeeper|semantic-release|sonarcloud|probot|lgtm|coveralls)'
    return df[~df[column].str.contains(bot_patterns, case=False, na=False)]

cli_interactions=filter_bots(cli_interactions, column='created_by')
cli_interactions_pulls=filter_bots(cli_interactions_pulls, column='created_by')


In [None]:

# Prepare a list to store results
results = []

# Iterate over each row in the CSV file
for index, row in df.iterrows():
    org_name = row['org']
    repo_name = row['repo']
    try:
        print(f"Processing {org_name}/{repo_name}...")
        repo = g.get_repo(f"{org_name}/{repo_name}")
        
        # Get all contributors, sorted by number of contributions
        contributors = list(repo.get_contributors())
        contributors.sort(key=lambda x: x.contributions, reverse=True)
        
        # Take top contributors (let's say top 10)
        top_contributors = contributors[:10]
        
        # Store their information
        for contributor in top_contributors:
            results.append({
                "org": org_name,
                "repo": repo_name,
                "username": contributor.login,
                "commits": contributor.contributions,
                "profile_url": contributor.html_url
            })
            
        # Avoid rate limiting
        time.sleep(2)
        
    except Exception as e:
        print(f"Error processing {org_name}/{repo_name}: {e}")

# Convert results to DataFrame and save
result_df = pd.DataFrame(results)
result_df.to_excel("top_contributors.xlsx", index=False)
print("Top contributors extraction complete! Saved to top_contributors.xlsx.")

### Top 30% of Users

In [32]:
# Count the number of rows for each user in the "created_by" column for the "cli_interactions" dataset and cli_interactions_pulls
cli_interactions_user_counts = cli_interactions['created_by'].value_counts()
cli_interactions_pulls_user_counts = cli_interactions_pulls['created_by'].value_counts()

# Calculate the threshold for the top 20% of users
cli_interactions_threshold = int(0.30 * len(cli_interactions_user_counts))
cli_interactions_pulls_threshold = int(0.30 * len(cli_interactions_pulls_user_counts))

#remove the users who have less than the threshold number of interactions from cli_interactions and cli_interactions_pulls
cli_interactions_pulls = cli_interactions_pulls[cli_interactions_pulls['created_by'].isin(cli_interactions_pulls_user_counts[cli_interactions_pulls_user_counts > cli_interactions_pulls_threshold].index)]

#list all users that are in the top 20% of users in both datasets
cli_interactions_users = cli_interactions['created_by'].unique()
cli_interactions_pulls_users = cli_interactions_pulls['created_by'].unique()

# print both lists
print("Users in CLI Interactions:")
print(cli_interactions_users)
print("Users in CLI Interactions Pulls:")
print(cli_interactions_pulls_users)


Users in CLI Interactions:
['hadley' 'gaborcsardi' 'jennybc' 'salim-b' 'krlmlr']
Users in CLI Interactions Pulls:
['jennybc' 'gaborcsardi' 'MichaelChirico' 'rundel' 'krlmlr' 'hadley'
 'olivroy' 'salim-b' 'lionel-' 'romainfrancois']


### Time Series Data Convertion

In [288]:
def create_comprehensive_daily_time_series(issues_activity, pulls_activity):
    
    # Combine issues and pulls data
    issues_activity['data_source'] = 'issues'
    pulls_activity['data_source'] = 'pulls'
    
    # Combine datasets
    combined_activity = pd.concat([issues_activity, pulls_activity], ignore_index=True)
    
    # Convert created_at to datetime if not already
    combined_activity['created_at'] = pd.to_datetime(combined_activity['created_at'])
    
    # Extract date from created_at
    combined_activity['year_month_day'] = combined_activity['created_at'].dt.to_period('W').astype(str)
    
    # Get unique interaction types
    interaction_types = combined_activity['interaction_type'].unique()
    
    # Find the overall date range
    min_date = combined_activity['year_month_day'].min()
    max_date = combined_activity['year_month_day'].max()
    
    # Get unique users
    unique_users = combined_activity['created_by'].unique()
    
    # Create a complete date range
    complete_dates = pd.period_range(start=min_date, end=max_date, freq='W').astype(str)
    
    # Create a complete user-date grid
    user_date_grid = []
    for user in unique_users:
        for date in complete_dates:
            user_date_grid.append({
                "created_by": user,
                "year_month_day": date
            })
    
    # Convert to DataFrame
    complete_grid = pd.DataFrame(user_date_grid)
    
    # Compute daily interaction counts
    daily_interaction_counts = combined_activity.groupby([
        'created_by', 'year_month_day', 'interaction_type'
    ]).size().reset_index(name='count')
    
    # Pivot to create columns for each interaction type
    interaction_pivot = daily_interaction_counts.pivot_table(
        index=['created_by', 'year_month_day'], 
        columns='interaction_type', 
        values='count', 
        fill_value=0
    ).reset_index()
    
    # Merge complete grid with interaction counts
    df = pd.merge(
        complete_grid, 
        interaction_pivot, 
        on=['created_by', 'year_month_day'], 
        how='left'
    ).fillna(0)
    
    # Compute source-specific activity
    source_activity = combined_activity.groupby([
        'created_by', 'year_month_day', 'data_source'
    ]).size().reset_index(name='activity_count')
    
    source_pivot = source_activity.pivot_table(
        index=['created_by', 'year_month_day'], 
        columns='data_source', 
        values='activity_count', 
        fill_value=0
    ).reset_index()
    source_pivot.columns.name = None
    source_pivot = source_pivot.rename(columns={
        'issues': 'total_activity_issues', 
        'pulls': 'total_activity_pulls'
    })
    
    # Merge source activity
    df = pd.merge(
        df, 
        source_pivot, 
        on=['created_by', 'year_month_day'], 
        how='left'
    ).fillna(0)
    
    # Total activity
    df['total_activity'] = df['total_activity_issues'] + df['total_activity_pulls']
    
    # Is active flag
    df['is_active'] = (df['total_activity'] > 0).astype(int)
    
    # Create target variable (predicting inactivity in the next day)
    df['target'] = (
        df.groupby('created_by')['is_active'].shift(-1) == 0
    ).fillna(0).astype(int)
    
    # Ensure all interaction type columns exist
    for interaction in interaction_types:
        if interaction not in df.columns:
            df[interaction] = 0
    
    return df





def create_user_specific_time_series(df):
    # Find each user's first active day
    user_first_active_day = df[df['total_activity'] > 0].groupby('created_by')['year_month_day'].min()
    
    # Filter DataFrame to start from user's first active day
    filtered_df = df.merge(user_first_active_day, on='created_by', suffixes=('', '_first'))
    filtered_df = filtered_df[filtered_df['year_month_day'] >= filtered_df['year_month_day_first']]

    filtered_df= filtered_df.drop(columns=['year_month_day_first'])

    
    return filtered_df



In [289]:
# Example usage:
daily_time_series = create_comprehensive_daily_time_series(cli_interactions, cli_interactions_pulls)
daily_time_series = create_user_specific_time_series(daily_time_series)
#make this a xlsx file

# ---- Feature Engineering ----

In [290]:
def calculate_gini(series):
    """Calculate Gini coefficient for a pandas Series or numpy array."""
    sorted_series = np.sort(series)
    n = len(sorted_series)
    if n == 0 or sorted_series.sum() == 0:
        return 0.0
    index = np.arange(1, n + 1)
    return (np.sum((2 * index - n - 1) * sorted_series)) / (n * np.sum(sorted_series))

def calculate_interaction_entropy(row, interaction_columns):
    """Calculate entropy for interaction columns in a row."""
    interactions = row[interaction_columns].values.astype(float)
    total = interactions.sum()
    if total == 0:
        return 0.0
    probs = interactions / total
    non_zero_probs = probs[probs > 0]
    if len(non_zero_probs) == 0:
        return 0.0
    return -np.sum(non_zero_probs * np.log2(non_zero_probs))

def engineer_features(df):
    """Consolidated feature engineering function with all transformations."""
    # Identify interaction columns
    interaction_columns = [
    'assigned', 'closed', 'demilestoned', 'issue_comment', 
    'issue_creation', 'labeled', 'mentioned', 'milestoned', 
    'pr_comment', 'pr_commit', 'pr_creation', 
    'pr_review_approved', 'pr_review_commented', 
    'referenced', 'renamed', 'reopened', 
    'subscribed', 'transferred', 'unlabeled'
]
    
    # 1. Row-level metrics
    df['row_gini'] = df.apply(lambda row: calculate_gini(row[interaction_columns]), axis=1)
    df['interaction_entropy'] = df.apply(calculate_interaction_entropy, axis=1, interaction_columns=interaction_columns)
    
    # 2. User-level statistics
    user_agg = df.groupby('created_by').agg({
        'total_activity': ['mean', 'median', 'max', 'std'],
        'is_active': 'mean'
    }).reset_index()
    user_agg.columns = ['created_by', 'activity_mean', 'activity_median', 
                        'activity_max', 'activity_std', 'active_probability']
    df = df.merge(user_agg, on='created_by', how='left')
    
    # 3. Activity-based features
    df['relative_activity'] = df['total_activity'] / (df['activity_mean'] + 1e-10)
    df['deviation_from_mean'] = df['total_activity'] - df['activity_mean']
    df['activity_cv'] = df['activity_std'] / (df['activity_mean'] + 1e-10)
    df['activity_zscore'] = (df['total_activity'] - df['activity_mean']) / (df['activity_std'] + 1e-10)
    df['activity_gini'] = df.groupby('created_by')['total_activity'].transform(calculate_gini)
    
    # 4. Temporal features
    for window in [3, 7, 14]:
        # Rolling features
        df[f'relative_rolling_{window}d'] = df.groupby('created_by')['relative_activity'].transform(
            lambda x: x.rolling(window, min_periods=1).mean())
        df[f'deviation_rolling_{window}d'] = df.groupby('created_by')['deviation_from_mean'].transform(
            lambda x: x.rolling(window, min_periods=1).mean())
        df[f'activity_rolling_{window}d_mean'] = df.groupby('created_by')['total_activity'].transform(
            lambda x: x.rolling(window, min_periods=1).mean())
        df[f'activity_rolling_{window}d_std'] = df.groupby('created_by')['total_activity'].transform(
            lambda x: x.rolling(window, min_periods=1).std())
        
        # Exponential smoothing
        df[f'exp_smooth_{window}d'] = df.groupby('created_by')['total_activity'].transform(
            lambda x: x.ewm(span=window, adjust=False).mean())
    
    # 5. Interaction features
    for col in interaction_columns:
        df[f'{col}_ratio'] = df[col] / (df['total_activity'] + 1e-10)
    
    # 6. Specialized metrics
    df['pulls_to_issues_ratio'] = np.where(
        df['total_activity_issues'] == 0,
        10.0,
        df['total_activity_pulls'] / df['total_activity_issues']
    )
    df['active_streak'] = df.groupby('created_by')['is_active'].transform(
        lambda x: x * (x.groupby((x != x.shift()).cumsum()).cumcount() + 1)
    )
    
    return df




In [291]:
df = engineer_features(daily_time_series)
df

Unnamed: 0,created_by,year_month_day,assigned,closed,demilestoned,issue_comment,issue_creation,labeled,mentioned,milestoned,...,pr_review_approved_ratio,pr_review_commented_ratio,referenced_ratio,renamed_ratio,reopened_ratio,subscribed_ratio,transferred_ratio,unlabeled_ratio,pulls_to_issues_ratio,active_streak
0,hadley,2017-04-24/2017-04-30,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,hadley,2017-05-01/2017-05-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0
2,hadley,2017-05-08/2017-05-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0
3,hadley,2017-05-15/2017-05-21,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,hadley,2017-05-22/2017-05-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764,romainfrancois,2025-01-27/2025-02-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0
2765,romainfrancois,2025-02-03/2025-02-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0
2766,romainfrancois,2025-02-10/2025-02-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0
2767,romainfrancois,2025-02-17/2025-02-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0


### Predictive modeling

In [300]:
def prepare_data(df):
    """
    Prepare data for machine learning models with user group handling
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input DataFrame
    
    Returns:
    --------
    tuple
        Prepared X, y, and user groups
    """
    # Identify the user ID column (handle different possible column names)
    user_id_col = 'user_id' if 'user_id' in df.columns else 'created_by'
    
    # Exclude columns
    exclude_columns = [
        'created_by', 'year_month_day', 'target', 
        'user_id', user_id_col
    ]
    
    # Select features
    feature_columns = [
        col for col in df.columns 
        if col not in exclude_columns
    ]
    
    X = df[feature_columns]
    y = df['target']
    user_groups = df[user_id_col]
    
    return X, y, user_groups

def split_data_by_user(df):
    """
    Split dataframe into individual user dataframes
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with all users' data
    
    Returns:
    --------
    dict
        Dictionary of dataframes, one per user
    """
    # Identify the user ID column
    user_id_col = 'user_id' if 'user_id' in df.columns else 'created_by'
    
    user_dataframes = {
        user: df[df[user_id_col] == user] 
        for user in df[user_id_col].unique()
    }
    
    return user_dataframes

def prepare_lstm_data(df):
    """
    Prepare data for LSTM model
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input DataFrame
    
    Returns:
    --------
    tuple
        Prepared X, y for LSTM
    """
    # Identify the user ID column
    user_id_col = 'user_id' if 'user_id' in df.columns else 'created_by'
    
    # Group by user and create sequences
    def create_sequences(group, look_back=7):
        X, y = [], []
        
        # Exclude non-feature columns
        feature_columns = [
            col for col in group.columns 
            if col not in ['created_by', 'year_month_day', 'target', 'user_id', user_id_col]
        ]
        
        # Create sequences
        for i in range(len(group) - look_back):
            X.append(group[feature_columns].iloc[i:i+look_back].values)
            y.append(group['target'].iloc[i+look_back])
        
        return X, y
    
    # Prepare sequences for each user
    sequences_data = df.groupby(user_id_col).apply(create_sequences)
    
    # Combine sequences
    X_sequences = []
    y_sequences = []
    for user_sequences in sequences_data:
        X_sequences.extend(user_sequences[0])
        y_sequences.extend(user_sequences[1])
    
    # Convert to numpy arrays
    X = np.array(X_sequences)
    y = np.array(y_sequences)
    
    return X, y


In [301]:
def random_forest_model(X, y):
    """
    Train and evaluate Random Forest Classifier
    
    Parameters:
    -----------
    X : pd.DataFrame
        Feature matrix
    y : pd.Series
        Target variable
    
    Returns:
    --------
    dict
        Model performance metrics
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Random Forest Classifier
    rf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=4,
        random_state=42,
        class_weight='balanced'  # Handle class imbalance
    )
    
    # Train the model
    rf_model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_pred = rf_model.predict(X_test_scaled)
    
    # Performance metrics
    performance = {
        'accuracy': accuracy_score(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred),
        'feature_importances': rf_model.feature_importances_
    }
    
    # Feature Importance Analysis
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("Top 10 Most Important Features:")
    print(feature_importance.head(10))
    
    return performance


def lstm_model(X, y):
    """
    Train and evaluate LSTM model
    
    Parameters:
    -----------
    X : np.array
        Input sequences
    y : np.array
        Target variable
    
    Returns:
    --------
    dict
        Model performance metrics
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Build LSTM model
    model = Sequential([
        LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
        Dropout(0.2),
        LSTM(32),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    
    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    # Early stopping
    early_stop = EarlyStopping(
        monitor='val_loss', 
        patience=10, 
        restore_best_weights=True
    )
    
    # Train the model
    history = model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=50,
        batch_size=32,
        callbacks=[early_stop],
        verbose=0
    )
    
    # Evaluate the model
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    
    performance = {
        'accuracy': accuracy_score(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred)
    }
    
    return performance



In [302]:
def xgboost_parameter_tuning(X, y, user_groups=None):
    """
    Perform comprehensive parameter tuning for XGBoost
    
    Parameters:
    -----------
    X : pd.DataFrame
        Feature matrix
    y : pd.Series
        Target variable
    user_groups : pd.Series, optional
        User group identifiers for group-based cross-validation
    
    Returns:
    --------
    dict
        Tuning results and best model
    """
    # Prepare cross-validation strategy
    if user_groups is not None:
        # Create a mapping of unique user groups to integers
        unique_groups = user_groups.unique()
        group_map = {group: i for i, group in enumerate(unique_groups)}
        group_indices = np.array([group_map[g] for g in user_groups])
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    else:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Base XGBoost Classifier
    base_model = xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )
    
    # Randomized Search Parameter Grid
    random_search_params = {
        'n_estimators': randint(50, 300),
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.3),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.5, 0.5),
        'min_child_weight': randint(1, 7),
        'gamma': uniform(0, 1)
    }
    
    # Randomized Search
    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=random_search_params,
        n_iter=100,  # Number of parameter settings sampled
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=2
    )
    
    # Fit Randomized Search
    random_search.fit(X, y)
    
    # Grid Search with Refined Parameters
    grid_search_params = {
        'n_estimators': [
            max(50, random_search.best_params_['n_estimators'] - 50),
            random_search.best_params_['n_estimators'],
            min(300, random_search.best_params_['n_estimators'] + 50)
        ],
        'max_depth': [
            max(3, random_search.best_params_['max_depth'] - 1),
            random_search.best_params_['max_depth'],
            min(10, random_search.best_params_['max_depth'] + 1)
        ],
        'learning_rate': [
            max(0.01, random_search.best_params_['learning_rate'] - 0.1),
            random_search.best_params_['learning_rate'],
            min(0.3, random_search.best_params_['learning_rate'] + 0.1)
        ]
    }
    
    # Grid Search
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=grid_search_params,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=2
    )
    
    # Fit Grid Search
    grid_search.fit(X, y)
    
    # Prepare results
    tuning_results = {
        'random_search_best_params': random_search.best_params_,
        'random_search_best_score': random_search.best_score_,
        'grid_search_best_params': grid_search.best_params_,
        'grid_search_best_score': grid_search.best_score_,
        'best_model': grid_search.best_estimator_
    }
    
    # Print detailed results
    print("\n--- Randomized Search Best Parameters ---")
    for param, value in random_search.best_params_.items():
        print(f"{param}: {value}")
    print(f"Best Score: {random_search.best_score_:.4f}")
    
    print("\n--- Grid Search Best Parameters ---")
    for param, value in grid_search.best_params_.items():
        print(f"{param}: {value}")
    print(f"Best Score: {grid_search.best_score_:.4f}")
    
    # Feature Importance for Best Model
    plt.figure(figsize=(12, 8))
    xgb.plot_importance(tuning_results['best_model'], height=0.8, max_num_features=10)
    plt.title('Feature Importance for Best Model', fontsize=16)
    plt.tight_layout()
    plt.savefig('best_model_feature_importance.png')
    plt.close()
    
    return tuning_results


In [303]:
def model_analysis(df, X_train, X_test, y_train, y_test, best_model, feature_cols):
    """
    Comprehensive model analysis function
    
    Parameters:
    -----------
    df : pd.DataFrame
        Original dataframe
    X_train : pd.DataFrame
        Training features
    X_test : pd.DataFrame
        Testing features
    y_train : pd.Series
        Training target
    y_test : pd.Series
        Testing target
    best_model : XGBClassifier
        Trained XGBoost model
    feature_cols : list
        List of feature column names
    """
    # 1. Feature Importance Analysis
    plt.figure(figsize=(12, 8))
    xgb.plot_importance(best_model, height=0.8, max_num_features=len(feature_cols))
    plt.title('Feature Importance (Weight)', fontsize=16)
    plt.tight_layout()
    plt.savefig('feature_importance_weight.png')
    plt.close()

    plt.figure(figsize=(12, 8))
    xgb.plot_importance(best_model, height=0.8, importance_type='gain', max_num_features=len(feature_cols))
    plt.title('Feature Importance (Gain)', fontsize=16)
    plt.tight_layout()
    plt.savefig('feature_importance_gain.png')
    plt.close()

    # 2. SHAP Values for deeper insights
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test)

    # Summary plot
    plt.figure(figsize=(12, 8))
    shap.summary_plot(shap_values, X_test, feature_names=feature_cols, show=False)
    plt.title('SHAP Summary Plot', fontsize=16)
    plt.tight_layout()
    plt.savefig('shap_summary.png')
    plt.close()

    # Dependence plots for top features
    top_features = [feature_cols[i] for i in np.argsort(np.abs(shap_values).mean(0))[-3:]]
    for feature in top_features:
        plt.figure(figsize=(12, 8))
        shap.dependence_plot(feature, shap_values, X_test, feature_names=feature_cols, show=False)
        plt.title(f'SHAP Dependence Plot: {feature}', fontsize=16)
        plt.tight_layout()
        plt.savefig(f'shap_dependence_{feature}.png')
        plt.close()

    # 3. Analysis of misclassifications
    y_pred = best_model.predict(X_test)
    misclassified_idx = np.where(y_pred != y_test)[0]

    misclassified_df = pd.DataFrame({
        'true_label': y_test.iloc[misclassified_idx],
        'predicted': y_pred[misclassified_idx],
        'probability': best_model.predict_proba(X_test)[:, 1][misclassified_idx]
    })

    # Add feature values for misclassified instances
    for feature in feature_cols:
        misclassified_df[feature] = X_test.iloc[misclassified_idx][feature].values

    # 4. False Negatives Analysis
    false_negatives = misclassified_df[misclassified_df['true_label'] == 1]
    print(f"Number of false negatives: {len(false_negatives)}")

    if len(false_negatives) > 0:
        print("\nAverage feature values for false negatives:")
        for feature in feature_cols:
            print(f"{feature}: {false_negatives[feature].mean():.4f}")
        
        print("\nOverall average feature values:")
        for feature in feature_cols:
            print(f"{feature}: {X_test[feature].mean():.4f}")

    # 5. Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title('Confusion Matrix (Counts)', fontsize=16)
    plt.ylabel('True Label', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=14)
    plt.savefig('confusion_matrix_counts.png')
    plt.close()

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_percent, annot=True, fmt='.1%', cmap='Blues', cbar=False)
    plt.title('Confusion Matrix (Percentages)', fontsize=16)
    plt.ylabel('True Label', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=14)
    plt.savefig('confusion_matrix_percent.png')
    plt.close()

    return misclassified_df

In [305]:

def main(df):
    # Prepare data
    X, y, user_groups = prepare_data(df)
    
    # Perform XGBoost Parameter Tuning
    tuning_results = xgboost_parameter_tuning(X, y, user_groups)
    
    # Use the best model for further analysis
    best_model = tuning_results['best_model']
    
    # Split data for final model evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Fit best model
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)
    print("Best Model Performance:")
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Feature Importances:")
    


     # Continue with other models (optional)
    # Random Forest Model
    print("\nRunning Random Forest Model...")
    rf_results = random_forest_model(X, y)
    print("Random Forest Results:")
    print(rf_results['classification_report'])
    
    # LSTM Model
    print("\nRunning LSTM Model...")
    X_lstm, y_lstm = prepare_lstm_data(df)
    lstm_results = lstm_model(X_lstm, y_lstm)
    print("LSTM Results:")
    print(lstm_results['classification_report'])
    
    
    # Perform Model Analysis with Best Model
    feature_cols = list(X.columns)
    misclassified_df = model_analysis(
        df, 
        X_train, 
        X_test, 
        y_train, 
        y_test, 
        best_model, 
        feature_cols
    )
    
    return tuning_results


main(df.dropna())

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Parameters: { "use_label_encoder" } are not used.



Fitting 5 folds for each of 27 candidates, totalling 135 fits


Parameters: { "use_label_encoder" } are not used.




--- Randomized Search Best Parameters ---
colsample_bytree: 0.6504391549083848
gamma: 0.2848404943774676
learning_rate: 0.021066084206359838
max_depth: 3
min_child_weight: 2
n_estimators: 179
subsample: 0.7644148053272926
Best Score: 0.9417

--- Grid Search Best Parameters ---
learning_rate: 0.01
max_depth: 3
n_estimators: 229
Best Score: 0.9406


Parameters: { "use_label_encoder" } are not used.



Best Model Performance:
              precision    recall  f1-score   support

           0       0.62      0.45      0.52        73
           1       0.92      0.96      0.94       479

    accuracy                           0.89       552
   macro avg       0.77      0.71      0.73       552
weighted avg       0.88      0.89      0.88       552

Accuracy: 0.8913043478260869
ROC AUC Score: 0.7051505705379357
Confusion Matrix:
[[ 33  40]
 [ 20 459]]
Feature Importances:

Running Random Forest Model...
Top 10 Most Important Features:
                      feature  importance
49             exp_smooth_14d    0.131981
44              exp_smooth_7d    0.106251
39              exp_smooth_3d    0.077743
47  activity_rolling_14d_mean    0.060192
48   activity_rolling_14d_std    0.051428
35        relative_rolling_3d    0.042229
45       relative_rolling_14d    0.038293
38    activity_rolling_3d_std    0.038037
37   activity_rolling_3d_mean    0.037877
32                activity_cv    0.03764

  sequences_data = df.groupby(user_id_col).apply(create_sequences)
  super().__init__(**kwargs)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step
LSTM Results:
              precision    recall  f1-score   support

           0       0.70      0.46      0.56        95
           1       0.89      0.96      0.92       443

    accuracy                           0.87       538
   macro avg       0.80      0.71      0.74       538
weighted avg       0.86      0.87      0.86       538

Number of false negatives: 20

Average feature values for false negatives:
assigned: 0.0000
closed: 0.4500
demilestoned: 0.0000
issue_comment: 1.2000
issue_creation: 0.4500
labeled: 0.2000
mentioned: 0.0000
milestoned: 0.0000
pr_comment: 0.2000
pr_commit: 1.9000
pr_creation: 0.0500
pr_review_approved: 0.1000
pr_review_commented: 0.4000
referenced: 0.0000
renamed: 0.0000
reopened: 0.0500
subscribed: 0.0000
transferred: 0.0000
unlabeled: 0.0000
total_activity_issues: 2.3500
total_activity_pulls: 2.6500
total_activity: 5.0000
is_active: 0.6000
row_gini: 0.5450
interaction_entropy: 

{'random_search_best_params': {'colsample_bytree': 0.6504391549083848,
  'gamma': 0.2848404943774676,
  'learning_rate': 0.021066084206359838,
  'max_depth': 3,
  'min_child_weight': 2,
  'n_estimators': 179,
  'subsample': 0.7644148053272926},
 'random_search_best_score': 0.9416974678274469,
 'grid_search_best_params': {'learning_rate': 0.01,
  'max_depth': 3,
  'n_estimators': 229},
 'grid_search_best_score': 0.9406192843842259,
 'best_model': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=0.01, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
      

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

In [22]:
data= all_activity
data["year_month"] = data["year_month"].dt.to_timestamp()

# Sort Data
data = data.sort_values(by=["created_by", "year_month"])

# Create Lag and Rolling Features
data["prev_month_activity"] = data.groupby("created_by")["count"].shift(1)
data["rolling_3m_avg"] = data.groupby("created_by")["count"].shift(1).rolling(window=3, min_periods=1).mean()
data["month"] = data["year_month"].dt.month  # Add month as a feature

# Target Variable: Next Month's Activity
data["next_month_activity"] = data.groupby("created_by")["count"].shift(-1)

# Drop NaN values (last months won't have next month activity)
data = data.dropna()

# Define Features & Target
features = ["prev_month_activity", "rolling_3m_avg", "month"]
X = data[features]
y = data["next_month_activity"]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# Train XGBoost Model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make Predictions
y_pred_xgb = xgb_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Evaluate Models
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print(f"XGBoost MAE: {mae_xgb}")
print(f"Random Forest MAE: {mae_rf}")

KeyError: 'year_month'

In [74]:
# Define feature columns for modeling
feature_cols = [
    'past_4_weeks_issues',
    'past_4_weeks_pulls',
    'activity_mix_pulls',
    'activity_mix_issues',
    'active_streak',
    'exp_smooth_activity',
    'avg_activity',
    'activity_burst',
    'issue_to_pull_ratio'
]

# Sample split code (for reference)
from sklearn.model_selection import train_test_split

# Remove NaN values that might have been introduced
df_model = df.dropna(subset=feature_cols + ['target'])

# Train-test split
X = df_model[feature_cols]
y = df_model['target']

In [75]:
# Make sure we have a valid dataset
df_model = df.dropna(subset=feature_cols + ['target'])

# Print distribution of target variable
print(f"Target distribution: {df_model['target'].value_counts(normalize=True)}")

# Split the data
X = df_model[feature_cols]
y = df_model['target']

# Create a stratified train/validation/test split
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


Target distribution: target
1    0.907399
0    0.092601
Name: proportion, dtype: float64
Training set size: 2676
Validation set size: 892
Test set size: 892


In [76]:

# Train a basic XGBoost model
basic_model = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
basic_model.fit(X_train, y_train)

# Evaluate the basic model
val_preds = basic_model.predict(X_val)
val_probs = basic_model.predict_proba(X_val)[:, 1]
print("\n--- Basic Model Validation Results ---")
print(classification_report(y_val, val_preds))
print(f"ROC AUC: {roc_auc_score(y_val, val_probs):.4f}")

# Feature importance plot
plt.figure(figsize=(10, 6))
features = X_train.columns
importances = basic_model.feature_importances_
indices = np.argsort(importances)

plt.title('Feature Importance')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Grid search for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [1, sum(y_train == 0) / sum(y_train == 1)]  # Handle class imbalance
}

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Set up grid search
grid_search = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    ),
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Print best parameters
print("\n--- Grid Search Results ---")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")


Parameters: { "use_label_encoder" } are not used.




--- Basic Model Validation Results ---
              precision    recall  f1-score   support

           0       0.63      0.55      0.59        83
           1       0.95      0.97      0.96       809

    accuracy                           0.93       892
   macro avg       0.79      0.76      0.78       892
weighted avg       0.92      0.93      0.93       892

ROC AUC: 0.8770
Fitting 5 folds for each of 144 candidates, totalling 720 fits

--- Grid Search Results ---
Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 0.8}
Best cross-validation score: 0.9088


Parameters: { "use_label_encoder" } are not used.



In [None]:
# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on validation set
val_preds_best = best_model.predict(X_val)
val_probs_best = best_model.predict_proba(X_val)[:, 1]
print("\n--- Best Model Validation Results ---")
print(classification_report(y_val, val_preds_best))
print(f"ROC AUC: {roc_auc_score(y_val, val_probs_best):.4f}")

# Final evaluation on test set
test_preds = best_model.predict(X_test)
test_probs = best_model.predict_proba(X_test)[:, 1]
print("\n--- Final Test Results ---")
print(classification_report(y_test, test_preds))
print(f"ROC AUC: {roc_auc_score(y_test, test_probs):.4f}")

# Confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, test_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('confusion_matrix.png')
plt.close()

# Precision-Recall curve for threshold selection
plt.figure(figsize=(8, 6))
precision, recall, thresholds = precision_recall_curve(y_test, test_probs)
plt.plot(recall, precision, marker='.')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid(True)
plt.savefig('precision_recall_curve.png')
plt.close()

# Find optimal threshold based on F1 score
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
print(f"\nOptimal threshold for F1 score: {optimal_threshold:.4f}")

# Apply optimal threshold
optimal_preds = (test_probs >= optimal_threshold).astype(int)
print("\n--- Results with Optimal Threshold ---")
print(classification_report(y_test, optimal_preds))

# Save the model for future predictions
best_model.save_model('xgboost_user_activity_prediction.json')

# Example of how to use the model for predictions on new data
def predict_user_inactivity(user_data, model, threshold=0.5):
    """
    Predicts whether users will be inactive next week
    
    Parameters:
    user_data (DataFrame): DataFrame with feature columns
    model: Trained XGBoost model
    threshold: Classification threshold
    
    Returns:
    DataFrame with prediction results
    """
    # Ensure user_data has all required features
    missing_features = set(feature_cols) - set(user_data.columns)
    if missing_features:
        raise ValueError(f"Missing features in input data: {missing_features}")
    
    # Generate predictions
    probs = model.predict_proba(user_data[feature_cols])[:, 1]
    predictions = (probs >= threshold).astype(int)
    
    # Create results dataframe
    results = user_data[['created_by', 'week_start']].copy()
    results['inactive_probability'] = probs
    results['predicted_inactive'] = predictions
    
    return results


--- Best Model Validation Results ---
              precision    recall  f1-score   support

           0       0.69      0.52      0.59        83
           1       0.95      0.98      0.96       809

    accuracy                           0.93       892
   macro avg       0.82      0.75      0.78       892
weighted avg       0.93      0.93      0.93       892

ROC AUC: 0.9012

--- Final Test Results ---
              precision    recall  f1-score   support

           0       0.73      0.48      0.58        83
           1       0.95      0.98      0.96       809

    accuracy                           0.93       892
   macro avg       0.84      0.73      0.77       892
weighted avg       0.93      0.93      0.93       892

ROC AUC: 0.9063

Optimal threshold for F1 score: 0.3951

--- Results with Optimal Threshold ---
              precision    recall  f1-score   support

           0       0.88      0.36      0.51        83
           1       0.94      1.00      0.97       809

   

Feature Importance Analysis: Tells you which features are driving predictions

Both weight-based and gain-based importance visualizations
SHAP values for more accurate feature contribution analysis


Misclassification Analysis: Deep dive into where the model fails

Detailed breakdown of false negatives (users we failed to predict would become inactive)
Comparison between misclassified cases and overall averages


Individual User Analysis: The analyze_user_predictions() function

Time series visualization of specific users' activity
Shows when the model predicted correctly vs. incorrectly
Perfect for understanding high-profile users like 'hadley', 'gaborcsardi', etc.


Insightful Explanations: The explain_individual_prediction() function

Generates readable explanations for any prediction
Identifies the top factors contributing to each prediction
Perfect for explaining to stakeholders why a particular prediction was made

im working of feature diagnostics and more data colling about the users watches and stars

I working on more data collection too

In [38]:
# Assuming 'best_model' is your trained XGBoost model
# and X_test, y_test are your test data

# 1. Feature Importance Analysis
plt.figure(figsize=(12, 8))
plot_importance(best_model, height=0.8, max_num_features=len(feature_cols))
plt.title('Feature Importance (Weight)', fontsize=16)
plt.tight_layout()
plt.savefig('feature_importance_weight.png')
plt.close()

# Gain-based importance (how much each feature improves the model)
plt.figure(figsize=(12, 8))
plot_importance(best_model, height=0.8, importance_type='gain', max_num_features=len(feature_cols))
plt.title('Feature Importance (Gain)', fontsize=16)
plt.tight_layout()
plt.savefig('feature_importance_gain.png')
plt.close()

# 2. SHAP Values for deeper insights
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Summary plot
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, feature_names=feature_cols, show=False)
plt.title('SHAP Summary Plot', fontsize=16)
plt.tight_layout()
plt.savefig('shap_summary.png')
plt.close()

# Dependence plots for top features
top_features = [feature_cols[i] for i in np.argsort(np.abs(shap_values).mean(0))[-3:]]
for feature in top_features:
    plt.figure(figsize=(12, 8))
    shap.dependence_plot(feature, shap_values, X_test, feature_names=feature_cols, show=False)
    plt.title(f'SHAP Dependence Plot: {feature}', fontsize=16)
    plt.tight_layout()
    plt.savefig(f'shap_dependence_{feature}.png')
    plt.close()

# 3. Analysis of misclassifications
y_pred = best_model.predict(X_test)
misclassified_idx = np.where(y_pred != y_test)[0]

misclassified_df = pd.DataFrame({
    'true_label': y_test.iloc[misclassified_idx],
    'predicted': y_pred[misclassified_idx],
    'probability': best_model.predict_proba(X_test.iloc[misclassified_idx])[:, 1]
})

# Add feature values for misclassified instances
for feature in feature_cols:
    misclassified_df[feature] = X_test.iloc[misclassified_idx][feature].values

# Add user identifiers if available in the test set
if 'created_by' in df_model.columns:
    misclassified_df['user'] = df_model.iloc[X_test.index[misclassified_idx]]['created_by'].values

# 4. Analysis of false negatives (users we predicted would be active but weren't)
false_negatives = misclassified_df[misclassified_df['true_label'] == 1]
print(f"Number of false negatives: {len(false_negatives)}")

if len(false_negatives) > 0:
    # Print average feature values for false negatives
    print("\nAverage feature values for false negatives:")
    for feature in feature_cols:
        print(f"{feature}: {false_negatives[feature].mean():.4f}")
    
    # Compare with overall average
    print("\nOverall average feature values:")
    for feature in feature_cols:
        print(f"{feature}: {X_test[feature].mean():.4f}")

# 5. Confusion matrix with percentages
cm = confusion_matrix(y_test, y_pred)
cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix (Counts)', fontsize=16)
plt.ylabel('True Label', fontsize=14)
plt.xlabel('Predicted Label', fontsize=14)
plt.savefig('confusion_matrix_counts.png')
plt.close()

plt.figure(figsize=(10, 8))
sns.heatmap(cm_percent, annot=True, fmt='.1%', cmap='Blues', cbar=False)
plt.title('Confusion Matrix (Percentages)', fontsize=16)
plt.ylabel('True Label', fontsize=14)
plt.xlabel('Predicted Label', fontsize=14)
plt.savefig('confusion_matrix_percent.png')
plt.close()

# 6. Individual user prediction analysis
def analyze_user_predictions(user_id, df=df_model, model=best_model, features=feature_cols):
    """Analyze predictions for a specific user over time"""
    user_data = df[df['created_by'] == user_id].copy()
    
    if len(user_data) == 0:
        return "User not found in dataset"
    
    # Make predictions
    user_data['predicted_prob'] = model.predict_proba(user_data[features])[:, 1]
    user_data['predicted_inactive'] = model.predict(user_data[features])
    
    # Create time series plot
    plt.figure(figsize=(14, 8))
    
    # Plot activity and prediction
    plt.plot(user_data['week_start'], user_data['total_activity'], 
             label='Actual Activity', marker='o', color='blue')
    
    # Add target (actual inactivity next week)
    inactive_weeks = user_data[user_data['target'] == 1]['week_start']
    if len(inactive_weeks) > 0:
        plt.scatter(inactive_weeks, 
                   [0] * len(inactive_weeks), 
                   color='red', s=100, marker='x',
                   label='Actually Inactive Next Week')
    
    # Add predictions
    predicted_inactive = user_data[user_data['predicted_inactive'] == 1]['week_start']
    if len(predicted_inactive) > 0:
        plt.scatter(predicted_inactive, 
                   [0] * len(predicted_inactive), 
                   color='orange', s=100, marker='+', 
                   label='Predicted Inactive Next Week')
    
    # Add prediction probability as a secondary axis
    ax2 = plt.twinx()
    ax2.plot(user_data['week_start'], user_data['predicted_prob'], 
             color='green', linestyle='--', label='Inactivity Probability')
    ax2.set_ylabel('Inactivity Probability', color='green', fontsize=12)
    ax2.set_ylim(0, 1)
    
    plt.title(f'User Activity and Inactivity Predictions: {user_id}', fontsize=16)
    plt.xlabel('Week', fontsize=12)
    plt.ylabel('Activity Count', fontsize=12)
    plt.grid(True, alpha=0.3)
    
    # Combine legends
    lines1, labels1 = plt.gca().get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')
    
    plt.tight_layout()
    plt.savefig(f'user_prediction_{user_id}.png')
    plt.close()
    
    return user_data


Number of false negatives: 15

Average feature values for false negatives:
past_4_weeks_issues: 16.0000
past_4_weeks_pulls: 6.6667
activity_mix_pulls: 0.0000
activity_mix_issues: 0.7333
active_streak: 2.3333
exp_smooth_activity: 4.6745
avg_activity: 5.6946
activity_burst: -0.7973
issue_to_pull_ratio: 10.0000

Overall average feature values:
past_4_weeks_issues: 1.8531
past_4_weeks_pulls: 0.9137
activity_mix_pulls: 0.0240
activity_mix_issues: 0.0747
active_streak: 0.3587
exp_smooth_activity: 0.6722
avg_activity: 0.7475
activity_burst: -0.1299
issue_to_pull_ratio: 9.7143


<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

In [21]:

# Example: Analyze predictions for specific users
for user in ['hadley', 'gaborcsardi', 'jennybc', 'lionel-']:
    analyze_user_predictions(user)

# 7. Create a dashboard of key insights
def create_insight_dashboard(model, X_test, y_test, feature_cols):
    """Create a comprehensive dashboard of model insights"""
    fig = plt.figure(figsize=(20, 24))
    grid = plt.GridSpec(4, 2, figure=fig)
    
    # 1. Confusion Matrix
    ax1 = fig.add_subplot(grid[0, 0])
    cm = confusion_matrix(y_test, model.predict(X_test))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1)
    ax1.set_title('Confusion Matrix', fontsize=14)
    ax1.set_ylabel('True Label', fontsize=12)
    ax1.set_xlabel('Predicted Label', fontsize=12)
    
    # 2. Feature Importance
    ax2 = fig.add_subplot(grid[0, 1])
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    ax2.barh(range(len(indices)), importances[indices], color='b', align='center')
    ax2.set_yticks(range(len(indices)))
    ax2.set_yticklabels([feature_cols[i] for i in indices])
    ax2.set_title('Feature Importance', fontsize=14)
    ax2.set_xlabel('Relative Importance', fontsize=12)
    
    # 3. Precision-Recall Curve
    ax3 = fig.add_subplot(grid[1, 0])
    from sklearn.metrics import precision_recall_curve
    precs, recs, thresholds = precision_recall_curve(y_test, model.predict_proba(X_test)[:, 1])
    ax3.plot(recs, precs, 'b-', linewidth=2)
    ax3.set_title('Precision-Recall Curve', fontsize=14)
    ax3.set_xlabel('Recall', fontsize=12)
    ax3.set_ylabel('Precision', fontsize=12)
    ax3.grid(True)
    
    # 4. ROC Curve
    ax4 = fig.add_subplot(grid[1, 1])
    from sklearn.metrics import roc_curve, auc
    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    ax4.plot(fpr, tpr, 'b-', linewidth=2, label=f'AUC = {roc_auc:.3f}')
    ax4.plot([0, 1], [0, 1], 'k--')
    ax4.set_title('ROC Curve', fontsize=14)
    ax4.set_xlabel('False Positive Rate', fontsize=12)
    ax4.set_ylabel('True Positive Rate', fontsize=12)
    ax4.legend(loc='lower right')
    ax4.grid(True)
    
    # 5. Feature Distribution: Active vs Inactive
    features_to_plot = [feature_cols[i] for i in indices[:4]]  # Top 4 features
    for i, feature in enumerate(features_to_plot):
        ax = fig.add_subplot(grid[2 + i//2, i%2])
        active_vals = X_test[y_test == 0][feature]
        inactive_vals = X_test[y_test == 1][feature]
        sns.kdeplot(active_vals, ax=ax, label='Active Users', fill=True, alpha=0.3)
        sns.kdeplot(inactive_vals, ax=ax, label='Inactive Users', fill=True, alpha=0.3)
        ax.set_title(f'Distribution of {feature}', fontsize=14)
        ax.legend()
        ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('model_insights_dashboard.png', dpi=300)
    plt.close()

create_insight_dashboard(best_model, X_test, y_test, feature_cols)

# 8. Create explanations for individual predictions
def explain_individual_prediction(user_id, week, model, feature_cols):
    """Generate a human-readable explanation for a specific prediction"""
    # Get user data for the specified week
    user_week_data = df_model[(df_model['created_by'] == user_id) & 
                             (df_model['week_start'] == pd.to_datetime(week))]
    
    if len(user_week_data) == 0:
        return f"No data found for user {user_id} in week {week}"
    
    # Get features for the prediction
    features = user_week_data[feature_cols].iloc[0]
    
    # Make prediction
    prob = model.predict_proba(features.values.reshape(1, -1))[0, 1]
    prediction = "INACTIVE" if prob > 0.5 else "ACTIVE"
    
    # Get SHAP values for explanation
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(features.values.reshape(1, -1))[0]
    
    # Get top 3 contributing features
    feature_importance = list(zip(feature_cols, shap_values))
    feature_importance.sort(key=lambda x: abs(x[1]), reverse=True)
    top_features = feature_importance[:3]
    
    # Build explanation
    explanation = f"User {user_id} is predicted to be {prediction} next week with {prob:.1%} probability.\n\n"
    explanation += "Top contributing factors:\n"
    
    for feature, value in top_features:
        feature_value = features[feature]
        if value > 0:
            explanation += f"- {feature} = {feature_value:.2f} (increases likelihood of inactivity)\n"
        else:
            explanation += f"- {feature} = {feature_value:.2f} (decreases likelihood of inactivity)\n"
    
    return explanation

# Example usage
print(explain_individual_prediction('hadley', '2023-01-01', best_model, feature_cols))

KeyError: 'week_start'

<Figure size 1400x800 with 0 Axes>

# User Data Extraction

In [31]:
users = df['created_by'].unique()
users

array(['MichaelChirico', 'gaborcsardi', 'hadley', 'jennybc', 'krlmlr',
       'lionel-', 'olivroy', 'romainfrancois', 'rundel', 'salim-b'],
      dtype=object)

In [36]:
# Authenticate
secret = "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"
g = Github(secret)

def fetch_user_events(username, event_type):
    """Fetch user events from GitHub and filter by event type (handles pagination)."""
    try:
        user = g.get_user(username)
        events = user.get_events()  # Gets public events (pagination happens here)
        
        # Collect filtered events
        filtered_events = []
        for event in events:
            if event.type == event_type:
                filtered_events.append(event)
            if len(filtered_events) >= 100:  # Limit to avoid API overuse
                break
        
        return filtered_events
    except Exception as e:
        print(f"Error fetching events for {username}: {e}")
        return []

def count_weekly_events(events):
    """Count weekly occurrences of events."""
    weekly_counts = {}
    now = datetime.utcnow()
    
    for event in events:
        event_date = event.created_at
        week = (now - event_date).days // 7
        weekly_counts[week] = weekly_counts.get(week, 0) + 1
        
    return weekly_counts

def collect_user_engagement(username):
    """Collect weekly engagement metrics for a user."""
    star_events = fetch_user_events(username, "StarEvent")  # Starring a repo
    fork_events = fetch_user_events(username, "ForkEvent")  # Forking a repo
    watch_events = fetch_user_events(username, "WatchEvent")  # Watching a repo (deprecated, may be empty)

    return {
        "weekly_stars": count_weekly_events(star_events),
        "weekly_forks": count_weekly_events(fork_events),
        "weekly_watches": count_weekly_events(watch_events),
    }

# Check rate-limiting before running
print(f"API Rate Limit: {g.rate_limiting}")

# Load your existing dataframe with users
users = df['created_by'].unique()

# Collect engagement data for each user
engagement_data = {}
for user in users:
    print(f"Fetching data for {user}...")
    engagement_data[user] = collect_user_engagement(user)

# Convert to DataFrame and save
engagement_df = pd.DataFrame(engagement_data).T
engagement_df.to_csv("user_engagement_data.csv", index=True)

print("User engagement data collected and saved successfully!")

API Rate Limit: (4744, 5000)
Fetching data for MichaelChirico...
Fetching data for gaborcsardi...
Fetching data for hadley...
Fetching data for jennybc...
Fetching data for krlmlr...
Fetching data for lionel-...
Fetching data for olivroy...
Fetching data for romainfrancois...
Fetching data for rundel...
Fetching data for salim-b...
User engagement data collected and saved successfully!


In [37]:
engagement_df

Unnamed: 0,weekly_stars,weekly_forks,weekly_watches
MichaelChirico,{},{0: 7},{}
gaborcsardi,{},{0: 1},{}
hadley,{},{2: 3},{}
jennybc,{},{},{}
krlmlr,{},{},{1: 1}
lionel-,{},{0: 2},{}
olivroy,{},{2: 4},{}
romainfrancois,{},{3: 1},{}
rundel,{},{3: 1},{}
salim-b,{},"{0: 5, 1: 3, 2: 2, 3: 2}","{0: 2, 1: 4, 2: 3, 3: 17}"
