In [55]:
import pandas as pd
import pickle as pkl
import os
from pathlib import Path

from sklearn import tree

import utils

import matplotlib.pyplot as plt
import graphviz

In [56]:
def prepare_data(repo_name,directory,goal):
    """Load one project's CSV and move the goal column to the last position.

    Parameters
    ----------
    repo_name : str
        File name of the project CSV (e.g. ``'project0000.csv'``).
    directory : str
        Folder that contains the CSV. A trailing path separator is optional.
    goal
        Identifier handed to ``utils.get_goal`` to resolve the name of the
        goal (target) column.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without the ``'dates'`` column; all remaining
        columns keep their original order, followed by the goal column.

    Raises
    ------
    KeyError
        If the CSV has no ``'dates'`` column.
    ValueError
        If the resolved goal column is not present in the CSV.
    """
    # os.path.join works whether or not `directory` ends with a separator;
    # plain string concatenation silently produced a wrong path otherwise.
    csv_path = os.path.join(directory, repo_name)
    df_raw = pd.read_csv(csv_path, sep=',')
    df_raw = df_raw.drop(columns=['dates'])

    last_col = utils.get_goal(goal)
    if last_col not in df_raw.columns:
        # Same exception type as the former list.remove() failure, but with a
        # message that names the missing column and the offending file.
        raise ValueError(
            "goal column %r not found in %s" % (last_col, csv_path)
        )

    # Feature columns first (original order), goal column last.
    cols = [col for col in df_raw.columns if col != last_col]
    df_adjust = df_raw[cols + [last_col]]
    return df_adjust

def plot_tree(clf,data,cluster_id, goal):
    """Render a fitted decision tree and save it as a PDF for one cluster.

    The figure is written to
    ``../results/graphs/<goal>/month_6_models/cluster_<cluster_id>.pdf``;
    the output directory is created if it does not exist yet.

    Parameters
    ----------
    clf
        Fitted tree estimator accepted by ``sklearn.tree.plot_tree``.
    data : pandas.DataFrame
        Frame whose columns, except the last one, are used as feature names
        for the tree nodes.
    cluster_id
        Cluster identifier; only used (via ``str``) in the output file name.
    goal : str
        Goal name; used as a sub-directory of the output path.

    Returns
    -------
    None

    Notes
    -----
    The figure is closed after saving so repeated calls (one per cluster and
    goal) do not accumulate open figures. As a consequence the plot is not
    shown inline; open the saved PDF instead.
    """
    fig = plt.figure(figsize=(25,20))
    try:
        # A plain list of names is accepted by every scikit-learn release,
        # whereas a pandas Index is rejected by some versions' validation.
        tree.plot_tree(clf,
                       feature_names=list(data.columns[:-1]),
                       filled=True)
        data_path = Path('../results/graphs/' + goal + '/month_6_models/')
        # exist_ok avoids the check-then-create race of is_dir() + makedirs().
        data_path.mkdir(parents=True, exist_ok=True)
        fig.savefig(data_path / ('cluster_' + str(cluster_id) + '.pdf'))
    finally:
        # Always release the (large) figure, even if drawing or saving fails.
        plt.close(fig)

In [108]:
# Run configuration, kept in one place instead of scattered literals.
N_GOALS = 7   # number of goals resolved through utils.get_goal(0..N_GOALS-1)
MONTH = 6     # month used in the results directory name
LEVEL = 1     # hierarchy level of the bellwether table / model pickles
project = 'project0000.csv'  # any project file; only its column layout is used

goals = []
goal_features = []  # one Series of averaged feature importances per goal
for goal_idx in range(N_GOALS):
    goal = utils.get_goal(goal_idx)
    goals.append(goal)
    print('Running Goal:', goal)

    # Loaded only to recover the feature names (all columns but the last).
    data = prepare_data(project,'../data/data_use/',goal_idx)

    path = '../results/month_' + str(MONTH) + '_models/' + goal
    bell_df = pd.read_csv(path + '/bellwether_level_' + str(LEVEL) + '.csv')
    bell_df = bell_df.drop('Unnamed: 0', axis = 1)

    # After the drop, column 0 holds the cluster id and column 1 the key of
    # the selected (bellwether) model inside that cluster's pickle.
    imp = []
    for cluster_id, bellwether in zip(bell_df.iloc[:, 0], bell_df.iloc[:, 1]):
        model_file = (path + '/level_' + str(LEVEL) + '/cluster_'
                      + str(cluster_id) + '_performance_models.pkl')
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files produced by this project's own training runs.
        with open(model_file, 'rb') as handle:
            models = pkl.load(handle)
        model = models[bellwether]
#         plot_tree(model,data,cluster_id,goal)
        imp.append(list(model.feature_importances_) + [cluster_id])

    cols = list(data.columns[:-1]) + ['cluster']
    # Importances are rounded to 2 decimals BEFORE averaging; kept as in the
    # original analysis so the exported numbers do not change.
    imp_df = pd.DataFrame(imp, columns = cols).round(2)
#     print(imp_df.iloc[:,:-1].sum(axis = 1))
    # Mean importance of each feature across the clusters of this goal
    # (the trailing 'cluster' column is excluded).
    features = imp_df.iloc[:,:-1].sum(axis = 0)/imp_df.shape[0]
    goal_features.append(features)

# Assemble the table once: rows are features, one column per goal. Each goal
# omits its own target from the features, so the rows are the union of all
# feature names and a goal's own row is NaN in its column.
feature_imp = pd.concat(goal_features, axis = 1, keys = goals)


Running Goal: monthly_commits
Running Goal: monthly_contributors
Running Goal: monthly_open_PRs
Running Goal: monthly_closed_PRs
Running Goal: monthly_open_issues
Running Goal: monthly_closed_issues
Running Goal: monthly_stargazer


In [110]:
# Label each column of the importance table with its goal name; `goals` was
# filled in loop order, one entry per goal, matching the column order.
feature_imp.columns = goals

In [112]:
# Persist the per-goal averaged feature importances; the relative path means
# the file lands in the notebook's current working directory.
feature_imp.to_csv('feature_imp.csv')