In [60]:
import pandas as pd
from ortools.sat.python import cp_model

In [61]:
gen_dataset_file = "./generation_meta/dpdf/gen_data.txt"
dataset_stats_file = "./generation_meta/dpdf/dataset_stats.csv"
size_summary_file = "./generation_meta/dpdf/size_summary.csv"
target = 20

In [62]:
def transform_project( x: str, prefix="dpdf" ):
    x = x.lower()
    x = x.replace( " ", "_" )
    x = x.replace( ".", "" )
    return f"{prefix}-{x}"

In [63]:
with open( gen_dataset_file, "r" ) as f:
    # Each line is assumed to be in the format: "[timestamp] project_name"
    gen_projects = [ transform_project( line.strip().split( "] " )[ 1 ] ) for line in f if "] " in line ]
gen_projects

['dpdf-maven-cuke4duke-jump-start',
 'dpdf-protoj',
 'dpdf-sample-projects',
 'dpdf-whartoneventscheduler',
 'dpdf-clusterbench',
 'dpdf-hunapknotifier',
 'dpdf-javadesignpatternexercises',
 'dpdf-comhappyprogtdgotchi',
 'dpdf-3taps-java-client',
 'dpdf-spring-stringtemplate',
 'dpdf-modapi',
 'dpdf-java-patterns',
 'dpdf-basiclti-portlet',
 'dpdf-accent',
 'dpdf-android-rtmp',
 'dpdf-swp1-teamhub',
 'dpdf-ibatisworkshop',
 'dpdf-akka-javaee6-integracia',
 'dpdf-swing-mvc-demo',
 'dpdf-bio_quiz',
 'dpdf-jagtrack',
 'dpdf-uk-ejemplos',
 'dpdf-lord-of-ultima-manager',
 'dpdf-a7b36ass',
 'dpdf-jpatterns',
 'dpdf-turtleplayer',
 'dpdf-smartbiprototype',
 'dpdf-chute-android-components',
 'dpdf-brightroom',
 'dpdf-cream',
 'dpdf-mitfahrzentraleclient',
 'dpdf-java-design-patterns',
 'dpdf-cs230-software-project',
 'dpdf-get-another-label',
 'dpdf-magic-config',
 'dpdf-tapestry-bootstrap',
 'dpdf-client',
 'dpdf-androidbillinglibrary',
 'dpdf-ipage',
 'dpdf-hazelcast-cluster-monitor',
 'dpdf

In [64]:
df_stats = pd.read_csv( dataset_stats_file )
df_stats = df_stats[ df_stats.project != "all" ]
df_stats = df_stats.drop( 'dp_names', axis=1 )
df_stats = df_stats.drop( 'dps_unique', axis=1 )
df_stats = df_stats.drop( 'dps_total', axis=1 )
df_stats = df_stats.drop( 'dps_memento', axis=1 )  # too less instances
df_size = pd.read_csv( size_summary_file )
df_size[ 'project' ] = df_size[ 'project' ].apply( transform_project )

In [65]:
df = pd.merge( df_size, df_stats, on='project' )
df

Unnamed: 0,project,bytes,dps_abstract_factory,dps_adapter,dps_builder,dps_facade,dps_factory_method,dps_observer,dps_singleton,dps_decorator,dps_prototype,dps_proxy,dps_visitor
0,dpdf-maven-cuke4duke-jump-start,3939,0,0,0,1,0,0,0,0,0,0,0
1,dpdf-protoj,16161,0,0,0,0,0,0,0,0,1,0,0
2,dpdf-sample-projects,16360,0,0,0,0,0,0,0,0,0,0,1
3,dpdf-whartoneventscheduler,16634,0,2,0,0,0,0,0,0,0,0,0
4,dpdf-clusterbench,28977,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,dpdf-alfresco,66698607,0,0,0,0,0,0,1,0,0,0,0
209,dpdf-ju4pa,99237265,0,0,0,0,0,1,0,0,0,0,0
210,dpdf-platform,113890053,0,0,0,0,0,34,0,0,0,7,0
211,dpdf-intellij-community,115123824,0,0,0,7,0,0,5,0,0,0,0


In [66]:
df = df[ df[ 'project' ].isin( gen_projects ) ]
df

Unnamed: 0,project,bytes,dps_abstract_factory,dps_adapter,dps_builder,dps_facade,dps_factory_method,dps_observer,dps_singleton,dps_decorator,dps_prototype,dps_proxy,dps_visitor
0,dpdf-maven-cuke4duke-jump-start,3939,0,0,0,1,0,0,0,0,0,0,0
1,dpdf-protoj,16161,0,0,0,0,0,0,0,0,1,0,0
2,dpdf-sample-projects,16360,0,0,0,0,0,0,0,0,0,0,1
3,dpdf-whartoneventscheduler,16634,0,2,0,0,0,0,0,0,0,0,0
4,dpdf-clusterbench,28977,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,dpdf-spring-batch,4424631,0,2,0,0,0,0,0,0,0,0,0
140,dpdf-santuario-java,4493804,0,0,0,0,2,0,0,0,0,3,0
141,dpdf-aseme,4616599,10,0,0,0,0,0,0,0,0,0,0
144,dpdf-vogella,5000382,0,0,0,0,0,3,1,0,0,0,0


In [67]:
df.sum( numeric_only=True, axis=0 )

bytes                   119996359
dps_abstract_factory           43
dps_adapter                    32
dps_builder                    36
dps_facade                     31
dps_factory_method             52
dps_observer                   28
dps_singleton                  34
dps_decorator                  53
dps_prototype                  25
dps_proxy                      33
dps_visitor                    24
dtype: int64

In [68]:
# Identify design pattern columns and fill missing values with zeros
design_patterns = [ col for col in df.columns if col.startswith( "dps_" ) ]
df[ design_patterns ] = df[ design_patterns ].fillna( 0 )

projects = df[ 'project' ].tolist()
bytes_list = df[ 'bytes' ].tolist()

# For each design pattern, get its counts per project as a list
dp_counts = { dp: df[ dp ].tolist() for dp in design_patterns }

# Create the OR-Tools CP-SAT model
model = cp_model.CpModel()

# Create binary decision variables for each project
project_vars = { }
for i, project in enumerate( projects ):
    project_vars[ i ] = model.NewBoolVar( f"select_{i}" )

# Define a weight for the project selection term.
# Adjust weight so that minimizing number of projects is prioritized along with bytes.
# For example, if typical project sizes are around 1e6 bytes, you might choose weight = 1e6.
weight = 1e6  # <-- Adjust this value as needed

# Objective: minimize total bytes plus a penalty for each selected project
model.Minimize(
    sum( bytes_list[ i ] * project_vars[ i ] for i in range( len( projects ) ) ) +
    weight * sum( project_vars[ i ] for i in range( len( projects ) ) )
)

# Constraints: For each design pattern, ensure the selected projects have at least 'target' instances
for dp in design_patterns:
    model.Add( sum( dp_counts[ dp ][ i ] * project_vars[ i ] for i in range( len( projects ) ) ) >= target )

# Solve the model
solver = cp_model.CpSolver()
status = solver.Solve( model )

if status == cp_model.OPTIMAL or status == cp_model.FEASIBLE:
    selected_projects = [ projects[ i ] for i in range( len( projects ) ) if solver.Value( project_vars[ i ] ) == 1 ]
    total_bytes = sum( bytes_list[ i ] for i in range( len( projects ) ) if solver.Value( project_vars[ i ] ) == 1 )
    print( "Selected projects:", selected_projects )
    print( "Total bytes:", total_bytes )

    # Filter the dataframe to only include the selected projects
    selected_df = df[ df[ 'project' ].isin( selected_projects ) ].copy()

    # Calculate totals for each design pattern and the bytes column
    totals = { 'project': "Total", 'bytes': selected_df[ 'bytes' ].sum() }
    for dp in design_patterns:
        totals[ dp ] = selected_df[ dp ].sum()

    # Create a DataFrame for the totals row
    total_df = pd.DataFrame( [ totals ] )

    # Prepend the totals row to the selected projects dataframe
    selected_df = pd.concat( [ total_df, selected_df ], ignore_index=True )

    # Optionally, reorder columns so that 'project' is the first column
    cols = [ 'project' ] + [ c for c in selected_df.columns if c != 'project' ]
    selected_df = selected_df[ cols ]
else:
    print( "No solution found." )

selected_df

Selected projects: ['dpdf-maven-cuke4duke-jump-start', 'dpdf-protoj', 'dpdf-clusterbench', 'dpdf-javadesignpatternexercises', 'dpdf-comhappyprogtdgotchi', 'dpdf-modapi', 'dpdf-java-patterns', 'dpdf-basiclti-portlet', 'dpdf-android-rtmp', 'dpdf-swp1-teamhub', 'dpdf-ibatisworkshop', 'dpdf-swing-mvc-demo', 'dpdf-bio_quiz', 'dpdf-jagtrack', 'dpdf-uk-ejemplos', 'dpdf-lord-of-ultima-manager', 'dpdf-turtleplayer', 'dpdf-smartbiprototype', 'dpdf-brightroom', 'dpdf-cream', 'dpdf-cs230-software-project', 'dpdf-magic-config', 'dpdf-api', 'dpdf-robo-remote', 'dpdf-tapit-android-sdk-source', 'dpdf-samygo-android-remote', 'dpdf-suite', 'dpdf-downloadprovider', 'dpdf-protogrid', 'dpdf-see', 'dpdf-fafdtibb', 'dpdf-console_1', 'dpdf-jamendo-android', 'dpdf-dual-battery-widget', 'dpdf-spring-social-twitter', 'dpdf-java-game-server', 'dpdf-ccw', 'dpdf-cucumber-jvm', 'dpdf-small-team-km-android-client', 'dpdf-bmach', 'dpdf-kahlua2', 'dpdf-smsc-server', 'dpdf-spring-migration-analyzer', 'dpdf-s4', 'dpdf-pl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[ design_patterns ] = df[ design_patterns ].fillna( 0 )


Unnamed: 0,project,bytes,dps_abstract_factory,dps_adapter,dps_builder,dps_facade,dps_factory_method,dps_observer,dps_singleton,dps_decorator,dps_prototype,dps_proxy,dps_visitor
0,Total,27312823,20,20,20,20,20,20,20,29,20,23,20
1,dpdf-maven-cuke4duke-jump-start,3939,0,0,0,1,0,0,0,0,0,0,0
2,dpdf-protoj,16161,0,0,0,0,0,0,0,0,1,0,0
3,dpdf-clusterbench,28977,0,0,0,0,0,0,1,0,0,0,0
4,dpdf-javadesignpatternexercises,40375,0,0,0,0,0,2,0,0,0,0,0
5,dpdf-comhappyprogtdgotchi,42280,0,0,0,0,0,2,0,0,0,0,0
6,dpdf-modapi,53064,0,0,0,0,0,0,0,0,1,0,0
7,dpdf-java-patterns,53707,4,0,0,0,0,0,0,0,0,0,0
8,dpdf-basiclti-portlet,53992,0,9,0,0,1,0,0,0,0,0,0
9,dpdf-android-rtmp,59203,0,0,0,0,0,0,0,0,1,0,0


In [70]:
sorted( selected_projects )

['dpdf-android-rackspacecloud',
 'dpdf-android-rtmp',
 'dpdf-api',
 'dpdf-aseme',
 'dpdf-basiclti-portlet',
 'dpdf-bio_quiz',
 'dpdf-bmach',
 'dpdf-brightroom',
 'dpdf-buildcraft',
 'dpdf-ccw',
 'dpdf-clusterbench',
 'dpdf-comhappyprogtdgotchi',
 'dpdf-computercraft-spout',
 'dpdf-console_1',
 'dpdf-cream',
 'dpdf-cs230-software-project',
 'dpdf-cucumber-jvm',
 'dpdf-de-webapp',
 'dpdf-downloadprovider',
 'dpdf-dual-battery-widget',
 'dpdf-emma',
 'dpdf-fafdtibb',
 'dpdf-flume_1',
 'dpdf-ibatisworkshop',
 'dpdf-jagtrack',
 'dpdf-jamendo-android',
 'dpdf-java-game-server',
 'dpdf-java-patterns',
 'dpdf-javadesignpatternexercises',
 'dpdf-kahlua2',
 'dpdf-lord-of-ultima-manager',
 'dpdf-magic-config',
 'dpdf-maven-cuke4duke-jump-start',
 'dpdf-modapi',
 'dpdf-playorm',
 'dpdf-protogrid',
 'dpdf-protoj',
 'dpdf-robo-remote',
 'dpdf-s4',
 'dpdf-sablecc',
 'dpdf-samygo-android-remote',
 'dpdf-see',
 'dpdf-small-team-km-android-client',
 'dpdf-smartbiprototype',
 'dpdf-smsc-server',
 'dpdf-s