In [1]:
# Load libraries
# Remember to install intellikit first, e.g. with: %pip install intellikit
import intellikit #importing the installled intellikit library to load the functions
import pandas as pd #importing pandas for loading and preparing the dataset

In [3]:
# Build a similarity table from a freshly loaded copy of the dataset. The CSV on
# disk is left untouched, so it can still serve as the original casebase later.

# Columns grouped by data type; the two lists are passed separately to
# intellikit.calculate_similarity so each column is compared according to its type.
numerical_columns = ['price', 'year', 'miles']
text_columns = [
    'manufacturer', 'make', 'fuel', 'title_status',
    'transmission', 'drive', 'type', 'paint_color',
]

# The query case: one car described with the same columns as the dataset.
query_case = {
    'price': 10000,
    'year': 2004,
    'manufacturer': 'ford',
    'make': 'combo',
    'fuel': 'gas',
    'miles': 150000,
    'title_status': 'clean',
    'transmission': 'manual',
    'drive': '4wd',
    'type': 'van',
    'paint_color': 'black',
}
# Wrap every value in a one-element list so the frame has exactly one row.
query = pd.DataFrame({column: [value] for column, value in query_case.items()})

# Run the dataset first, then the query, through the same preprocessing step.
cars_loaded = pd.read_csv('datasets/cars-1k.csv')
cars_prepared = intellikit.preprocess_data(cars_loaded)
query = intellikit.preprocess_data(query)

# `df` holds the similarity table from here on (it is reused further down).
df = intellikit.calculate_similarity(cars_prepared, query, numerical_columns, text_columns)

# Persist the similarity table as a new CSV file in the working directory.
df.to_csv('similarity_output.csv', index=False)

In [4]:
# Settings for the retrieval step.
similarity_file_path = 'similarity_output.csv'  # CSV written by the previous cell
similarity_column = 'similarity'                # column holding the similarity value
top_k = 7                                       # number of top results to show; edit here

# Load the generated similarity dataset back from disk and print the top results.
df_similarity = pd.read_csv(similarity_file_path)
top_similar_found = intellikit.retrieve_top_similar_results(df_similarity, similarity_column, top_k)
print(top_similar_found)

        price      year  manufacturer  make  fuel     miles  title_status  \
426  0.000171  0.333333             1     0     1  0.000028             1   
983  0.000158  0.333333             1     0     1  0.000019             1   
725  0.000459  0.250000             1     0     1  0.000006             1   
104  0.000938  0.200000             1     0     1  0.000008             1   
436  0.000110  0.200000             1     0     1  0.000006             1   
526  0.001437  0.142857             1     0     1  0.000024             1   
484  0.000122  1.000000             0     0     1  0.000250             1   

     transmission  drive  type  paint_color  similarity  
426             1      1     1            1    0.666685  
983             1      1     1            1    0.666683  
725             1      1     1            1    0.659133  
104             1      1     1            1    0.654631  
436             1      1     1            1    0.654556  
526             1      1     1     

In [5]:
# Reload the untouched dataset from disk; it serves as the original casebase.
casebase_file_path = 'datasets/cars-1k.csv'
casebase = pd.read_csv(casebase_file_path)

In [6]:
# Alternatively, retrieve the final list of results combined with the original
# casebase. Here `df` is the adjusted case base (the similarity table) and
# `casebase` is the original dataset as loaded from disk.
# NOTE(review): in the printed output the casebase columns show audi/diesel/compact
# rows while simfuel and simtype are 1 for a gas/van query — confirm that the
# similarity rows are matched to the correct casebase rows (possibly related to
# `order=False`; verify against the intellikit documentation).
result = intellikit.retrieve_final_results(
    df,
    casebase,
    similarity_column,
    top_k=5,
    order=False,
)
print(result)

   price  year manufacturer        make    fuel   miles title_status  \
1   4145  2002         audi          a4  diesel  186215        clean   
4   3663  2002         audi          a3  diesel  202000        clean   
3  12176  2007         audi          q7  diesel  311479        clean   
0  11065  2008         audi  a6-allroad  diesel  282000        clean   
2    921  2008         audi          a4  diesel  319002        clean   

  transmission drive     type  ... simmake  simfuel  simmiles  \
1       manual   4wd  compact  ...       0        1  0.000028   
4       manual   4wd  compact  ...       0        1  0.000019   
3       manual   4wd  compact  ...       0        1  0.000006   
0       manual   4wd  compact  ...       0        1  0.000008   
2       manual   4wd  compact  ...       0        1  0.000006   

   simtitle_status  simtransmission  simdrive  simtype  simpaint_color  \
1                1                1         1        1               1   
4                1          