# TF-IDF

In [5]:
#%%writefile modeling_tfidf.py

# Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump

# Load the package metadata table into `packages`.
packages = pd.read_csv("./data/packages_11_may_2022.csv")
# There are 18550 packages

# Fit the TF-IDF vectorizer on the package descriptions.
#   * `vectorizer` is the fitted model (vocabulary + IDF weights).
#   * `vectors` is the resulting sparse document-term matrix, one row per
#     row of `packages`, in the same order.
# Missing descriptions are replaced with empty strings first: TfidfVectorizer
# rejects NaN documents, and filling (rather than dropping) keeps the matrix
# rows aligned with the rows of `packages`.
vectorizer = TfidfVectorizer(lowercase=True, strip_accents='unicode', stop_words='english')
vectors = vectorizer.fit_transform(packages['Description'].fillna(''))

# Persist the TF-IDF matrix as `tf-idf.joblib`.
# NOTE: only the matrix is saved here, not the fitted `vectorizer`.
dump(vectors, 'tf-idf.joblib')

['tf-idf.joblib']

In [63]:
# Render the last rows of `packages` (whatever `tail()` returns by default)
# as an HTML table string, to inspect the available columns.
packages.tail().to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Unnamed: 0</th>\n      <th>Package</th>\n      <th>Version</th>\n      <th>Priority</th>\n      <th>Depends</th>\n      <th>Imports</th>\n      <th>LinkingTo</th>\n      <th>Suggests</th>\n      <th>Enhances</th>\n      <th>License</th>\n      <th>License_is_FOSS</th>\n      <th>License_restricts_use</th>\n      <th>OS_type</th>\n      <th>Archs</th>\n      <th>MD5sum</th>\n      <th>NeedsCompilation</th>\n      <th>File</th>\n      <th>Repository</th>\n      <th>Title</th>\n      <th>Description</th>\n      <th>page.rank</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>18545</th>\n      <td>18546</td>\n      <td>zTree</td>\n      <td>1.0.7</td>\n      <td>NaN</td>\n      <td>R (&gt;= 3.1.0)</td>\n      <td>plyr (&gt;= 1.0)</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>GPL-3 | file LICENSE</td>\n      <td>NaN</td>\n      <td>NaN</td>

In [57]:
#%%writefile results.py

# Imports
import pandas as pd
import numpy as np
from joblib import load

# Load the precomputed TF-IDF document-term matrix from `tf-idf.joblib`.
# NOTE: this is the transformed matrix, not the fitted vectorizer, so queries
# still need a `vectorizer` object available in the session to be encoded.
vectors = load('tf-idf.joblib') 

def get_closest(query, w = 0.5, sort_by = "PageRank", top_n = 20):
    """Return the packages whose descriptions best match a free-text query.

    The query is encoded with the module-level TF-IDF `vectorizer` and
    compared against every row of the module-level `vectors` matrix. The
    `top_n` most similar packages are kept and returned ordered by `sort_by`.

    Args:
        query: Free-text search string.
        w: Currently unused. Kept for backward compatibility; it was meant
            as the weight of a weighted-sum alternative to the "Mix" score.
        sort_by: Output column to sort on, in descending order. One of
            "Package_name", "Version", "Title", "Description", "PageRank",
            "PW_Similarity" or "Mix".
        top_n: Number of best-matching packages to keep (default 20, the
            previously hard-coded value).

    Returns:
        A DataFrame with one row per selected package and the columns listed
        under `sort_by`. "PW_Similarity" is the TF-IDF dot product between
        the query and the package description, and "Mix" is
        PageRank * PW_Similarity.

    Raises:
        KeyError: If `sort_by` is not one of the output columns.
    """
    # Fit only when the vectorizer has no vocabulary yet. Refitting on the
    # full corpus for every query rebuilds the same model each time and is
    # by far the most expensive step of a lookup.
    if not hasattr(vectorizer, "vocabulary_"):
        vectorizer.fit(packages['Description'])
    query_vector = vectorizer.transform([query])  # 1 x vocabulary-size sparse row

    # Similarity of the query against every package, as a dense 1-D array.
    # `@` is used so this is a matrix product for both scipy sparse matrices
    # and sparse arrays. Computed once and reused below.
    similarity = (query_vector @ vectors.T).toarray()[0]

    # Positions of the `top_n` highest similarities, best first.
    top_positions = (-similarity).argsort()[:top_n]

    columns = [
        packages["Package"],
        packages["Version"],
        packages["Title"],
        packages["Description"],
        packages["page.rank"],
        # Row i of `vectors` corresponds to the i-th row of `packages`, so the
        # scores are attached positionally by reusing the packages index.
        pd.Series(similarity, index=packages.index),
    ]
    headers = ["Package_name", "Version", "Title", "Description", "PageRank", "PW_Similarity"]

    df = pd.concat(columns, axis=1, keys=headers)
    # The two scores are multiplied rather than summed: a (weighted) sum
    # would require standardizing both columns first.
    df["Mix"] = df["PageRank"] * df["PW_Similarity"]

    return df.iloc[top_positions].sort_values(by=[sort_by], ascending=False)

In [58]:
#%%writefile testing.py
#from results import get_closest
# Example query, ranked by the combined "Mix" score (PageRank * similarity).
# A sum of the two scores would require standardizing both columns first,
# so the product is used instead.
get_closest("map visualization graph network", sort_by = "Mix")

Unnamed: 0,Package_name,Version,Title,Description,PageRank,PW_Similarity,Mix
7347,igraph,1.3.1,Network Analysis and Visualization,Routines for simple graphs and network analysi...,0.003454,0.29246,0.00101
15721,sna,2.6,Tools for Social Network Analysis,"A range of tools for social network analysis, ...",0.000294,0.477737,0.00014
10475,network,1.17.1,Classes for Relational Data,Tools to create and modify network objects. T...,0.000416,0.296832,0.000124
16104,spNetwork,0.4.3.1,Spatial Analysis on Network,Perform spatial analysis on network.\n Impl...,9.3e-05,0.286631,2.7e-05
8933,mapStats,2.4,Geographic Display of Survey Data Statistics,Automated calculation and visualization of sur...,5.1e-05,0.321006,1.6e-05
15805,SOMEnv,1.1.2,SOM Algorithm for the Analysis of Multivariate...,Analysis of multivariate environmental high fr...,4.8e-05,0.271464,1.3e-05
15399,sigmajs,0.1.5,Interface to 'Sigma.js' Graph Visualization Li...,Interface to 'sigma.js' graph visualization li...,5.2e-05,0.243395,1.3e-05
10487,networkreporting,0.1.1,Tools for using Network Reporting Estimators,Functions useful\n for producing estimates ...,4.5e-05,0.235585,1.1e-05
1255,bingat,1.3,Binary Graph Analysis Tools,Tools to analyze binary graph objects.,3.8e-05,0.269041,1e-05
10455,netmap,0.1.1,Represent Network Objects on a Map,Represent 'network' or 'igraph' objects whose ...,3.7e-05,0.25596,9e-06


In [67]:
# Scratch cell: bind a one-character string to `a`.
a = "a"

In [68]:
# Scratch cell: compare `a` with the empty string.
a == ""

False

# Word2Vec