# Filtered list of important words based on TF-IDF ranking and the `min_df` / `max_df` parameters of `TfidfVectorizer`

In [4]:
import os

import numpy as np
import pandas as pd
import pymongo
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
######## Importing "preprocess" collection from mongoDB ####

In [5]:
# Load the "preprocess" collection of the CORD19 database into a DataFrame.
# The connection string (which carries the username and password) is read from
# the MONGODB_URI environment variable so that credentials never live in the
# notebook source or its saved outputs.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string for the CORD19 cluster before running this cell."
    )
client = pymongo.MongoClient(mongo_uri)
db = client.CORD19
# Not the last expression of the cell, so the returned names are not displayed.
db.list_collection_names()
a_coll_1017 = db.preprocess
# find() with no filter returns every document; all of them are materialised in memory.
a_1017 = pd.DataFrame(list(a_coll_1017.find()))

In [7]:
# Quick look at the loaded frame. Only info() produces output here: head() is
# not the last expression of the cell, so its result is not rendered.
a_1017.head()
a_1017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57921 entries, 0 to 57920
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   _id             57921 non-null  object        
 1   level_0         57921 non-null  int64         
 2   index           57921 non-null  int64         
 3   abstract        57921 non-null  object        
 4   authors         57921 non-null  object        
 5   journal         57921 non-null  object        
 6   license         57921 non-null  object        
 7   publish_time    57921 non-null  datetime64[ns]
 8   title           57921 non-null  object        
 9   language        57921 non-null  object        
 10  word_count      57921 non-null  int64         
 11  char_count      57921 non-null  int64         
 12  sent_count      57921 non-null  int64         
 13  avg_word_len    57921 non-null  float64       
 14  stopwords       57921 non-null  int64         
 15  cl

In [8]:
####### abstract_string_conversion #####
def convert_list_to_string(list, seperator=' '):
    """Join the strings in ``list`` into a single string.

    Parameters
    ----------
    list : iterable of str
        Tokens to concatenate (the name shadows the builtin inside this
        function only; it is kept for compatibility with keyword callers).
    seperator : str, default ' '
        Text inserted between consecutive tokens.

    Returns
    -------
    str
        The tokens joined by ``seperator``; an empty string for empty input.
    """
    joined = seperator.join(list)
    return joined
# Collapse each abstract's token list into one space-separated string.
a_1017['ab_string'] = a_1017['cleanAbtstract'].apply(convert_list_to_string)

In [9]:
# List of abstracts: one joined abstract string per row, in row order.
w2 = a_1017['ab_string'].tolist()

# High Frequency TF-IDF Words ( Post initial preprocessing and cleanup)

In [12]:
##### tf-idf calc from sklearn max_df=.50, min_df=.25 ###
# Float min_df / max_df are document-frequency proportions: only terms found in
# at least 25% and at most 50% of the abstracts are kept.

vectorizer = TfidfVectorizer(max_df=.50, min_df=.25, stop_words=None, use_idf=True, norm=None)
vectors = vectorizer.fit_transform(w2)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
sums = vectors.sum(axis=0) #sum tf-idf for each term throughout

#connects term and sum freq
data = [(term, sums[0, col]) for col, term in enumerate(feature_names)]

##### Output: tf-idf sorted descending, at most 50 terms (fewer when fewer terms pass the df filter)

ranking = pd.DataFrame(data, columns=['term','rank'])
print(ranking.sort_values('rank', ascending=False).head(50))

           term           rank
14      sarscov  114642.131032
7        infect   96980.206139
0          case   91525.154771
17          use   83439.447491
16        studi   82755.186967
10       pandem   80545.510483
15        sever   79833.063131
5        health   75675.105791
1        clinic   71523.932391
13       result   65370.330207
3          data   58794.198501
12  respiratori   57363.643948
11       report   56748.136069
6        includ   54516.022902
4        effect   53276.887718
8           may   52323.752786
9        method   47931.056963
2       conclus   39746.847315


- As seen above, the notable words in this list are "sarscov" and "respiratori" (stemmed forms), indicating the relation of COVID to these terms and that COVID is mainly a respiratory disease.
- Note that this list does not contain other words one would expect to be even more frequent, such as "covid", "coronavirus", "patient", etc., either because they were already filtered out in preprocessing
- or because the `min_df` and `max_df` parameters of the TF-IDF vectorizer are set to keep only terms that appear in at least 25% and at most 50% of the documents.
- So even though the heading says "High Frequency", this is actually a filtered list of more meaningful words whose document frequency lies between 25% and 50% only.

# Medium Frequency TF-IDF words ( Post initial preprocessing and cleanup)

In [14]:
##### tf-idf calc from sklearn max_df=.25, min_df=.10 ###
# Float min_df / max_df are document-frequency proportions: only terms found in
# at least 10% and at most 25% of the abstracts are kept.

vectorizer = TfidfVectorizer(max_df=.25, min_df=.10, stop_words=None, use_idf=True, norm=None)
vectors = vectorizer.fit_transform(w2)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
sums = vectors.sum(axis=0) #sum tf-idf for each term throughout

#connects term and sum freq
data = [(term, sums[0, col]) for col, term in enumerate(feature_names)]

##### Output: tf-idf sorted descending top 25

ranking = pd.DataFrame(data, columns=['term','rank'])
print(ranking.sort_values('rank', ascending=False).head(25))

          term          rank
118       test  69455.338458
52      hospit  63346.997991
105       risk  63004.132501
16        care  62801.121377
73       model  58368.422630
122  treatment  55886.237337
30         day  55512.007733
126      virus  55040.111518
12      associ  52321.073044
58      increa  51372.844623
48       group  51008.226685
115    symptom  50677.632025
119       time  48238.207948
108     signif  47819.754817
78      number  47597.912646
97        rate  47270.647929
71      measur  45575.842860
1         acut  45146.865558
34     develop  45037.748402
88       posit  44759.570538
35      differ  44585.347394
91     present  44362.120884
94      provid  44090.735397
82    outbreak  44035.302137
27     countri  43260.749374


- Interestingly, as seen above, the most notable word in this list is "test", which matches the heavy emphasis generally placed on testing to control the coronavirus.
- The above list is based on words that appear in 10% to 25% of the documents.

# Low Frequency / Most Important TF-IDF words ( Post initial preprocessing and cleanup)

In [17]:
##### tf-idf calc from sklearn max_df=.01, min_df=.0001 ###
# Float min_df / max_df are document-frequency proportions: only terms found in
# at least 0.01% and at most 1% of the abstracts are kept.

vectorizer = TfidfVectorizer(max_df=.01, min_df=.0001, stop_words=None, use_idf=True, norm=None)
vectors = vectorizer.fit_transform(w2)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
sums = vectors.sum(axis=0) #sum tf-idf for each term throughout

#connects term and sum freq
data = [(term, sums[0, col]) for col, term in enumerate(feature_names)]

##### Output: tf-idf sorted descending top 50

ranking = pd.DataFrame(data, columns=['term','rank'])
print(ranking.sort_values('rank', ascending=False).head(50))

                 term          rank
3009               de  11109.688327
5281              hcq   9400.837220
5289             hcws   9255.124813
11980           sleep   8953.559133
3696               ed   8646.216369
8370           neonat   8622.283205
3146           dental   8225.223836
5659               hr   7878.742953
9948           pollut   7831.321859
2630           counti   7658.397115
10788          recipi   7364.630511
13184     tocilizumab   7192.569300
9893               pm   7174.235803
4021           epitop   6934.654800
10692             rbd   6853.277005
12855      telehealth   6690.017065
8872              nsp   6576.977265
11256              rr   6516.621467
2603   corticosteroid   6465.988314
13959         variant   6351.195289
13170          tmprss   6316.278782
317               aki   6231.120296
14141         vitamin   6062.090747
7693               mg   6051.544232
10885       rehabilit   6034.005133
6841               la   6021.769205
3474            donor   6008

- The above list is based on terms/words appearing in 0.01% to 1% of the documents (`min_df=.0001`, `max_df=.01`).
- Although this frequency seems very low, the dataset has around 58,000 documents, so it is still high enough to surface the more meaningful, targeted terms we are looking for, such as the names of the most-used medicines.


#### The top 50 highest-ranking TF-IDF words from this list do surface names of medicines such as "hcq" and "tocilizumab", both arthritis drugs that were being tried on patients with severe COVID-19 at the time the dataset was collected