In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

### Data wrangling

In [103]:
df = pd.read_csv('../data/data.tsv', sep='\t',error_bad_lines=False)

b'Skipping line 83032: expected 6 fields, saw 7\n'
b'Skipping line 154657: expected 6 fields, saw 7\n'
b'Skipping line 323916: expected 6 fields, saw 7\n'
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [104]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0.0
1,402555,536040,536041.0,How do I control my horny emotions?,How do you control your horniness?,1.0
2,360472,364011,490273.0,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0.0
3,150662,155721,7256.0,What can one do after MBBS?,What do i do after my MBBS ?,1.0
4,183004,279958,279959.0,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0.0


In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363192 entries, 0 to 363191
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            363192 non-null  object 
 1   qid1          363192 non-null  object 
 2   qid2          363185 non-null  float64
 3   question1     363181 non-null  object 
 4   question2     363180 non-null  object 
 5   is_duplicate  363180 non-null  float64
dtypes: float64(2), object(4)
memory usage: 16.6+ MB


In [106]:
#remove rows with missing entries in queries, responses or labels
df = df.dropna(axis=0, subset=('question1','question2','is_duplicate' ))

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 363177 entries, 0 to 363191
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            363177 non-null  object 
 1   qid1          363177 non-null  object 
 2   qid2          363177 non-null  float64
 3   question1     363177 non-null  object 
 4   question2     363177 non-null  object 
 5   is_duplicate  363177 non-null  float64
dtypes: float64(2), object(4)
memory usage: 19.4+ MB


### Text preprocessing

In [108]:
#lower casing
def lower_case(text):
    return text.lower()


#remove numbers
def remove_numbers(text):
    output = re.sub(r'\d+', '', text)
    return output

# remove punctuation
import string
def remove_punctuation(text):
    text_p = "".join([char for char in text if char not in string.punctuation])
    return text_p

#tokenize text

import nltk
nltk.download('punkt')
from nltk import word_tokenize

def tokenize(text):
    words = word_tokenize(text)
    return words

#remove stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stopwords(text):
    filtered_words = [word for word in text if word not in stop_words]
    return filtered_words


#remove single character tokens

def remove_single_characters(text):
    filtered_words = [word for word in text if len(word) > 1]
    return filtered_words

#stemming
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    stemmed = [stemmer.stem(word) for word in text]
    return stemmed

# Lemmatize with POS Tag
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    lemmatized = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in text]
    return lemmatized

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [109]:
class Preprocessor():
    def __init__(self):
        pass
    
    def preprocess(self,df_column, steps):
        self.df_column = df_column
        self.steps = steps
    
        if 'lower_case' in self.steps:
            self.df_column = self.df_column.apply(lambda x: lower_case(x))
        
        if 'remove_numbers' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_numbers(x))
        
        if 'remove_punctuation' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_punctuation(x))           
        
        if 'tokenize' in self.steps:
            self.df_column = self.df_column.apply(lambda x: tokenize(x))
        
        if 'stopwords' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_stopwords(x))
            
        if 'single_characters' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_single_characters(x))
        
        if 'stemming' in self.steps:
            self.df_column = self.df_column.apply(lambda x: stem_words(x))
            
        if 'lemmatize' in self.steps:
            self.df_column = self.df_column.apply(lambda x: lemmatize(x))    
            
        return self.df_column

In [110]:
unprocessed_df = df.copy()

In [111]:
steps = ['lower_case','remove_numbers','remove_punctuation',
        'tokenize','stopwords','single_characters','stemming']
processor = Preprocessor()
df['question1'] = processor.preprocess(df['question1'] ,steps)

In [112]:
df['question2'] = processor.preprocess(df['question2'] ,steps)

In [113]:
df.reset_index(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363177 entries, 0 to 363176
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   index         363177 non-null  int64  
 1   id            363177 non-null  object 
 2   qid1          363177 non-null  object 
 3   qid2          363177 non-null  float64
 4   question1     363177 non-null  object 
 5   question2     363177 non-null  object 
 6   is_duplicate  363177 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 19.4+ MB


### Sentence Matching using TF-IDF weighting - implementation from scratch

In [114]:
#create a vocabulary list from tokens in responses
q2vocab = df['question2'].tolist()
q2vocab = [item for sublist in q2vocab for item in sublist]
q2vocab = list(set(q2vocab))
len(q2vocab)

50010

In [115]:
# create dictionary - word in vocabulary: list of response ids (q2id) it is found in
Word_Doc_ids = {}
for word in q2vocab:
    Word_Doc_ids[word] = []
for i in range(len(df)):
    for word in df.loc[i,'question2']:
        Word_Doc_ids[word].append(df.loc[i,'qid2'])

#create set to drop duplicate values of doc_ids
for word in q2vocab:
    Word_Doc_ids[word] = list(set(Word_Doc_ids[word]))
    
    
list(Word_Doc_ids.items())[3:5]

[('dropoff', [396539.0]), ('upris', [354113.0, 363877.0, 1005.0, 414734.0])]

In [116]:
#create Inverse Document_frequency (IDF) dictionary - word in vocabulary : IDF
#use idf formula as per sklearn implementation - for smoothing add 1 to numerator and denominator

N = df['qid2'].nunique()
print('Number of unique responses: {}'.format(N))

IDF = {}
for word in Word_Doc_ids.keys():
    IDF[word] = np.log((1+N/(len(Word_Doc_ids[word]) +1))+1)
list(IDF.items())[:5]    

Number of unique responses: 273121


[('i∧', 11.824537664272047),
 ('remedi', 7.468955891006257),
 ('assuar', 11.824537664272047),
 ('dropoff', 11.824537664272047),
 ('upris', 10.908268900119985)]

In [117]:
#dictionary of list of words in each response - q2id : words
doc_words = dict(zip(df['qid2'],df['question2']))

list(doc_words.items())[:2]

[(213222.0, ['level', 'preprat', 'enough', 'exam', 'jlpt']),
 (536041.0, ['control', 'horni'])]

In [118]:
#define function to calculate  norm - to apply 'l1' or 'l2' normalization for calculation of term frequency

def norm(list_of_words, normalization_method = 'l1'):
    word_vec = np.array([list_of_words.count(word) for word in set(list_of_words)])
    if normalization_method == 'l1':
        norm = np.linalg.norm(word_vec,ord=1)
    else:
        norm = np.linalg.norm(word_vec, ord=2)
    return norm


In [119]:
# Create a dictionary where keys are words in the vocabulary, 
#values are dictionary of key:values as q2id : term frequency for corresponding word
#nested dictionary - word: {{q2id1: tf}, {q2id2: tf}, ....}
#use 'l1' norm to normalize term frequency

inverted_tf_l1 = {}
for word in q2vocab:
    word_tf = {}
    for doc in Word_Doc_ids[word]:
        tf = doc_words[doc].count(word)/norm(doc_words[doc],'l1')
        word_tf[doc] = tf
    inverted_tf_l1[word] = word_tf   

In [120]:
#use 'l2' norm to normalize term frequency

inverted_tf_l2 = {}
for word in q2vocab:
    word_tf = {}
    for doc in Word_Doc_ids[word]:
        tf = doc_words[doc].count(word)/norm(doc_words[doc],'l2')
        word_tf[doc] = tf
    inverted_tf_l2[word] = word_tf 

#### Creating inverted file for retreiving tfidf values - term frequencies normalized by either 'l1' or 'l2' norm

In [121]:
# Create a dictionary where keys are words in the vocab, 
#values are dictinary of key:values as doc_id:tfidf for corresponding word
#nested dictionary - word: {{doc_id1: tf}, {doc_id2: tf}, ....}

inverted_tfidf_l1 = {}
for word in q2vocab:
    doc_tfidfs = []
    for tup in list(inverted_tf_l1[word].items()):
        tup = list(tup)
        tfidf = tup[1]*IDF[word]
        tup[1] = tfidf
        doc_tfidfs.append(tup)
    inverted_tfidf_l1[word] = doc_tfidfs  
    
    
inverted_tfidf_l2 = {}
for word in q2vocab:
    doc_tfidfs = []
    for tup in list(inverted_tf_l2[word].items()):
        tup = list(tup)
        tfidf = tup[1]*IDF[word]
        tup[1] = tfidf
        doc_tfidfs.append(tup)
    inverted_tfidf_l2[word] = doc_tfidfs     

In [122]:
for i in range(len(inverted_tfidf_l1)):
    inverted_tfidf_l1[q2vocab[i]] = {x[0]:x[1] for x in inverted_tfidf_l1[q2vocab[i]]}
    
for i in range(len(inverted_tfidf_l2)):
    inverted_tfidf_l2[q2vocab[i]] = {x[0]:x[1] for x in inverted_tfidf_l2[q2vocab[i]]}    

In [123]:
list(inverted_tfidf_l1.items())[3:5]

[('dropoff', {396539.0: 1.9707562773786744}),
 ('upris',
  {354113.0: 1.818044816686664,
   363877.0: 1.3635336125149982,
   1005.0: 2.1816537800239972,
   414734.0: 1.0908268900119986})]

In [124]:
#create dataframe of top 100 queries that have label = 1
queries = df[df['is_duplicate'] == 1][:100]
queries = queries[['qid1','question1','qid2','question2', 'is_duplicate']]
queries = queries.reset_index(drop=True)
queries.head()

Unnamed: 0,qid1,question1,qid2,question2,is_duplicate
0,536040,"[control, horni, emot]",536041.0,"[control, horni]",1.0
1,155721,"[one, mbb]",7256.0,[mbb],1.0
2,147570,"[best, self, help, book, read, chang, life]",787.0,"[top, self, help, book, read]",1.0
3,71243,"[hillari, clinton, polici, toward, india, beco...",177376.0,"[hilari, clinton, polici, toward, india, becom...",1.0
4,22332,"[best, book, studi, tensor, gener, rel, basic]",22333.0,"[best, book, tensor, calculu]",1.0


In [125]:
#define function to retrieve matching responses against query
from collections import Counter
def ranked_docs(query, inverted_file):
    counter_objects = []
    for word in query:
        if word in inverted_file.keys():
            counter_objects.append(Counter(inverted_file[word]))
            combined = sum(counter_objects,Counter())
            ranked_docs = sorted(combined, key = combined.get, reverse=True)
    return ranked_docs 


In [126]:
#use inverted_tfidf_l1 or inverted_tfidf_l2 to retrieve top5 and top2 matching responses against queries
for i in range(len(queries)):
    query = queries.loc[i,'question1']
    ranked = ranked_docs(query, inverted_tfidf_l1)
    if queries.loc[i,'qid2'] in ranked[:2]:
        queries.loc[i,'top5_pred_l1'] = 1.0
        queries.loc[i,'top2_pred_l1'] = 1.0
    elif queries.loc[i,'qid2'] in ranked[:5]:
        queries.loc[i,'top5_pred_l1'] = 1.0
        queries.loc[i,'top2_pred_l1'] = 0.0
    else:
        queries.loc[i,'top5_pred_l1'] = 0.0
        queries.loc[i,'top2_pred_l1'] = 0.0
    

for i in range(len(queries)):
    query = queries.loc[i,'question1']
    ranked = ranked_docs(query, inverted_tfidf_l2)
    if queries.loc[i,'qid2'] in ranked[:2]:
        queries.loc[i,'top5_pred_l2'] = 1.0
        queries.loc[i,'top2_pred_l2'] = 1.0
    elif queries.loc[i,'qid2'] in ranked[:5]:
        queries.loc[i,'top5_pred_l2'] = 1.0
        queries.loc[i,'top2_pred_l2'] = 0.0
    else:
        queries.loc[i,'top5_pred_l2'] = 0.0
        queries.loc[i,'top2_pred_l2'] = 0.0    
queries.head()

Unnamed: 0,qid1,question1,qid2,question2,is_duplicate,top5_pred_l1,top2_pred_l1,top5_pred_l2,top2_pred_l2
0,536040,"[control, horni, emot]",536041.0,"[control, horni]",1.0,1.0,0.0,1.0,1.0
1,155721,"[one, mbb]",7256.0,[mbb],1.0,1.0,0.0,1.0,0.0
2,147570,"[best, self, help, book, read, chang, life]",787.0,"[top, self, help, book, read]",1.0,0.0,0.0,1.0,1.0
3,71243,"[hillari, clinton, polici, toward, india, beco...",177376.0,"[hilari, clinton, polici, toward, india, becom...",1.0,0.0,0.0,1.0,0.0
4,22332,"[best, book, studi, tensor, gener, rel, basic]",22333.0,"[best, book, tensor, calculu]",1.0,0.0,0.0,1.0,0.0


In [137]:
unprocessed_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0.0
1,402555,536040,536041.0,How do I control my horny emotions?,How do you control your horniness?,1.0
2,360472,364011,490273.0,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0.0
3,150662,155721,7256.0,What can one do after MBBS?,What do i do after my MBBS ?,1.0
4,183004,279958,279959.0,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0.0


In [144]:
def retreive_unprocessed_question(question_qid, question_type):
    if question_type == 'query':
        unprocessed_question = unprocessed_df.loc[unprocessed_df['qid1']==question_qid, 'question1'].iloc[0]
    else:
        unprocessed_question = unprocessed_df.loc[unprocessed_df['qid2']==question_qid, 'question2'].iloc[0]
    
    return unprocessed_question

In [178]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [147]:
response_index = 1
#query = queries.loc[query_index,'question1' ]
response_qid = queries.loc[query_index,'qid2']
unprocessed_query = retreive_unprocessed_question(response_qid,'response')
unprocessed_query

'What do i do after my MBBS ?'

In [190]:
#define function to view matching responses
def view_ranked_responses(query_index,number_of_responses, normalization_method):
    
    query = queries.loc[query_index,'question1' ]
    query_qid = queries.loc[query_index,'qid1']
    actual_response_id = queries.loc[query_index,'qid2']
    
    
    if normalization_method == 'l1':
        top_responses = ranked_docs(query, inverted_tfidf_l1)[:number_of_responses]
    else:
        top_responses = ranked_docs(query, inverted_tfidf_l2)[:number_of_responses]
        
    
        
        
    print(color.BOLD+'Sentence matching with TF-IDF method - implementation from scratch...'+color.END) 
    print()
    
    print(color.BOLD+'Query'+color.END)
    print(color.RED+retreive_unprocessed_question(query_qid,'query')+color.END)
    print()
    print(color.BOLD+'Ground truth (actual) matching response'+color.END)
    print(color.CYAN +retreive_unprocessed_question(actual_response_id,'response')+color.END)
    print()
    print(color.BOLD+'Top {} matched responses by TFIDF method, {} normalization'.format(number_of_responses,normalization_method)+color.END)
    for response_id in top_responses:
        if response_id == actual_response_id:
            print(color.CYAN+retreive_unprocessed_question(response_id,'response')+color.END)
        else:
            print(retreive_unprocessed_question(response_id,'response'))
    
    

In [197]:
view_ranked_responses(76,5,'l1')

[1mSentence matching with TF-IDF method - implementation from scratch...[0m

[1mQuery[0m
[91mCould I buy a civilianized version of a fighter jet?[0m

[1mGround truth (actual) matching response[0m
[96mCan I buy a fighter jet?[0m

[1mTop 5 matched responses by TFIDF method, l1 normalization[0m
Who are the z fighters?
[96mCan I buy a fighter jet?[0m
Can a civilian buy a fighter jet if he is rich enough?
What should I buy with $800?
How do I buy Jets from USa?


In [198]:
from sklearn.metrics import accuracy_score
top2_l1 = accuracy_score(queries['is_duplicate'],queries['top2_pred_l1'])
top5_l1 = accuracy_score(queries['is_duplicate'],queries['top5_pred_l1'])
print(' L1-norm TF-IDF - accuracy of retriving correct response in top 2 ranked responses : {} %'.format(top2_l1*100))
print(' L1-norm TF-IDF - accuracy of retriving correct response in top 5 ranked responses : {} %'.format(top5_l1*100))


 L1-norm TF-IDF - accuracy of retriving correct response in top 2 ranked responses : 16.0 %
 L1-norm TF-IDF - accuracy of retriving correct response in top 5 ranked responses : 30.0 %


In [199]:
top2_l2 = accuracy_score(queries['is_duplicate'],queries['top2_pred_l2'])
top5_l2 = accuracy_score(queries['is_duplicate'],queries['top5_pred_l2'])
print(' L2-norm TF-IDF - accuracy of retriving correct response in top 2 ranked responses : {} %'.format(top2_l2*100))
print(' L2-norm TF-IDF - accuracy of retriving correct response in top 5 ranked responses : {:.1f} %'.format(top5_l2*100))



 L2-norm TF-IDF - accuracy of retriving correct response in top 2 ranked responses : 47.0 %
 L2-norm TF-IDF - accuracy of retriving correct response in top 5 ranked responses : 58.0 %


### Sentence matching with TFIDF -  sklearn implementation

In [211]:
corpus = df[['qid2','question2']]

In [212]:
corpus.drop_duplicates(subset='qid2', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus.drop_duplicates(subset='qid2', inplace=True)


In [213]:
#sklearn tfidf - l1 normalization method
from sklearn.feature_extraction.text import TfidfVectorizer
train_questions = list(corpus['question2'])
query_questions = list(queries['question1'])

def dummy(text):
    return text

tf1 = TfidfVectorizer(tokenizer=dummy,preprocessor=dummy, lowercase=False, norm='l1')
X_train = tf1.fit_transform(train_questions)
X_test = tf1.transform(query_questions)

matched = np.array((X_test@X_train.T).todense())
matched_sorted = np.zeros(matched.shape, dtype='int')

In [214]:
sktop2_l1 = []
sktop5_l1 = []

for i in range(matched.shape[0]):
    matched_sorted[i,:] = matched[i,:].argsort()[::-1]
    top2ind = matched_sorted[i,:2]
    top5ind = matched_sorted[i,:5]
    top2docs = [corpus.iloc[c,0] for c in top2ind]
    top5docs = [corpus.iloc[c,0] for c in top5ind]
    if queries.iloc[i,2] in top2docs:
        sktop2_l1.append(1)
    else:
        sktop2_l1.append(0)
    if queries.iloc[i,2] in top5docs:
        sktop5_l1.append(1)
    else:
        sktop5_l1.append(0)

queries['sktop2_l1'] = sktop2_l1
queries['sktop5_l1'] = sktop5_l1

In [215]:
tf2 = TfidfVectorizer(tokenizer=dummy,preprocessor=dummy, lowercase=False, norm='l2')
X_train2 = tf2.fit_transform(train_questions)
X_test2 = tf2.transform(query_questions)
matched2 = np.array((X_test2@X_train2.T).todense())
matched_sorted2 = np.zeros(matched2.shape, dtype='int')

In [216]:
sktop2_l2 = []
sktop5_l2 = []

for i in range(matched2.shape[0]):
    matched_sorted2[i,:] = matched2[i,:].argsort()[::-1]
    top2ind = matched_sorted2[i,:2]
    top5ind = matched_sorted2[i,:5]
    top2docs = [corpus.iloc[c,0] for c in top2ind]
    top5docs = [corpus.iloc[c,0] for c in top5ind]
    if queries.iloc[i,2] in top2docs:
        sktop2_l2.append(1)
    else:
        sktop2_l2.append(0)
    if queries.iloc[i,2] in top5docs:
        sktop5_l2.append(1)
    else:
        sktop5_l2.append(0)

queries['sktop2_l2'] = sktop2_l2
queries['sktop5_l2'] = sktop5_l2

In [219]:
sktop2_l1 = accuracy_score(queries['is_duplicate'], queries['sktop2_l1'])
sktop5_l1 = accuracy_score(queries['is_duplicate'], queries['sktop5_l1'])
print('Sklearn - Accuracy of retreiving correct response in top 2 matched responses - l1 normalization: {}'.format(sktop2_l1*100))
print('Sklearn - Accuracy of retreiving correct response in top 5 matched responses - l1 normalization: {}'.format(sktop5_l1*100))

Sklearn - Accuracy of retreiving correct response in top 2 matched responses - l1 normalization: 16.0
Sklearn - Accuracy of retreiving correct response in top 5 matched responses - l1 normalization: 32.0


In [221]:
sktop2_l2 = accuracy_score(queries['is_duplicate'], queries['sktop2_l2'])
sktop5_l2 = accuracy_score(queries['is_duplicate'], queries['sktop5_l2'])
print('Sklearn - Accuracy of retreiving correct response in top 2 matched responses - l2 normalization: {}'.format(sktop2_l2*100))
print('Sklearn - Accuracy of retreiving correct response in top 5 matched responses - l2 normalization: {}'.format(sktop5_l2*100))

Sklearn - Accuracy of retreiving correct response in top 2 matched responses - l2 normalization: 45.0
Sklearn - Accuracy of retreiving correct response in top 5 matched responses - l2 normalization: 61.0


In [222]:
#save dataframes
# unprocessed_df.to_pickle('unprocessed_dataframe')
# df.to_pickle('text_preprocessed_dataframe')
