In [1]:
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
!pip install annoy
from annoy import AnnoyIndex
from datasketch import MinHash, MinHashLSHEnsemble
from sklearn.cluster import KMeans

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\senth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
df = pd.read_csv('train.csv')

In [3]:
df

Unnamed: 0,src_content,dummy_column,tgt_content,dummy_column.1
0,National Archives NEWLINE_CHAR NEWLINE_CHAR Y...,1.0,– The unemployment rate dropped to 8.2% last m...,1.0
1,LOS ANGELES (AP) — In her first interview sinc...,1.0,"– Shelly Sterling plans ""eventually"" to divorc...",1.0
2,"GAITHERSBURG, Md. (AP) — A small, private jet ...",1.0,– A twin-engine Embraer jet that the FAA descr...,1.0
3,Tucker Carlson Exposes His Own Sexism on Twitt...,1.0,– Tucker Carlson is in deep doodoo with conser...,1.0
4,A man accused of removing another man's testic...,1.0,– What are the three most horrifying words in ...,1.0
...,...,...,...,...
44967,"More than 670,000 copies of the Pearls’ self-p...",1.0,– The deaths of three children have been linke...,1.0
44968,Seeking out cost-conscious consumers who have ...,1.0,"– Apple is hoping its new, cheaper iPhone can ...",1.0
44969,Click to email this to a friend (Opens in new ...,1.0,"– January Jones, who plays the beleaguered wif...",1.0
44970,"BARRINGTON, R.I. (AP) — Women clad in yoga pan...",1.0,– A Rhode Island man who penned a letter to th...,1.0


In [4]:
df = df.drop(['dummy_column','dummy_column.1'], axis =1)

In [5]:
df

Unnamed: 0,src_content,tgt_content
0,National Archives NEWLINE_CHAR NEWLINE_CHAR Y...,– The unemployment rate dropped to 8.2% last m...
1,LOS ANGELES (AP) — In her first interview sinc...,"– Shelly Sterling plans ""eventually"" to divorc..."
2,"GAITHERSBURG, Md. (AP) — A small, private jet ...",– A twin-engine Embraer jet that the FAA descr...
3,Tucker Carlson Exposes His Own Sexism on Twitt...,– Tucker Carlson is in deep doodoo with conser...
4,A man accused of removing another man's testic...,– What are the three most horrifying words in ...
...,...,...
44967,"More than 670,000 copies of the Pearls’ self-p...",– The deaths of three children have been linke...
44968,Seeking out cost-conscious consumers who have ...,"– Apple is hoping its new, cheaper iPhone can ..."
44969,Click to email this to a friend (Opens in new ...,"– January Jones, who plays the beleaguered wif..."
44970,"BARRINGTON, R.I. (AP) — Women clad in yoga pan...",– A Rhode Island man who penned a letter to th...


In [6]:
#stop words removal

In [7]:
stop_words = stopwords.words('english')
df['src_content'] = df['src_content'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [8]:
 #Stemming

In [9]:
# Initialize the stemmer
stemmer = PorterStemmer()

# Apply stemming to the 'text' column
df['src_content'] = df['src_content'].apply(lambda x: stemmer.stem(x))

In [10]:
df

Unnamed: 0,src_content,tgt_content
0,national archives newline_char newline_char ye...,– The unemployment rate dropped to 8.2% last m...
1,los angeles (ap) — in first interview since nb...,"– Shelly Sterling plans ""eventually"" to divorc..."
2,"gaithersburg, md. (ap) — a small, private jet ...",– A twin-engine Embraer jet that the FAA descr...
3,tucker carlson exposes his own sexism twitter ...,– Tucker Carlson is in deep doodoo with conser...
4,a man accused removing another man's testicle ...,– What are the three most horrifying words in ...
...,...,...
44967,"more 670,000 copies pearls’ self-published boo...",– The deaths of three children have been linke...
44968,seeking cost-conscious consumers gravitated to...,"– Apple is hoping its new, cheaper iPhone can ..."
44969,click email friend (opens new window) newline_...,"– January Jones, who plays the beleaguered wif..."
44970,"barrington, r.i. (ap) — women clad yoga pants ...",– A Rhode Island man who penned a letter to th...


In [11]:
#Removal of unwanted email ids and websites 

In [12]:
import re

def remove_emails_websites(text):
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove website links starting with http:// or https://
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    return text

In [13]:
df['src_content'] = df['src_content'].apply(remove_emails_websites)

In [14]:
df

Unnamed: 0,src_content,tgt_content
0,national archives newline_char newline_char ye...,– The unemployment rate dropped to 8.2% last m...
1,los angeles (ap) — in first interview since nb...,"– Shelly Sterling plans ""eventually"" to divorc..."
2,"gaithersburg, md. (ap) — a small, private jet ...",– A twin-engine Embraer jet that the FAA descr...
3,tucker carlson exposes his own sexism twitter ...,– Tucker Carlson is in deep doodoo with conser...
4,a man accused removing another man's testicle ...,– What are the three most horrifying words in ...
...,...,...
44967,"more 670,000 copies pearls’ self-published boo...",– The deaths of three children have been linke...
44968,seeking cost-conscious consumers gravitated to...,"– Apple is hoping its new, cheaper iPhone can ..."
44969,click email friend (opens new window) newline_...,"– January Jones, who plays the beleaguered wif..."
44970,"barrington, r.i. (ap) — women clad yoga pants ...",– A Rhode Island man who penned a letter to th...


In [15]:
#Removing Paranthesis and Hyphens

In [16]:
df['src_content'] = df['src_content'].str.replace(r'[\(\)\-]', '', regex=True)

In [17]:
df

Unnamed: 0,src_content,tgt_content
0,national archives newline_char newline_char ye...,– The unemployment rate dropped to 8.2% last m...
1,los angeles ap — in first interview since nba ...,"– Shelly Sterling plans ""eventually"" to divorc..."
2,"gaithersburg, md. ap — a small, private jet cr...",– A twin-engine Embraer jet that the FAA descr...
3,tucker carlson exposes his own sexism twitter ...,– Tucker Carlson is in deep doodoo with conser...
4,a man accused removing another man's testicle ...,– What are the three most horrifying words in ...
...,...,...
44967,"more 670,000 copies pearls’ selfpublished book...",– The deaths of three children have been linke...
44968,seeking costconscious consumers gravitated tow...,"– Apple is hoping its new, cheaper iPhone can ..."
44969,click email friend opens new window newline_ch...,"– January Jones, who plays the beleaguered wif..."
44970,"barrington, r.i. ap — women clad yoga pants pl...",– A Rhode Island man who penned a letter to th...


In [18]:
#function to remove newline_char 

In [19]:
def remove_word(df, src_content, word):
    df[src_content] = df[src_content].str.replace(word, '')
    return df
word_to_remove = 'newline_char'
column_name = 'src_content'
df = remove_word(df,column_name,word_to_remove)

In [20]:
df

Unnamed: 0,src_content,tgt_content
0,"national archives yes, it’s time again, folk...",– The unemployment rate dropped to 8.2% last m...
1,los angeles ap — in first interview since nba ...,"– Shelly Sterling plans ""eventually"" to divorc..."
2,"gaithersburg, md. ap — a small, private jet cr...",– A twin-engine Embraer jet that the FAA descr...
3,tucker carlson exposes his own sexism twitter ...,– Tucker Carlson is in deep doodoo with conser...
4,a man accused removing another man's testicle ...,– What are the three most horrifying words in ...
...,...,...
44967,"more 670,000 copies pearls’ selfpublished book...",– The deaths of three children have been linke...
44968,seeking costconscious consumers gravitated tow...,"– Apple is hoping its new, cheaper iPhone can ..."
44969,click email friend opens new window click sh...,"– January Jones, who plays the beleaguered wif..."
44970,"barrington, r.i. ap — women clad yoga pants pl...",– A Rhode Island man who penned a letter to th...


In [21]:
Truncated_data = df.head(1000)


In [22]:
Truncated_data 

Unnamed: 0,src_content,tgt_content
0,"national archives yes, it’s time again, folk...",– The unemployment rate dropped to 8.2% last m...
1,los angeles ap — in first interview since nba ...,"– Shelly Sterling plans ""eventually"" to divorc..."
2,"gaithersburg, md. ap — a small, private jet cr...",– A twin-engine Embraer jet that the FAA descr...
3,tucker carlson exposes his own sexism twitter ...,– Tucker Carlson is in deep doodoo with conser...
4,a man accused removing another man's testicle ...,– What are the three most horrifying words in ...
...,...,...
995,_______ |_ _| | | | | ____| | | | | | | | \___...,"– Before dying by car bomb last October, Malte..."
996,"ramallah, west bank ap — a witness says member...",– A Palestinian Cabinet member has died amid a...
997,if you’ve ever thought birth control might mes...,– Anyone who's struggled with mood swings whil...
998,friends say surprise reconciliation cards holl...,– The faces of Demi and Ashton may be filling ...


In [23]:
#Removing the redunduncy present in the text by constructing cosine matrix

In [24]:
def remove_redundancy(Truncated_data, src_content):
    # Create TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the text data
    tfidf_matrix = tfidf_vectorizer.fit_transform(Truncated_data[src_content])

    # Compute pairwise cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)
    print(similarity_matrix)

    # Create a mask to track redundant sentences
    mask = []

    for i in range(len(similarity_matrix)):
        # Check if the sentence is similar to any previous sentences
        if not any(similarity_matrix[i, j] > 0.9 for j in range(i)):
            mask.append(True)
        else:
            mask.append(False)

    # Filter out redundant sentences
    Truncated_data_filtered = Truncated_data[mask]

    return Truncated_data_filtered


In [25]:
column_name = 'src_content'

Truncated_data_filtered = remove_redundancy(Truncated_data, column_name)
print(Truncated_data_filtered)

[[1.         0.01706763 0.02710664 ... 0.02806697 0.02163965 0.02732097]
 [0.01706763 1.         0.02104491 ... 0.00719449 0.02435722 0.02439763]
 [0.02710664 0.02104491 1.         ... 0.01747572 0.02201428 0.0452059 ]
 ...
 [0.02806697 0.00719449 0.01747572 ... 1.         0.01252957 0.03260633]
 [0.02163965 0.02435722 0.02201428 ... 0.01252957 1.         0.0325608 ]
 [0.02732097 0.02439763 0.0452059  ... 0.03260633 0.0325608  1.        ]]
                                           src_content  \
0    national archives   yes, it’s time again, folk...   
1    los angeles ap — in first interview since nba ...   
2    gaithersburg, md. ap — a small, private jet cr...   
3    tucker carlson exposes his own sexism twitter ...   
4    a man accused removing another man's testicle ...   
..                                                 ...   
995  _______ |_ _| | | | | ____| | | | | | | | \___...   
996  ramallah, west bank ap — a witness says member...   
997  if you’ve ever thought birth 

In [26]:
Truncated_data_filtered

Unnamed: 0,src_content,tgt_content
0,"national archives yes, it’s time again, folk...",– The unemployment rate dropped to 8.2% last m...
1,los angeles ap — in first interview since nba ...,"– Shelly Sterling plans ""eventually"" to divorc..."
2,"gaithersburg, md. ap — a small, private jet cr...",– A twin-engine Embraer jet that the FAA descr...
3,tucker carlson exposes his own sexism twitter ...,– Tucker Carlson is in deep doodoo with conser...
4,a man accused removing another man's testicle ...,– What are the three most horrifying words in ...
...,...,...
995,_______ |_ _| | | | | ____| | | | | | | | \___...,"– Before dying by car bomb last October, Malte..."
996,"ramallah, west bank ap — a witness says member...",– A Palestinian Cabinet member has died amid a...
997,if you’ve ever thought birth control might mes...,– Anyone who's struggled with mood swings whil...
998,friends say surprise reconciliation cards holl...,– The faces of Demi and Ashton may be filling ...


In [27]:
Truncated_data_filtered.loc[3,'src_content']

'tucker carlson exposes his own sexism twitter updated   tucker carlson done good work past… his site, the daily caller, frequent stop mine many conservatives. they responsible exposing journolist scandal, highlighted planning coordination many members leftwing press. i always grateful tucker’s team bringing story light. this also i angered tucker’s recent actions. i thought better this.   if haven’t heard now, monday evening, tucker carlson posted disturbing tweet governor palin said:   palin’s popularity falling iowa, maintains lead become supreme commander milfistan   aside tucker’s sheeplike response warped poll numbers, also failed take ownership sexist comment. he deleted original which i link retweet obviously aware posted wrong. unfortunately him, many people already seen responded. you can’t put toothpaste back tube, tucker.   is sort treatment conservative women, want get involved process, expected put with? is okay male columnists conservative otherwise continue objectifying

In [28]:
Truncated_data_filtered

Unnamed: 0,src_content,tgt_content
0,"national archives yes, it’s time again, folk...",– The unemployment rate dropped to 8.2% last m...
1,los angeles ap — in first interview since nba ...,"– Shelly Sterling plans ""eventually"" to divorc..."
2,"gaithersburg, md. ap — a small, private jet cr...",– A twin-engine Embraer jet that the FAA descr...
3,tucker carlson exposes his own sexism twitter ...,– Tucker Carlson is in deep doodoo with conser...
4,a man accused removing another man's testicle ...,– What are the three most horrifying words in ...
...,...,...
995,_______ |_ _| | | | | ____| | | | | | | | \___...,"– Before dying by car bomb last October, Malte..."
996,"ramallah, west bank ap — a witness says member...",– A Palestinian Cabinet member has died amid a...
997,if you’ve ever thought birth control might mes...,– Anyone who's struggled with mood swings whil...
998,friends say surprise reconciliation cards holl...,– The faces of Demi and Ashton may be filling ...


In [29]:
#Removing special character from dataframe
Truncated_data_filtered = Truncated_data_filtered.replace('[^\w\s]', '', regex=True) 

In [30]:
Truncated_data_filtered

Unnamed: 0,src_content,tgt_content
0,national archives yes its time again folks i...,The unemployment rate dropped to 82 last mont...
1,los angeles ap in first interview since nba b...,Shelly Sterling plans eventually to divorce h...
2,gaithersburg md ap a small private jet crashe...,A twinengine Embraer jet that the FAA describ...
3,tucker carlson exposes his own sexism twitter ...,Tucker Carlson is in deep doodoo with conserv...
4,a man accused removing another mans testicle m...,What are the three most horrifying words in t...
...,...,...
995,_______ _ _ ____ ______ ournalist n...,Before dying by car bomb last October Maltese...
996,ramallah west bank ap a witness says member p...,A Palestinian Cabinet member has died amid a ...
997,if youve ever thought birth control might mess...,Anyone whos struggled with mood swings while ...
998,friends say surprise reconciliation cards holl...,The faces of Demi and Ashton may be filling s...


In [31]:
#Cleaning the Tgt content which is the summary for the multi document summarizaion

In [32]:
#Removing Paranthesis and Hyphens
Truncated_data_filtered['tgt_content'] = Truncated_data_filtered['tgt_content'].str.replace(r'[\(\)\-]', '', regex=True)

In [33]:
#function to remove newline_char 

In [34]:
def remove_word(Truncated_data_filtered, tgt_content, word):
    df[tgt_content] = df[tgt_content].str.replace(word, '')
    return Truncated_data_filtered
word_to_remove = 'newline_char'
column_name = 'tgt_content'
Truncated_data_filtered = remove_word(Truncated_data_filtered,column_name,word_to_remove)

In [35]:
Truncated_data_filtered

Unnamed: 0,src_content,tgt_content
0,national archives yes its time again folks i...,The unemployment rate dropped to 82 last mont...
1,los angeles ap in first interview since nba b...,Shelly Sterling plans eventually to divorce h...
2,gaithersburg md ap a small private jet crashe...,A twinengine Embraer jet that the FAA describ...
3,tucker carlson exposes his own sexism twitter ...,Tucker Carlson is in deep doodoo with conserv...
4,a man accused removing another mans testicle m...,What are the three most horrifying words in t...
...,...,...
995,_______ _ _ ____ ______ ournalist n...,Before dying by car bomb last October Maltese...
996,ramallah west bank ap a witness says member p...,A Palestinian Cabinet member has died amid a ...
997,if youve ever thought birth control might mess...,Anyone whos struggled with mood swings while ...
998,friends say surprise reconciliation cards holl...,The faces of Demi and Ashton may be filling s...


In [36]:
Truncated_data_filtered.to_csv('Train_Truncated_Data.csv', index=False)