### Importing libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
import sys

### Reading data

In [2]:
# Read the train, test, and validation datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
validation_df = pd.read_csv("validation.csv")


# Combine the datasets
ats_data = pd.concat([train_df, test_df, validation_df], ignore_index=True)

# Save the combined dataset
#ats_data.to_csv("combined.csv", index=False)

In [3]:
ats_data

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...
...,...,...,...
311966,e93f721ba4949f21f33549c4a21d55ff456af979,All shops will be allowed to offer ‘click and ...,Shops won't have to apply for planning permiss...
311967,8df19a570ad14119a7d00f3bbe864fedf8c1691d,Mo Farah has had his nationality called into q...,Mo Farah broke the European half-marathon reco...
311968,2fdd5f89aa26e91ceea9b0ef264abfcfc3e6fa2e,Wolves kept their promotion hopes alive with a...,Wolves are three points off the play-off place...
311969,530d7b18d7a715b368b0745f9dfebfe353adeda8,A Brown University graduate student has died ...,"Hyoun Ju Sohn, a 25-year-old doctoral student,..."


In [4]:
ats_data.article[0]

"By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained 

In [5]:
ats_data.highlights[0]

'Bishop John Folda, of North Dakota, is taking time off after being diagnosed .\nHe contracted the infection through contaminated food in Italy .\nChurch members in Fargo, Grand Forks and Jamestown could have been exposed .'

### check if all ID values are unique

In [6]:
ats_data.id.nunique()

311971

### dropping the entire ID column since it is insignificant to the research

In [7]:
ats_data = ats_data.drop('id', axis= 1)
ats_data.head(2)

Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...


In [8]:
ats_data.loc[1, 'article'] = np.nan
ats_data.loc[122, 'article'] = np.nan
ats_data.loc[211, 'article'] = np.nan
ats_data.loc[51, 'article'] = np.nan
ats_data.loc[278, 'article'] = np.nan
ats_data.loc[218, 'article'] = np.nan
ats_data.loc[9024, 'article'] = np.nan
ats_data.loc[5216, 'article'] = np.nan
ats_data.loc[111, 'article'] = np.nan
ats_data.loc[222, 'article'] = np.nan
ats_data.loc[221, 'article'] = np.nan
ats_data.loc[151, 'article'] = np.nan
ats_data.loc[2078, 'article'] = np.nan
ats_data.loc[2118, 'article'] = np.nan
ats_data.loc[90124, 'article'] = np.nan
ats_data.loc[51216, 'article'] = np.nan
ats_data.loc[6522, 'article'] = np.nan
ats_data.loc[21221, 'article'] = np.nan
ats_data.loc[52121, 'article'] = np.nan
ats_data.loc[27338, 'article'] = np.nan
ats_data.loc[21118, 'article'] = np.nan
ats_data.loc[90214, 'article'] = np.nan
ats_data.loc[52316, 'article'] = np.nan
ats_data.loc[2, 'article'] = np.nan
ats_data.loc[12, 'article'] = np.nan
ats_data.loc[21, 'article'] = np.nan
ats_data.loc[513, 'article'] = np.nan
ats_data.loc[27, 'article'] = np.nan
ats_data.loc[28, 'article'] = np.nan
ats_data.loc[924, 'article'] = np.nan
ats_data.loc[516, 'article'] = np.nan

## Incomplete & Missing data

### Drop rows with missing values in any of the 3 columns.

In [9]:
#checking for null values
ats_data.isnull().sum()

article       31
highlights     0
dtype: int64

In [10]:
# Drop rows with missing values in 'id', 'article', or 'highlights' columns
ats_data.dropna(subset=['article', 'highlights'], inplace=True)

# Display the updated summary of the cleaned dataset
print(ats_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 311940 entries, 0 to 311970
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   article     311940 non-null  object
 1   highlights  311940 non-null  object
dtypes: object(2)
memory usage: 7.1+ MB
None


In [11]:
ats_data.isnull().sum()

article       0
highlights    0
dtype: int64

## Noisy Data


### Remove any HTML tags from `article` and `highlights` columns

In [12]:
# Convert the DataFrame to a string
article_string = str(ats_data['article'])

# Check if the article column contains any HTML links or tags
article_has_links_or_tags = re.search(r'<a href="[^"]+">|</a>', article_string) is not None

# Print the results
print("Does the article column contain any HTML links or tags?:", article_has_links_or_tags)



# Convert the DataFrame to a string
highlights_string = str(ats_data['highlights'])

# Check if the article column contains any HTML links or tags
highlights_has_links_or_tags = re.search(r'<a href="[^"]+">|</a>', highlights_string) is not None

# Print the results
print("Does the article column contain any HTML links or tags?:", highlights_has_links_or_tags)

ats_data['article'] = ats_data['article'].apply(lambda x: re.sub(r'<.*?|>', '', x))
ats_data['highlights'] = ats_data['highlights'].apply(lambda x: re.sub(r'<.*?>', '', x))

# Display the updated summary of the cleaned dataset
print(ats_data.info())

Does the article column contain any HTML links or tags?: False
Does the article column contain any HTML links or tags?: False


In [13]:
ats_data['article'] = ats_data['article'].apply(lambda x: re.sub(r'<.*?|>', '', x))
ats_data['highlights'] = ats_data['highlights'].apply(lambda x: re.sub(r'<.*?>', '', x))

# Display the updated summary of the cleaned dataset
print(ats_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 311940 entries, 0 to 311970
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   article     311940 non-null  object
 1   highlights  311940 non-null  object
dtypes: object(2)
memory usage: 7.1+ MB
None


### Removing '\n' newline characters

#### before removal

In [14]:
ats_data.highlights[0]

'Bishop John Folda, of North Dakota, is taking time off after being diagnosed .\nHe contracted the infection through contaminated food in Italy .\nChurch members in Fargo, Grand Forks and Jamestown could have been exposed .'

In [15]:
ats_data['article'] = ats_data['article'].apply(lambda x: re.sub(r' .\n', '. ', x))
ats_data['highlights'] = ats_data['highlights'].apply(lambda x: re.sub(r' .\n', '. ', x))

#### after removal

In [16]:
ats_data.highlights[0]

'Bishop John Folda, of North Dakota, is taking time off after being diagnosed. He contracted the infection through contaminated food in Italy. Church members in Fargo, Grand Forks and Jamestown could have been exposed .'

### Removing all special characters

#### Before removal

In [17]:
ats_data.article[7]

'By . Daily Mail Reporter . This is the moment a train announcer stunned passengers by announcing over a tannoy as they pulled into a station to beware of pickpockets and gipsies. The London Midland service had been pulling into Telford Station, Shropshire, on Saturday when the comments were made. Passenger Chris Downes, 46, was recording on his mobile at the time and the announcer can clearly be heard saying: \'Telford Central - please be aware of pickpockets and gipsies\'. Scroll down for video . This is the moment a train announcer stunned passengers by announcing over a tannoy as they pulled into a station to beware of pickpockets and gipsies . The remark was mainly greeted by cheers from Shrewsbury Town football fans travelling back from their game against Wolverhampton Wanderers. But London Midland said it is now launching an investigation into the incident on board the 17.25 Wolverhampton to Shrewsbury service. Yesterday Wolves fan Mr Downes, who was on his way home to Bayston H

In [18]:
regex = re.compile('[^\w\s,.!?()\’]')
ats_data['article'] = ats_data['article'].str.replace(regex, '')
ats_data['highlights'] = ats_data['highlights'].str.replace(regex, '')

In [19]:
ats_data.article[7]

'By . Daily Mail Reporter . This is the moment a train announcer stunned passengers by announcing over a tannoy as they pulled into a station to beware of pickpockets and gipsies. The London Midland service had been pulling into Telford Station, Shropshire, on Saturday when the comments were made. Passenger Chris Downes, 46, was recording on his mobile at the time and the announcer can clearly be heard saying Telford Central  please be aware of pickpockets and gipsies. Scroll down for video . This is the moment a train announcer stunned passengers by announcing over a tannoy as they pulled into a station to beware of pickpockets and gipsies . The remark was mainly greeted by cheers from Shrewsbury Town football fans travelling back from their game against Wolverhampton Wanderers. But London Midland said it is now launching an investigation into the incident on board the 17.25 Wolverhampton to Shrewsbury service. Yesterday Wolves fan Mr Downes, who was on his way home to Bayston Hill, S

In [20]:
column_name = 'article' 

# Check for duplicates in the specified column
duplicate_values = ats_data.duplicated(subset=column_name, keep=False)

# Filter the DataFrame to show only the duplicate rows
duplicate_rows = ats_data[duplicate_values]

In [21]:
duplicates=pd.DataFrame(duplicate_rows)

In [22]:
duplicates.shape

(6214, 2)

In [23]:
ats_data.drop_duplicates(subset=column_name, inplace=True)

In [24]:
column_name = 'article' 

# Check for duplicates in the specified column
duplicate_values_ = ats_data.duplicated(subset=column_name, keep=False)

# Filter the DataFrame to show only the duplicate rows
duplicate_rows_ = ats_data[duplicate_values_]

In [25]:
duplicates_recheck=pd.DataFrame(duplicate_rows_)
duplicates_recheck.shape

(0, 2)

## Imbalanced data

### Convert all text to lowercase in `article` and `highlights` columns

In [26]:
# Convert all text to lowercase in 'article' and 'highlights' column
ats_data['article'] = ats_data['article'].str.lower()
ats_data['highlights'] = ats_data['highlights'].str.lower()

### Expanding all the contractions in the `article` and `highlights` column

In [27]:
# Define a dictionary to map contractions to their expanded forms
contraction_map = {
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

# Function to expand contractions
def expand_contractions(text):
    return ' '.join([contraction_map.get(word, word) for word in text.split()])

# Apply the function to the 'article' and 'highlights' columns in the dataframe
ats_data['article'] = ats_data['article'].apply(expand_contractions)
ats_data['highlights'] = ats_data['highlights'].apply(expand_contractions)

# Save the cleaned dataset to a new CSV file
#ats_data.to_csv('cleaned.csv', index=False)  

In [28]:
# Save the cleaned dataset to a new CSV file
ats_data.to_csv('cleaned.csv', index=False) 

In [31]:
#uncomment later
'''# Calculate article lengths
ats_data['article_length'] = ats_data['article'].str.split().apply(len)

# Bar chart of article lengths with custom colors
plt.figure(figsize=(10, 6))
sns.histplot(ats_data['article_length'], bins=20, color='#1f77b4')  # Set custom color
plt.axvline(ats_data['article_length'].mean(), color='red', linestyle='--', label='Mean')  # Add a vertical line for mean article length
plt.xlabel('Article Length')
plt.ylabel('Count')
plt.title('Distribution of Article Lengths')
plt.legend()
plt.show()'''

"# Calculate article lengths\nats_data['article_length'] = ats_data['article'].str.split().apply(len)\n\n# Bar chart of article lengths with custom colors\nplt.figure(figsize=(10, 6))\nsns.histplot(ats_data['article_length'], bins=20, color='#1f77b4')  # Set custom color\nplt.axvline(ats_data['article_length'].mean(), color='red', linestyle='--', label='Mean')  # Add a vertical line for mean article length\nplt.xlabel('Article Length')\nplt.ylabel('Count')\nplt.title('Distribution of Article Lengths')\nplt.legend()\nplt.show()"

In [32]:
#uncomment later

# Example: Word cloud of highlights
'''from wordcloud import WordCloud

plt.figure(figsize=(10, 8))
wordcloud = WordCloud(background_color='whitesmoke', width=800, height=400, max_words=100).generate(' '.join(ats_data['highlights']))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Highlights')
plt.show()'''

"from wordcloud import WordCloud\n\nplt.figure(figsize=(10, 8))\nwordcloud = WordCloud(background_color='whitesmoke', width=800, height=400, max_words=100).generate(' '.join(ats_data['highlights']))\nplt.imshow(wordcloud, interpolation='bilinear')\nplt.axis('off')\nplt.title('Word Cloud of Highlights')\nplt.show()"

In [33]:
#uncomment later

'''plt.figure(figsize=(10, 8))
wordcloud = WordCloud(background_color='whitesmoke', width=800, height=400, max_words=100).generate(' '.join(ats_data['article']))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of article')
plt.show()'''

"plt.figure(figsize=(10, 8))\nwordcloud = WordCloud(background_color='whitesmoke', width=800, height=400, max_words=100).generate(' '.join(ats_data['article']))\nplt.imshow(wordcloud, interpolation='bilinear')\nplt.axis('off')\nplt.title('Word Cloud of article')\nplt.show()"

In [34]:
#uncomment later

# Example: Scatter plot of article length vs highlight length
'''ats_data['highlights_length'] = ats_data['highlights'].str.split().apply(len)

plt.figure(figsize=(8, 6))
sns.scatterplot(x='article_length', y='highlights_length', data=ats_data)
plt.xlabel('Article Length')
plt.ylabel('Highlight Length')
plt.title('Scatter Plot of Article Length vs Highlights Length')
plt.show()'''

"ats_data['highlights_length'] = ats_data['highlights'].str.split().apply(len)\n\nplt.figure(figsize=(8, 6))\nsns.scatterplot(x='article_length', y='highlights_length', data=ats_data)\nplt.xlabel('Article Length')\nplt.ylabel('Highlight Length')\nplt.title('Scatter Plot of Article Length vs Highlights Length')\nplt.show()"

In [35]:
#uncomment later

'''correlation = ats_data[['article_length', 'highlights_length']].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation between Article Length and Highlight Length')
plt.show()'''

"correlation = ats_data[['article_length', 'highlights_length']].corr()\nplt.figure(figsize=(8, 6))\nsns.heatmap(correlation, annot=True, cmap='coolwarm')\nplt.title('Correlation between Article Length and Highlight Length')\nplt.show()"

In [36]:
#uncomment later

# Create a line graph with two lines
'''plt.figure(figsize=(10, 6))
plt.plot(ats_data['highlights_length'], label='Highlights Length')
plt.plot(ats_data['article_length'], label='Article Length')
plt.xlabel('Index')
plt.ylabel('Length')
plt.title('Length of Highlights and Articles')
plt.legend()
plt.show()'''

"plt.figure(figsize=(10, 6))\nplt.plot(ats_data['highlights_length'], label='Highlights Length')\nplt.plot(ats_data['article_length'], label='Article Length')\nplt.xlabel('Index')\nplt.ylabel('Length')\nplt.title('Length of Highlights and Articles')\nplt.legend()\nplt.show()"

In [37]:
#!pip install nltk

In [38]:
'''import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Define a function to calculate the percentage of nouns and pronouns in a given text
def calc_noun_pronoun_percentage(text):
    # Tokenize the text into words
    words = word_tokenize(text)
    # Remove stop words from the list of words
    filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]
    # Get the part-of-speech (POS) tags for the filtered words
    pos_tags = nltk.pos_tag(filtered_words)
    # Count the number of nouns and pronouns in the POS tags
    noun_count = sum([1 for tag in pos_tags if tag[1] == 'NN' or tag[1] == 'NNS' or tag[1] == 'NNP' or tag[1] == 'NNPS'])
    pronoun_count = sum([1 for tag in pos_tags if tag[1] == 'PRP' or tag[1] == 'PRP$'])
    # Calculate the percentage of nouns and pronouns in the text
    total_count = noun_count + pronoun_count
    noun_percentage = round((noun_count / total_count) * 100, 2) if total_count > 0 else 0.0
    pronoun_percentage = round((pronoun_count / total_count) * 100, 2) if total_count > 0 else 0.0
    # Return the percentage of nouns and pronouns
    return (noun_percentage, pronoun_percentage)

# Calculate the percentage of nouns and pronouns in the highlights column
ats_data['highlight_noun_percentage'], ats_data['highlight_pronoun_percentage'] = zip(*ats_data['highlights'].apply(calc_noun_pronoun_percentage))

# Plot a bar chart of the noun and pronoun percentages
plt.figure(figsize=(8, 6))
sns.barplot(x=['Nouns', 'Pronouns'], y=[ats_data['highlight_noun_percentage'].mean(), ats_data['highlight_pronoun_percentage'].mean()], palette='coolwarm')
plt.ylim(0, 100)
plt.xlabel('Part of Speech')
plt.ylabel('Percentage')
plt.title('Percentage of Nouns and Pronouns in the Highlights')
plt.show()'''

"import nltk\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import word_tokenize\n\n# Define a function to calculate the percentage of nouns and pronouns in a given text\ndef calc_noun_pronoun_percentage(text):\n    # Tokenize the text into words\n    words = word_tokenize(text)\n    # Remove stop words from the list of words\n    filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]\n    # Get the part-of-speech (POS) tags for the filtered words\n    pos_tags = nltk.pos_tag(filtered_words)\n    # Count the number of nouns and pronouns in the POS tags\n    noun_count = sum([1 for tag in pos_tags if tag[1] == 'NN' or tag[1] == 'NNS' or tag[1] == 'NNP' or tag[1] == 'NNPS'])\n    pronoun_count = sum([1 for tag in pos_tags if tag[1] == 'PRP' or tag[1] == 'PRP$'])\n    # Calculate the percentage of nouns and pronouns in the text\n    total_count = noun_count + pronoun_count\n    noun_percentage = round((noun_count / total_count) * 100, 2) if tot

In [39]:
'''# Calculate the percentage of nouns and pronouns in the article column
ats_data['article_noun_percentage'], ats_data['article_pronoun_percentage'] = zip(*ats_data['article'].apply(calc_noun_pronoun_percentage))

# Plot a bar chart of the noun and pronoun percentages
plt.figure(figsize=(8, 6))
sns.barplot(x=['Nouns', 'Pronouns'], y=[ats_data['article_noun_percentage'].mean(), ats_data['article_pronoun_percentage'].mean()], palette='coolwarm')
plt.ylim(0, 100)
plt.xlabel('Part of Speech')
plt.ylabel('Percentage')
plt.title('Percentage of Nouns and Pronouns in the Article')
plt.show()'''

"# Calculate the percentage of nouns and pronouns in the article column\nats_data['article_noun_percentage'], ats_data['article_pronoun_percentage'] = zip(*ats_data['article'].apply(calc_noun_pronoun_percentage))\n\n# Plot a bar chart of the noun and pronoun percentages\nplt.figure(figsize=(8, 6))\nsns.barplot(x=['Nouns', 'Pronouns'], y=[ats_data['article_noun_percentage'].mean(), ats_data['article_pronoun_percentage'].mean()], palette='coolwarm')\nplt.ylim(0, 100)\nplt.xlabel('Part of Speech')\nplt.ylabel('Percentage')\nplt.title('Percentage of Nouns and Pronouns in the Article')\nplt.show()"

## Data Transformation

### Data Augmentation

#### Data Augmentation using Rule based techniques on cleaned train split

#### Main idea is to take 20% of train data to apply augmentation using synonym replacement technique and add augmented data to the original train split so that both original data of that 20% and augmentated data will be present which increases the vocabulary of the data. Same applies to Random deletion and random swap techniques

In [29]:
#!pip install nlpaug

import pandas as pd
import torch
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas


### Splitting the cleaned dataset into train, test and validation sets before applying augmentation 

In [30]:
train_size = int(len(ats_data) * 0.9)
test_size = int(len(ats_data) * 0.05)
val_size = len(ats_data) - train_size - test_size

train_df = ats_data.iloc[:train_size]
test_df = ats_data.iloc[train_size:train_size + test_size]
val_df = ats_data.iloc[train_size + test_size:]

In [31]:
print('train_df', train_df.shape)
print('test_df', test_df.shape)
print('val_df', val_df.shape)

train_df (277945, 2)
test_df (15441, 2)
val_df (15442, 2)


In [32]:
train_df.head(4)

Unnamed: 0,article,highlights
0,"by . associated press . published . 1411 est, ...","bishop john folda, of north dakota, is taking ..."
3,(cnn) with a breezy sweep of his pen president...,nina dos santos says europe must be ready to a...
4,fleetwood are the only team still to have a 10...,fleetwood top of league one after 20 win at sc...
5,hes been accused of making many a fashion faux...,prime minister and his family are enjoying an ...


In [33]:
test_df.head(4)

Unnamed: 0,article,highlights
280947,(cnn) the remake of the movie sparkle seemed t...,whitney houston took her first movie role in 1...
280948,"over 1,000 people have been left homeless afte...",firefighters battled a huge fire wednesday nig...
280949,(cnn) duwon steven clark is standing on the wh...,haitian american duwon steven clark returned t...
280950,(cnn) seeing more mustaches this month? many o...,movember encourages men to grow mustaches to r...


In [34]:
val_df.head(4)

Unnamed: 0,article,highlights
296526,rating . poets house is a swish new hotel in e...,"poets house is a swish new hotel in ely, three..."
296527,dani alves looks set to leave barcelona this s...,dani alves has spent seven seasons with the ca...
296528,a libyan rapper has released a new music video...,calling himself volcano rapper is a 31yearold ...
296529,theres no denying that the british interior ae...,the landgate cottage in rye is outfitted in an...


In [35]:
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

In [36]:
train_df.head(4)

Unnamed: 0,article,highlights
0,"by . associated press . published . 1411 est, ...","bishop john folda, of north dakota, is taking ..."
1,(cnn) with a breezy sweep of his pen president...,nina dos santos says europe must be ready to a...
2,fleetwood are the only team still to have a 10...,fleetwood top of league one after 20 win at sc...
3,hes been accused of making many a fashion faux...,prime minister and his family are enjoying an ...


In [37]:
test_df.head(4)

Unnamed: 0,article,highlights
0,(cnn) the remake of the movie sparkle seemed t...,whitney houston took her first movie role in 1...
1,"over 1,000 people have been left homeless afte...",firefighters battled a huge fire wednesday nig...
2,(cnn) duwon steven clark is standing on the wh...,haitian american duwon steven clark returned t...
3,(cnn) seeing more mustaches this month? many o...,movember encourages men to grow mustaches to r...


In [49]:
val_df.head(4)

Unnamed: 0,article,highlights
0,rating . poets house is a swish new hotel in e...,"poets house is a swish new hotel in ely, three..."
1,dani alves looks set to leave barcelona this s...,dani alves has spent seven seasons with the ca...
2,a libyan rapper has released a new music video...,calling himself volcano rapper is a 31yearold ...
3,theres no denying that the british interior ae...,the landgate cottage in rye is outfitted in an...


In [38]:
test_df.to_csv('cleaned_test.csv', index=False)
val_df.to_csv('cleaned_val.csv', index=False)

### 1. Synonym Replacement


#### taking a random 20% sample from train split for augmentation using 1st technique (Synonym Replacement)

In [50]:
# Define the percentage of data to augment
augment_percentage = 0.2

#dataframe to store 20% data taken from train split
twenty_percent_df = pd.DataFrame()
twenty_percent_df = train_df.sample(frac=augment_percentage)

#new dataframe to store augmented data
augmented_train_df = pd.DataFrame()

In [51]:
twenty_percent_df.shape

(55589, 2)

#### Before Synonym Replacement

In [52]:
twenty_percent_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55589 entries, 119807 to 267532
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   article     55589 non-null  object
 1   highlights  55589 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [53]:
twenty_percent_df.reset_index(drop=True,inplace=True)

In [54]:
twenty_percent_df.article[1]

'by . steve nolan . published . 0653 est, 18 july 2013 . . updated . 0654 est, 18 july 2013 . a new zealand woman allegedly bombarded the bestselling author of a book on the harry potter series with threats and abuse, according to a federal court complaint. jessica elizabeth parker is said to have sent a barrage of abuse over a five year period to new yorkbased harry, a history author melissa anelli. ms parker is accused of threatening to slit the throat of ms anellli and is even said to have got a tattoo of the potter experts face, according to the new york post. victim harry potter expert and bestselling author melissa anelli who has allegedly been stalked by a new zealand woman for the past five years . it is claimed that the abuse began in 2008 when ms parker was banned from ms anellis the leaky cauldron fan website after writing abusive and violent posts about harry potter actress emma watson. according to the new york daily news, the complaint file states that ms parker wrote on 

In [55]:
twenty_percent_df.highlights[1]

'the fbi has obtained a warrant for the arrest of jessica elizabeth parker. she allegedly abused harry, a history author melissa anelli over five years. ms parker is said to have threatened brooklynbased ms anellis family. harry potter expert ms anelli welcomed the development today .'

#### Function definition of Synonym replacement

In [56]:
#!pip install nlpaug

import pandas as pd
import torch
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

# Define the percentage of data to augment
augment_percentage = 0.2

#dataframe to store 20% data taken from train split
twenty_percent_df = pd.DataFrame()
twenty_percent_df = train_df.sample(frac=augment_percentage)

#new dataframe to store augmented data
augmented_train_df = pd.DataFrame()

twenty_percent_df.reset_index(drop=True,inplace=True)

# Define the augmentation functions
def augment_synonym(text):
    # Create an augmentation object
    aug = naw.SynonymAug(aug_src='wordnet')
    # Augment the text
    augmented_text = aug.augment(text)
    return augmented_text

# Apply the augmentation functions to the test data

augmented_train_df['article'] = twenty_percent_df['article'].apply(augment_synonym)
augmented_train_df['highlights'] = twenty_percent_df['highlights'].apply(augment_synonym)

#appending the augmented rows to main cleaned train split df(test_df)from augmented_test_df
train_df = train_df.append(augmented_train_df, ignore_index=True)

# Save the augmented+train data to a CSV file for backup
train_df.to_csv('augmented_train1.csv', index=False)

#### calling the augmentation function and storing the results in augmented_test_df and then appending to the main train split

In [57]:
# Apply the augmentation functions to the test data

augmented_train_df['article'] = twenty_percent_df['article'].apply(augment_synonym)
augmented_train_df['highlights'] = twenty_percent_df['highlights'].apply(augment_synonym)

#appending the augmented rows to main cleaned train split df(test_df)from augmented_test_df
train_df = train_df.append(augmented_train_df, ignore_index=True)

# Save the augmented+train data to a CSV file for backup
train_df.to_csv('augmented_train1.csv', index=False)

  train_df = train_df.append(augmented_train_df, ignore_index=True)


In [58]:
augmented_train_df.columns

Index(['article', 'highlights'], dtype='object')

In [59]:
augmented_train_df.reset_index(drop=True,inplace=True)

#### After Synonym Replacement

In [60]:
augmented_train_df.article[1]

['by. steve nolan. published. 0653 est, 18 july 2013. . updated. 0654 est, 18 july 2013. a new zealand woman allegedly bombarded the bestselling author of a book on the harry potter series with threats and abuse, according to a federal court complaint. jessica elizabeth parker is said to have sent a barrage of abuse over a five year period to new yorkbased harry, a history author melissa anelli. ms parker is accused of threatening to slit the throat of ms. anellli and is even said to have got a tattoo of the potter experts face, according to the new york post. victim harry potter expert and bestselling author melissa anelli who has allegedly been stalked by a new zealand woman for the past five years. it is claimed that the abuse began in 2008 when ms parker was banned from ms anellis the leaky cauldron fan website after writing abusive and violent posts about harry potter actress emma watson. according to the new york daily news, the complaint file state that ms parker wrote on a post

In [61]:
augmented_train_df.highlights[1]

['the federal bureau of investigation has prevail a warrantee for the arrest of jessica elizabeth ii charlie parker. she allegedly abused harry, a account author melissa anelli over five class. ms parker is sound out to have peril brooklynbased ms anellis family. harry potter expert ms anelli welcomed the development today.']

### 2. Random Deletion

#### taking a random 20% sample from train split for augmentation using 2nd technique (Random Deletion)


In [62]:
# Define the percentage of data to augment
augment_percentage = 0.2

#new dataframe to store augmented data
augmented_train_df2 = pd.DataFrame(columns=['article','highlights'])

#dataframe to store 20% data taken from train split
twenty_percent_df2 = pd.DataFrame()
twenty_percent_df2 = train_df.sample(frac=augment_percentage)


In [63]:
twenty_percent_df2.reset_index(drop=True, inplace=True)

#### Before Random Deletion

In [64]:
twenty_percent_df2.article[0]

['( cnn) sometimes the price of success is losing the thing you most cherish. for charlotte dujardin, every triumph in the sporting arena seemingly took her another footprint closer to an uncertain future. billed as the girl with the dancing horse after her heroics at the london 2012 olympics, she feared that every time she competed would be the last waltz with her equine partner. the mere thought of losing her best friend reduced dujardin to tears. valegro, the horse on which she won double gold, on which she was crowned double european champion and on which she aims to repeat the feat at augusts world championships, was attracting big offers. he was valued at 10 million but, after much speculation, valegros for sale sign has finally come down meaning they now have the chance to continue a remarkable recordsetting run together. his future is secure he is never going to be sold, we have him forever, the british rider says of the 12yearold horse, which is coowned by her mentor and fello

In [65]:
twenty_percent_df2.highlights[0]

['charlotte dujardin rode to double dressage atomic number 79 at the london olympics on valegro. simply the reverence was the partnership would embody separated with valegro up for sale. the partnership equal nowadays safe and the pair have gone on to break world records. dujardin now has the world deed of conveyance in her sights, the one major achiever missing from her cv.']

#### Function definition for random deletion technique

In [66]:
def augment_random_deletion(text):
    # Create an augmentation object
    aug = naw.RandomWordAug(action='delete')
    # Augment the text
    augmented_text = aug.augment(text)
    return augmented_text

#### calling the augmentation function and storing the results in augmented_test_df2 and then appending to the main train split

In [67]:
augmented_train_df2['article'] = twenty_percent_df2['article'].apply(augment_random_deletion)
augmented_train_df2['highlights'] = twenty_percent_df2['highlights'].apply(augment_random_deletion)

#appending the augmented rows to main cleaned train split df(test_df)from augmented_test_df2
train_df = train_df.append(augmented_train_df2, ignore_index=True)

# Save the augmented+train data to a CSV file for backup
train_df.to_csv('augmented_train2.csv', index=False)

  train_df = train_df.append(augmented_train_df2, ignore_index=True)


In [68]:
augmented_train_df2.reset_index(drop=True, inplace=True)

#### After Random Deletion technique


In [69]:
#augmented_train_df2.shape
augmented_train_df2.article[0]

['( cnn) sometimes the price of success is losing the thing you most cherish. for charlotte dujardin, every triumph in the sporting arena seemingly took her another footprint closer to an uncertain future. billed as the girl with the dancing horse after her heroics at the london 2012 olympics, she feared that every time she competed would be last waltz with her equine partner. the mere thought of losing her best friend reduced dujardin to tears. , the horse on which she won double gold, on which she was crowned double european champion and on which she aims to repeat the feat at augusts world championships, was attracting big offers. he was valued at 10 million but, after much speculation, valegros for sale sign has finally come down meaning they now have the chance to continue a remarkable recordsetting run together. his future is secure he is never going to be sold, we have him forever, the british rider says of the 12yearold horse, which is coowned by her mentor and fellow olympic c

In [70]:
augmented_train_df2.highlights[0]

['charlotte dujardin rode to double dressage atomic number 79 the london olympics on valegro. simply the reverence the partnership would embody separated with valegro for sale. nowadays and pair have gone on to break world records. dujardin has the world deed of conveyance in sights, the one major achiever missing from her cv.']

### 3. Random Swap

#### taking a random 20% sample from train split for augmentation using 2nd technique (Random Swap)


In [71]:
# Define the percentage of data to augment
augment_percentage = 0.2

#new dataframe to store augmented data
augmented_train_df3 = pd.DataFrame(columns=['article','highlights'])

#dataframe to store 20% data taken from train split
twenty_percent_df3 = pd.DataFrame()
twenty_percent_df3 = train_df.sample(frac=augment_percentage)


In [72]:
twenty_percent_df3.reset_index(drop=True, inplace=True)

In [73]:
twenty_percent_df3.columns

Index(['article', 'highlights'], dtype='object')

#### Before applying Random Swap technique


In [74]:
twenty_percent_df3.article[0]

'by . daily mail reporter . published . 1746 est, 27 december 2013 . . updated . 1750 est, 27 december 2013 . the moment an armed 16yearold boy was shot dead by police after a highspeed chase in a stolen car and daylong manhunt has been caught on camera. peyton cole barbour was killed in a volley of gunfire in which two grand prairie police officers were injured in texas on christmas day . the mansfield lake ridge high school student was flown to hospital but died from his wounds. scroll down for video . tragic peyton barbour, 16, was shot dead by police on christmas day after leading them on a highspeed chase in a stolen car . troubled teen peyton cole barbour, 16, was a student at mansfield lake ridge high school in mansfield, texas . the shocking incident began about 4.30am wednesday when an officer tried to stop the vehicle the teen was driving near north grand peninsula drive and england parkway in grand prairie because of suspicious activity. barbour sped away in the stolen 1997 

In [75]:
twenty_percent_df3.highlights[0]

'peyton cole barbour, 16, was shot dead by police on christmas day. the armed teen led cops on a chase in a stolen car in texas. two police officers were wounded. witness tammy king captured the shootout on her cell phone .'

#### Function definition for random swap technique

In [76]:
def augment_random_swap(text):
    # Create an augmentation object
    aug = naw.RandomWordAug(action='swap')
    # Augment the text
    #text = "(cnn) -- with a breezy sweep of his pen president vladimir putin wrote a new chapter into crimea's turbulent history, committing the region to a future returned to russian domain. sixty years prior, ukraine's breakaway peninsula was signed away just as swiftly by soviet leader nikita khrushchev. but dealing with such a blatant land grab on its eastern flank will not be anywhere near as quick and easy for europe's 28-member union. because, unlike crimea's rushed referendum, everyone has a say. after initially slapping visa restrictions and asset freezes on a limited number of little known politicians and military men, europe is facing urgent calls to widen the scope of its measures to target the russian business community in particular. "
    augmented_text = aug.augment(text)
    return augmented_text

#### calling the augmentation function and storing the results in augmented_test_df3 and then appending to the main train split


In [77]:
augmented_train_df3['article'] = twenty_percent_df3['article'].apply(augment_random_swap)
augmented_train_df3['highlights'] = twenty_percent_df3['highlights'].apply(augment_random_swap)

#appending the augmented rows to main cleaned train split df(test_df)from augmented_test_df3
train_df = train_df.append(augmented_train_df3, ignore_index=True)

# Save the augmented+train data to a CSV file for backup
train_df.to_csv('augmented_train3.csv', index=False)

  train_df = train_df.append(augmented_train_df3, ignore_index=True)


In [78]:
augmented_train_df3.reset_index(drop=True,inplace=True)

In [79]:
augmented_train_df3.columns

Index(['article', 'highlights'], dtype='object')

#### After Random Swap technique

In [80]:
augmented_train_df3.article[0]

['by. daily mail reporter. published. 1746 est, 27 december 2013. . updated. 1750 est, 27 december 2013. moment the an armed 16yearold boy was shot dead by police after a highspeed chase in stolen a car and daylong manhunt has been caught on camera. peyton cole barbour was killed in a volley of gunfire in which two grand prairie police officers were injured in texas on christmas day. the mansfield lake ridge high school student was flown to hospital but died from his wounds. scroll down for video. tragic peyton barbour, 16, was shot dead by police on christmas day after leading them on a highspeed chase in a stolen car. troubled teen peyton cole barbour, 16, was a student at mansfield lake ridge high school in mansfield, texas. the shocking incident began about 4. 30am wednesday when an officer tried to stop the vehicle the teen was driving near north grand peninsula drive and england parkway in grand prairie because of suspicious activity. barbour sped away in the stolen 1997 honda, l

In [81]:
augmented_train_df3.highlights[0]

['peyton, cole barbour 16 was, shot dead by police on christmas day. the armed teen cops led on chase a in a stolen car in texas. two police were officers wounded. witness king tammy captured the shootout on her phone cell.']

In [82]:
# final csv file after all augmentation techniques is augmented_train3.csv and dataframe is ats_data


#### Importing libraries and augmented train csv file

In [83]:
import numpy as np
import pandas as pd
data = pd.read_csv("augmented_train3.csv")
data.head(5)

Unnamed: 0,article,highlights
0,"by . associated press . published . 1411 est, ...","bishop john folda, of north dakota, is taking ..."
1,(cnn) with a breezy sweep of his pen president...,nina dos santos says europe must be ready to a...
2,fleetwood are the only team still to have a 10...,fleetwood top of league one after 20 win at sc...
3,hes been accused of making many a fashion faux...,prime minister and his family are enjoying an ...
4,by . daily mail reporter . published . 0115 es...,nba star calls for black and hispanic communit...


In [85]:

print('Train data: ',data.shape)
print('Validation data: ', val_df.shape)
print('Test data: ', test_df.shape)

Train data:  (480289, 2)
Validation data:  (15442, 2)
Test data:  (15441, 2)


In [39]:
data.head()

NameError: name 'data' is not defined

#### Tokenization of data

In [83]:
'''from transformers import AutoTokenizer

df = data

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('t5-base')

# Tokenize the article and highlight texts and encode them as numerical representations
inputs = tokenizer(df['article'].tolist(), padding='max_length', truncation=True, return_tensors='pt')
targets = tokenizer(df['highlights'].tolist(), padding='max_length', truncation=True, return_tensors='pt')'''



"from transformers import AutoTokenizer\n\ndf = data\n\n# Initialize the tokenizer\ntokenizer = AutoTokenizer.from_pretrained('t5-base')\n\n# Tokenize the article and highlight texts and encode them as numerical representations\ninputs = tokenizer(df['article'].tolist(), padding='max_length', truncation=True, return_tensors='pt')\ntargets = tokenizer(df['highlights'].tolist(), padding='max_length', truncation=True, return_tensors='pt')"

In [84]:
'''from transformers import AutoTokenizer

#df = data.head(66707)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('t5-base')

# Split the DataFrame into 8 batches
df_batches = np.array_split(data, 8)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('t5-base')

# Initialize an empty dictionary to store the tokenized data
data_dict = {}

# Iterate over the batches
for i, batch in enumerate(df_batches):
    # Tokenize the text
    inputs = tokenizer(batch['article'].tolist(), padding=True, truncation=True, return_tensors='pt')
    targets = tokenizer(batch['highlights'].tolist(), padding=True, truncation=True, return_tensors='pt')
    
    # Combine the tokenized article and highlights into a single dictionary
    #batch_data = {input_seq: target_seq for input_seq, target_seq in zip(inputs['input_ids'], targets['input_ids'])}
    batch_data = {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'decoder_input_ids': targets['input_ids'], 'decoder_attention_mask': targets['attention_mask']}
    #batch_data = {i: ats_data for i, ats_data in enumerate(zip(inputs['input_ids'], inputs['attention_mask'], targets['input_ids'], targets['attention_mask']))}

    
    # Add the batch data to the main dictionary
    data_dict.append(batch_data)

# The resulting dictionary maps each input sequence to its corresponding target sequence
print(data_dict)'''

"from transformers import AutoTokenizer\n\n#df = data.head(66707)\n\n# Initialize the tokenizer\ntokenizer = AutoTokenizer.from_pretrained('t5-base')\n\n# Split the DataFrame into 8 batches\ndf_batches = np.array_split(data, 8)\n\n# Initialize the tokenizer\ntokenizer = AutoTokenizer.from_pretrained('t5-base')\n\n# Initialize an empty dictionary to store the tokenized data\ndata_dict = {}\n\n# Iterate over the batches\nfor i, batch in enumerate(df_batches):\n    # Tokenize the text\n    inputs = tokenizer(batch['article'].tolist(), padding=True, truncation=True, return_tensors='pt')\n    targets = tokenizer(batch['highlights'].tolist(), padding=True, truncation=True, return_tensors='pt')\n    \n    # Combine the tokenized article and highlights into a single dictionary\n    #batch_data = {input_seq: target_seq for input_seq, target_seq in zip(inputs['input_ids'], targets['input_ids'])}\n    batch_data = {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'dec

In [None]:
data.article[0]

In [85]:
import nltk 
#Create a word tokenizer
tokenizer = nltk.word_tokenize

# Tokenize the article column in chunks
for i in range(0, len(data), 1000):
    data.loc[i:i+1000, "article_tokens"] = data.loc[i:i+1000, "article"].apply(tokenizer)



In [86]:
# Tokenize the highlights column in chunks
for i in range(0, len(data), 1000):
    data.loc[i:i+1000, "highlights_tokens"] = data.loc[i:i+1000, "highlights"].apply(tokenizer)

In [87]:
data.shape

(480289, 4)

In [88]:
data.columns

Index(['article', 'highlights', 'article_tokens', 'highlights_tokens'], dtype='object')

In [89]:
data.head(2)

Unnamed: 0,article,highlights,article_tokens,highlights_tokens
0,"by . associated press . published . 1411 est, ...","bishop john folda, of north dakota, is taking ...","[by, ., associated, press, ., published, ., 14...","[bishop, john, folda, ,, of, north, dakota, ,,..."
1,(cnn) with a breezy sweep of his pen president...,nina dos santos says europe must be ready to a...,"[(, cnn, ), with, a, breezy, sweep, of, his, p...","[nina, dos, santos, says, europe, must, be, re..."


In [2]:
data.article_tokens[0]

NameError: name 'data' is not defined

In [90]:
data.to_csv('tokenized.csv', index=False)

In [90]:
#TRIAL STARTS

# END OF FILE
## Continued in other ipynb (after_tokenization.ipynb)

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import numpy as np

# Create a TfidfVectorizer object for the article_tokens column
tfidf_article = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
tfidf_article.fit(data['article_tokens'])

# Fit and transform the article_tokens column
tfidf_article_tokens = tfidf_article.transform(data['article_tokens'])




# Create a TfidfVectorizer object for the highlights_tokens column
tfidf_highlights = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
tfidf_highlights.fit(data['highlights_tokens'])

# Fit and transform the highlights_tokens column
tfidf_highlights_tokens = tfidf_highlights.transform(data['highlights_tokens'])






In [None]:
batch_size = 500

# Standardize the article_tokens column
scaler_article = StandardScaler()
scaled_article = []

for i in range(0, len(tfidf_article_tokens.toarray()), batch_size):
    batch = tfidf_article_tokens.toarray()[i:i+batch_size]
    scaled_batch = scaler_article.fit_transform(batch)
    scaled_article.append(scaled_batch)

scaled_article = np.concatenate(scaled_article, axis=0)

# Standardize the highlights_tokens column
scaler_highlights = StandardScaler()
scaled_highlights = []

for i in range(0, len(tfidf_highlights_tokens.toarray()), batch_size):
    batch = tfidf_highlights_tokens.toarray()[i:i+batch_size]
    scaled_batch = scaler_highlights.fit_transform(batch)
    scaled_highlights.append(scaled_batch)

scaled_highlights = np.concatenate(scaled_highlights, axis=0)


In [None]:
# Standardize the article_tokens column
scaler_article = StandardScaler()
scaled_article = scaler_article.fit_transform(tfidf_article_tokens.toarray())

# Standardize the highlights_tokens column
scaler_highlights = StandardScaler()
scaled_highlights = scaler_highlights.fit_transform(tfidf_highlights_tokens.toarray())

In [None]:
# Perform dimensionality reduction using TruncatedSVD
svd_article = TruncatedSVD(n_components=500, random_state=42)
reduced_article = svd_article.fit_transform(scaled_article)

# Perform dimensionality reduction using TruncatedSVD
svd_highlights = TruncatedSVD(n_components=50, random_state=42)
reduced_highlights = svd_highlights.fit_transform(scaled_highlights)

In [None]:
# Store the reduced_article and reduced_highlights in the dataframe
data['reduced_article'] = list(reduced_article)
data['reduced_highlights'] = list(reduced_highlights)

#### Unused cell. Do not run below cell

In [None]:
#**********Unused cell. Do not run this**************


# Split the data into training and validation sets
'''from sklearn.model_selection import train_test_split
input_ids_train, input_ids_val, attn_masks_train, attn_masks_val, decoder_input_train, decoder_input_val, decoder_mask_train, decoder_mask_val = train_test_split(
    inputs['input_ids'], 
    inputs['attention_mask'], 
    targets['input_ids'], 
    targets['attention_mask'], 
    test_size=0.2, 
    random_state=42
)

# Combine the training and validation data into a single dictionary
train_data = {'input_ids': input_ids_train, 'attention_mask': attn_masks_train, 'decoder_input_ids': decoder_input_train, 'decoder_attention_mask': decoder_mask_train}
val_data = {'input_ids': input_ids_val, 'attention_mask': attn_masks_val, 'decoder_input_ids': decoder_input_val, 'decoder_attention_mask': decoder_mask_val}

print(len(val_data))'''

#### Combining tokenized article and highlights into dictionary

In [None]:
# Combine the input and target data into a single dictionary

ats_data_new = {}
ats_data_new = data_dict

#ats_data_new = {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'decoder_input_ids': targets['input_ids'], 'decoder_attention_mask': targets['attention_mask']}
#print(ats_data)


In [None]:
sample_dict = ats_data_new

# Convert the dictionary to a new dictionary with lists of values
data_list = {k: [v.tolist() for v in sample_dict[k]] for k in sample_dict.keys()}

# Create a DataFrame from the new dictionary
sample_df = pd.DataFrame(data_list)

# Print the DataFrame
print(sample_df.head())

In [None]:
sample_df.columns

In [None]:
sample_df.shape

#### Data Standardization using StandardScaler()

In [None]:
# Regularize the data using standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
ats_data_new['input_ids'] = scaler.fit_transform(ats_data_new['input_ids'])
#val_data['input_ids'] = scaler.transform(val_data['input_ids'])
print(ats_data_new['input_ids'])
print('-----------')

#### Data Reduction using truncatedSVD (dimensionality reduction)

In [None]:
# Alternatively, reduce the dimensionality of the data using SVD as data reduction technique
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=256)
ats_data_new['input_ids'] = svd.fit_transform(ats_data_new['input_ids'])

In [None]:
print(ats_data_new['input_ids'])

#### Converting the data into PyTorch tensors and creating a DataLoader

In [None]:
# Convert the data into PyTorch tensors and create a DataLoader
import torch
from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(torch.tensor(ats_data_new['input_ids']), torch.tensor(ats_data_new['attention_mask']), torch.tensor(ats_data_new['decoder_input_ids']), torch.tensor(ats_data_new['decoder_attention_mask']))

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [None]:
sample_dict2 = ats_data_new

# Convert the dictionary to a new dictionary with lists of values
data_list2 = {k: [v.tolist() for v in sample_dict2[k]] for k in sample_dict2.keys()}

# Create a DataFrame from the new dictionary
sample_df2 = pd.DataFrame(data_list2)

# Print the DataFrame
print(sample_df2.shape)

In [None]:
print(sample_df2.head(2))