## Function for Identity Synthetic Dataset

In [None]:
# This function is used for the identity synthetic dataset construction 
# Input and output are exactly the same (also called No_BT in the files - meaning no Back translation was applied)  
# The length of the input is restricted to 250 words, because the max length that T5 can handle is 512. (Input + Output + Some formatting makes it close to 512)
def convert_to_t5_format(x, max_words=250):
  """Build one identity (No_BT) example: the same text as both source and target.

  The result has the form 'Translate: <text> .Traslated: <text>'.

  Args:
    x: Input sentence. Non-string values (e.g. None or NaN coming out of a
      pandas column) are returned unchanged, so a later ``dropna`` can
      discard them instead of this function raising AttributeError.
    max_words: Maximum number of whitespace-separated words kept from ``x``
      (default 250). Longer inputs are truncated and the kept words are
      re-joined with single spaces; shorter inputs are used verbatim.

  Returns:
    The formatted string, or ``x`` itself when ``x`` is not a string.
  """
  if not isinstance(x, str):
    return x
  # Split once and reuse the word list for both the length check and the cut.
  words = x.split()
  if len(words) > max_words:
    x = ' '.join(words[:max_words])
  # NOTE(review): the "Traslated" spelling and the missing space after " ." are
  # kept byte-for-byte; fixing them changes the generated dataset format, so
  # confirm with whatever consumes these files before touching the template.
  return 'Translate: ' + x + " ." + "Traslated: " + x

## Preprocessing TST dataset for Donald Trump

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset archives read
# below and the CSVs written later all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Extracting the dataset (the archive holds realdonaldtrump.csv and trumptweets.csv)
!unzip '/content/drive/MyDrive/685 Project/Trump_TST/archive (7).zip'

Archive:  /content/drive/MyDrive/685 Project/Trump_TST/archive (7).zip
  inflating: realdonaldtrump.csv     
  inflating: trumptweets.csv         


In [None]:
import pandas as pd
import numpy as np

In [None]:
#Loading Trump tweets
tweet_df = pd.read_csv('trumptweets.csv')

In [None]:
#Only require the tweets and no other information
tweet_df = tweet_df[['content']]

In [None]:
tweet_df.head()

Unnamed: 0,content
0,Be sure to tune in and watch Donald Trump on L...
1,Donald Trump will be appearing on The View tom...
2,Donald Trump reads Top Ten Financial Tips on L...
3,New Blog Post: Celebrity Apprentice Finale and...
4,"""My persona will never be that of a wallflower..."


In [None]:
len(tweet_df)

41122

In [None]:
import re
#Removing the links and the @-mentions from the tweet
def removal_chars_links(tweet):
  """Strip links and @-mentions from a tweet and return the remaining text.

  Two substitutions are applied in order:
    1. "http" followed by any run of non-whitespace characters is removed.
    2. Any whitespace-delimited token containing "@" is removed, together
       with at most one whitespace character that follows it.
  """
  patterns = (r"http\S+", r"\S*@\S*\s?")
  for pattern in patterns:
    tweet = re.sub(pattern, "", tweet)
  return tweet

In [None]:
#Applying the preprocessing functions
tweet_df['content'] = tweet_df['content'].apply(removal_chars_links)

In [None]:
#Converting all the inputs to lowercase
def convert_lowercase(input):
  """Return a lowercased copy of the given text."""
  lowered = input.lower()
  return lowered

In [None]:
tweet_df['content'] = tweet_df['content'].apply(convert_lowercase)

In [None]:
def max_length(dataset):
  """Return the largest word count found in the 'content' column.

  Words are whitespace-separated tokens. Rows with a missing value are
  ignored, matching how avg_length treats them (the previous
  ``apply(len)`` raised TypeError on NaN).

  Args:
    dataset: DataFrame with a text column named 'content'.

  Returns:
    The maximum number of words in a single row, as an int.

  Raises:
    ValueError: if the column has no non-missing text to measure.
  """
  word_counts = dataset['content'].str.split().str.len()
  # .max() skips NaN; int() turns the result back into a plain integer and
  # raises ValueError when everything was missing (max is NaN).
  return int(word_counts.max())

def avg_length(dataset):
  """Return the mean number of whitespace-separated words per row of 'content'."""
  tokens_per_row = dataset['content'].str.split()
  words_per_row = tokens_per_row.str.len()
  return words_per_row.mean()

In [None]:
#Statistics about the max tweet length and average tweet length 
print(max_length(tweet_df))
print(avg_length(tweet_df))

60
19.43324741014542


In [None]:
#Saving the preprocessed dataset
tweet_df.to_csv('/content/drive/MyDrive/685 Project/Trump_TST/TrumpTST.csv',index = False)

In [None]:
#Converting into an identical mapping dataset
#Missing/blank tweets are removed BEFORE formatting: a dropna after the transform can never remove anything,
#and tweets that contained only links/mentions are empty after cleaning and would become empty "Translate:  .Traslated: " pairs
has_text = tweet_df['content'].fillna('').str.strip().ne('')
tweet_df = tweet_df.loc[has_text].copy()
tweet_df['content'] = tweet_df['content'].apply(convert_to_t5_format)

In [None]:
tweet_df.to_csv('/content/drive/MyDrive/685 Project/Trump_TST/Trump_no_BT_TST.csv',index = False)

## Preprocess Taylor Swift Dataset

In [None]:
#Unzipping the song lyrics
!unzip '/content/drive/MyDrive/685 Project/Taylor_TST/archive (9).zip'

Archive:  /content/drive/MyDrive/685 Project/Taylor_TST/archive (9).zip
  inflating: taylor_swift_lyrics.csv  


In [None]:
#Loading the dataset 
taylor_df = pd.read_csv('/content/taylor_swift_lyrics.csv', encoding = "latin1")

In [None]:
taylor_df.head()

Unnamed: 0,artist,album,track_title,track_n,lyric,line,year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006
1,Taylor Swift,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006
2,Taylor Swift,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3,2006
3,Taylor Swift,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4,2006
4,Taylor Swift,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5,2006


In [None]:
#Creating a new dataset to hold the final processed song lyrics 
final_taylor_df = pd.DataFrame(columns = ['content'])

In [None]:
#If you want to use the lyrics of an entire song as a single input, use this dataset instead - but it leads to a very small dataset, not good enough for training a T5 model 

# final_taylor_df['content'] = taylor_df.groupby(['track_title','track_n','artist','album','year'])['lyric'].apply(lambda x: '. '.join(x)).reset_index(drop=True)

In [None]:
final_taylor_df['content'] = taylor_df['lyric']

In [None]:
final_taylor_df.head()

Unnamed: 0,content
0,He said the way my blue eyes shined
1,Put those Georgia stars to shame that night
2,"I said, ""That's a lie"""
3,Just a boy in a Chevy truck
4,That had a tendency of gettin' stuck


In [None]:
#Statistics about the average lyric length and the max lyric length
print(max_length(final_taylor_df))
print(avg_length(final_taylor_df))

18
7.248457424928013


In [None]:
#Converting to lower case
final_taylor_df['content'] = final_taylor_df['content'].apply(convert_lowercase)

In [None]:
#Saving the preprocessed song lyric dataset 
final_taylor_df.to_csv('/content/drive/MyDrive/685 Project/Taylor_TST/TaylorTST.csv',index = False)

In [None]:
len(final_taylor_df)

4862

In [None]:
#Creating the identically mapped dataset 
#Missing/blank lyric lines are removed BEFORE formatting: a dropna after the transform can never remove anything,
#because every formatted row is already a non-empty string
has_text = final_taylor_df['content'].fillna('').str.strip().ne('')
final_taylor_df = final_taylor_df.loc[has_text].copy()
final_taylor_df['content'] = final_taylor_df['content'].apply(convert_to_t5_format)

In [None]:
final_taylor_df.to_csv('/content/drive/MyDrive/685 Project/Taylor_TST/Taylor_no_BT_TST.csv',index = False)