In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


# Load the dataset
df_validation = pd.read_csv('D:/dataaa/cnn_dailymail/validation.csv')
df_train = pd.read_csv('D:/dataaa/cnn_dailymail/train.csv')
df_test = pd.read_csv('D:/dataaa/cnn_dailymail/test.csv')


In [2]:
df_train['article'] = df_train['article'].str.lower()
df_train['highlights'] = df_train['highlights'].str.lower()

df_validation['highlights'] = df_validation['highlights'].str.lower()
df_validation['article'] = df_validation['article'].str.lower()

df_test['article'] = df_test['article'].str.lower()
df_test['highlights'] = df_test['highlights'].str.lower()

In [3]:
import string
    
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df_train['article'] = df_train['article'].apply(remove_punctuation)
df_train['highlights'] = df_train['highlights'].apply(remove_punctuation)

df_validation['article'] = df_validation['article'].apply(remove_punctuation)
df_validation['highlights'] = df_validation['highlights'].apply(remove_punctuation)

df_test['article'] = df_test['article'].apply(remove_punctuation)
df_test['highlights'] = df_test['highlights'].apply(remove_punctuation)


In [4]:

df_train['article'] = df_train['article'].apply(word_tokenize)
df_train['highlights'] = df_train['highlights'].apply(word_tokenize)

df_validation['article'] = df_validation['article'].apply(word_tokenize)
df_validation['highlights'] = df_validation['highlights'].apply(word_tokenize)

df_test['article'] = df_test['article'].apply(word_tokenize)
df_test['highlights'] = df_test['highlights'].apply(word_tokenize)


In [5]:
df_train.drop(columns=['id'], inplace=True)
df_validation.drop(columns=['id'], inplace=True)
df_test.drop(columns=['id'], inplace=True)

In [6]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

df_train['article'] = df_train['article'].apply(remove_stopwords)
df_train['highlights'] = df_train['highlights'].apply(remove_stopwords)

df_validation['article'] = df_validation['article'].apply(remove_stopwords)
df_validation['highlights'] = df_validation['highlights'].apply(remove_stopwords)

df_test['article'] = df_test['article'].apply(remove_stopwords)
df_test['highlights'] = df_test['highlights'].apply(remove_stopwords)

In [7]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

df_train['article'] = df_train['article'].apply(lemmatize)
df_train['highlights'] = df_train['highlights'].apply(lemmatize)

df_validation['article'] = df_validation['article'].apply(lemmatize)
df_validation['highlights'] = df_validation['highlights'].apply(lemmatize)

df_test['article'] = df_test['article'].apply(lemmatize)
df_test['highlights'] = df_test['highlights'].apply(lemmatize)


In [8]:

# Concatenate the articles and highlights for topic modeling
train_corpus = df_train['article'].apply(lambda x: ' '.join(x)) + df_train['highlights'].apply(lambda x: ' '.join(x))
validation_corpus = df_validation['article'].apply(lambda x: ' '.join(x)) + df_validation['highlights'].apply(lambda x: ' '.join(x))
test_corpus = df_test['article'].apply(lambda x: ' '.join(x)) + df_test['highlights'].apply(lambda x: ' '.join(x))


In [9]:
# Convert the text data to TF-IDF matrices for each dataset
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Fit and transform on train set
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_corpus)


In [10]:
# #Transform validation and test sets
tfidf_matrix_validation = tfidf_vectorizer.transform(validation_corpus)
tfidf_matrix_test = tfidf_vectorizer.transform(test_corpus)


In [11]:
#Apply Latent Dirichlet Allocation (LDA) for each dataset
num_topics = 5  
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

In [48]:
# Fit and transform on train set
lda_train = lda.fit_transform(tfidf_matrix_train)
df_train['topic'] = lda_train.argmax(axis=1)

In [51]:
# Transform test set

lda_test = lda.transform(tfidf_matrix_test)
df_test['topic'] = lda_test.argmax(axis=1)

In [49]:
# Transform validation
lda_validation = lda.transform(tfidf_matrix_validation)
df_validation['topic'] = lda_validation.argmax(axis=1)

In [52]:
df_train_topic_1 = df_train[df_train['topic'] == 1]
print(df_train_topic_1)

                                                  article  \
4       [fleetwood, team, still, 100, record, sky, bet...   
8       [number, job, description, waiting, darren, fl...   
13      [louis, van, gaal, said, option, substitute, p...   
21      [everton, still, looking, add, two, new, playe...   
24      [glen, johnson, look, destined, leave, anfield...   
...                                                   ...   
287058  [frank, lampard, brought, curtain, 13year, che...   
287060  [cnn, late, sergio, ramos, goal, earned, real,...   
287100  [wayne, rooney, took, young, child, onto, wemb...   
287102  [real, madrid, looking, extend, 21game, winnin...   
287107  [neil, ashton, follow, neilashton, marco, reus...   

                                               highlights  topic  
4       [fleetwood, top, league, one, 20, win, scuntho...      1  
8       [tony, pulis, belief, saido, berahino, look, d...      1  
13      [manchester, united, beat, southampton, 21, st...      1  

In [53]:
df_test_topic_1 = df_test[df_test['topic'] == 1]
print(df_test_topic_1)

                                                 article  \
2      [dougie, freedman, verge, agreeing, new, twoye...   
3      [liverpool, target, neto, also, wanted, psg, c...   
8      [1120pm, former, world, champion, ken, doherty...   
12     [england, captain, alastair, cook, completed, ...   
16     [arsenal, midfield, trio, jack, wilshere, mike...   
...                                                  ...   
11459  [rarely, headline, matchday, magazine, prophet...   
11464  [vincent, kompany, emerged, injury, doubt, man...   
11467  [saviour, english, football, turn, longhaired,...   
11468  [blackpool, talk, sign, austria, defender, tho...   
11488  [brook, lopez, dominated, twin, brother, robin...   

                                              highlights  topic  
2      [nottingham, forest, close, extending, dougie,...      1  
3      [fiorentina, goalkeeper, neto, linked, liverpo...      1  
8      [reanne, evans, faced, ken, doherty, world, ch...      1  
12     [alastai

In [54]:
df_validation_topic_1 = df_validation[df_validation['topic'] == 1]
print(df_validation_topic_1)

                                                 article  \
3      [avid, rugby, fan, prince, harry, could, barel...   
5      [team, lowest, transfer, outlay, season, rose,...   
8      [ronda, rousey, recorded, fastestever, finish,...   
9      [celtic, defender, virgil, van, dijk, admits, ...   
12     [radamel, falcao, reduced, tear, manchester, u...   
...                                                  ...   
13359  [make, mistake, first, trophy, celtic, manager...   
13361  [match, martin, ’, neill, needed, win, convinc...   
13362  [per, mertesacker, say, frank, team, meeting, ...   
13364  [mo, farah, nationality, called, question, spa...   
13365  [wolf, kept, promotion, hope, alive, routine, ...   

                                              highlights  topic  
3      [prince, harry, attendance, england, crunch, m...      1  
5      [stoke, city, beat, everton, 20, move, eighth,...      1  
8      [ronda, rousey, submitted, cat, zingano, via, ...      1  
9      [celtic,

In [56]:
# Save the extracted data to an Excel file
output_excel_file = 'D:/dataaa/df_validation_topic_1.xlsx'
df_validation_topic_1.to_excel(output_excel_file, index=False)

In [57]:
# Save the extracted data from training set to an Excel file
output_train_excel_file = 'D:/dataaa/df_train_topic_1.xlsx'
df_train_topic_1.to_excel(output_train_excel_file, index=False)

In [58]:
# Save the extracted data from test set to an Excel file
output_test_excel_file = 'D:/dataaa/df_test_topic_1.xlsx'
df_test_topic_1.to_excel(output_test_excel_file, index=False)

In [65]:
print("Perplexity on validation set:", lda.perplexity(tfidf_matrix_validation))

Perplexity on validation set: 15917283.596167924
