# LDA model


In [None]:
# Conventional aliases: numpy -> np, pandas -> pd, pyplot -> plt.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


### Mount google drive

In [None]:
# Mount Google Drive at /content/drive so later cells can read the CSV files
# stored under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### import dataset

In [None]:
# Load stories_cleaned_tokenized.csv from the mounted Drive into `df`.
df =  pd.read_csv('/content/drive/MyDrive/topic extraction/data/stories_cleaned_tokenized.csv')

### Check the dataset to get an idea of its contents

In [None]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,body,topic,cleaned
0,hello and welcome to BBC News a woman who gave...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5...",hello welcome bbc news woman gave key evidence...
1,news now out of North Hollywood. A 14 yearold ...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913'],news north hollywood yearold girl found dead i...
2,homelessness his city's greatest failure. That...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '74e2...",homelessness city greatest failure message ton...
3,Minneapolis police officer Kim Potter guilty o...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5...",minneapolis police officer kim potter guilty d...
4,Judy an update now to the wildfires that wiped...,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9a06...",judy update wildfire wiped entire neighborhood...


In [None]:
# Preview the last rows of the dataset.
df.tail()

Unnamed: 0,body,topic,cleaned
5136,News. More local help will soon be on the way....,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9632...",news local help soon way group volunteer yonke...
5137,"with March 1, we start what is called Meteorol...",['9a06646a-e1df-4fca-888e-69658420556b'],march start called meteorological spring keep ...
5138,overseas. A massive Russian convoy is headed t...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913'],overseas massive russian convoy headed toward ...
5139,"And this morning, the National Hockey League s...","['9ff54ded-904b-4e0c-85ce-a3617f5cb913', 'b492...",morning national hockey league say suspending ...
5140,"thank you very much. Also this morning, the Un...","['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5...",thank much also morning united state asking tw...


In [None]:
# Column dtypes and non-null counts; use this to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5141 entries, 0 to 5140
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   body     5141 non-null   object
 1   topic    5141 non-null   object
 2   cleaned  5138 non-null   object
dtypes: object(3)
memory usage: 120.6+ KB


In [None]:
# Summary statistics (count / unique / top / freq) for the text columns.
df.describe()

Unnamed: 0,body,topic,cleaned
count,5141,5141,5138
unique,5141,178,5137
top,hello and welcome to BBC News a woman who gave...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913'],vigil today yearold alexis gabe oakley last se...
freq,1,882,2


### module to tokenize text

In [None]:
! pip install -U gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Download the NLTK data used by preprocess_text: the English stop-word list
# (stopwords.words('english')) and WordNet (for WordNetLemmatizer).
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

### Use LDA (unsupervised learning) to predict the title

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import CoherenceModel
from sklearn.model_selection import ParameterGrid

def preprocess_text(text):
    """Normalize one document and return its lemmatized tokens.

    Steps, in order: lowercase, strip URLs, strip digits, strip ASCII
    punctuation, collapse whitespace, split on whitespace, drop English stop
    words and tokens rejected by ``is_english_word``, then lemmatize.

    Args:
        text: The document as a ``str``. Any non-string value (for example
            the float ``NaN`` pandas uses for a missing cell) is treated as
            an empty document.

    Returns:
        list[str]: Lemmatized tokens; ``[]`` for missing or empty input.
    """
    # Missing cells arrive as float NaN, which has no .lower(); treat them
    # as empty documents instead of raising AttributeError.
    if not isinstance(text, str):
        return []

    # Lowercasing
    text = text.lower()

    # Removing URLs (dot escaped so only a literal "www." prefix matches;
    # an unescaped "." would also match e.g. "www " followed by a word)
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Removing numbers
    text = re.sub(r'\d+', '', text)

    # Removing punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Removing tabs and spaces
    text = re.sub(r'\s+', ' ', text)

    # Tokenization
    tokens = text.split()

    # Removing stopwords and non-English words, then lemmatizing
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and is_english_word(token)
    ]

    return tokens

def is_english_word(word):
    """Heuristically decide whether ``word`` could be an English word.

    This is a cheap character-level check, not a dictionary lookup: a word is
    accepted when it is non-empty and consists only of ASCII letters. Tokens
    containing non-ASCII characters (other scripts, accented letters, emoji,
    typographic quotes), digits or punctuation are rejected.

    Args:
        word: A single token (``str``).

    Returns:
        bool: ``True`` if the token is made up solely of ASCII letters.
    """
    # bool(word) keeps the return value a real bool for the empty string.
    return bool(word) and word.isascii() and word.isalpha()

def calculate_coherence(texts, num_topics, dictionary, passes=10, random_state=42):
    """Train an LDA model and return its ``c_v`` coherence score.

    Args:
        texts: Iterable of raw documents. Non-string entries (e.g. NaN from
            missing cells) are skipped, as are documents that yield no
            tokens after preprocessing.
        num_topics: Number of LDA topics to fit.
        dictionary: gensim ``Dictionary`` used both for the bag-of-words
            conversion and as the model's ``id2word`` mapping.
        passes: Number of training passes over the corpus.
        random_state: Seed forwarded to ``LdaModel`` so that scores obtained
            for different ``num_topics`` values are reproducible and
            comparable. Pass ``None`` for unseeded training.

    Returns:
        float: The ``c_v`` coherence of the fitted model.
    """
    # Preprocess the texts, skipping missing values that cannot be tokenized
    tokenized_texts = [preprocess_text(text) for text in texts if isinstance(text, str)]

    # Empty documents add nothing to training or to the coherence estimate
    tokenized_texts = [tokens for tokens in tokenized_texts if tokens]

    # Create a corpus (bag of words representation)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]

    # Create the LDA model (seeded for reproducibility)
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    # Calculate the coherence score
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    return coherence_model.get_coherence()

# Example usage with your dataset
# The 'cleaned' column contains missing values (see df.info() above); drop
# them so that preprocessing only ever receives strings.
dataset = df['cleaned'].dropna()

# Preprocess the texts and create a dictionary
tokenized_texts = [preprocess_text(text) for text in dataset]
dictionary = corpora.Dictionary(tokenized_texts)

# Define the parameter grid for num_topics (each candidate listed once, so no
# model is trained twice for the same setting)
param_grid = {'num_topics': [2, 3, 5, 8, 10]}

# Perform grid search, keeping the setting with the highest coherence
best_score = -float('inf')
best_params = None
for params in ParameterGrid(param_grid):
    coherence_score = calculate_coherence(dataset, params['num_topics'], dictionary)
    if coherence_score > best_score:
        best_score = coherence_score
        best_params = params

# Print the best parameters and coherence score
print("Best Parameters:", best_params)
print("Best Coherence Score:", best_score)


In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models

def preprocess_text(text):
    """Normalize one document and return its lemmatized tokens.

    Steps, in order: lowercase, drop non-ASCII characters, strip URLs, strip
    @mentions and #hashtags, collapse runs of three or more identical word
    characters to one, strip digits, strip ASCII punctuation, split on
    whitespace, drop English stop words, then lemmatize.

    Args:
        text: The document as a ``str``. Any non-string value (for example
            the float ``NaN`` pandas uses for a missing cell) is treated as
            an empty document.

    Returns:
        list[str]: Lemmatized tokens; ``[]`` for missing or empty input.
    """
    # Missing cells arrive as float NaN, which has no .lower()
    if not isinstance(text, str):
        return []

    # Lowercasing
    text = text.lower()

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7f]', r'', text)

    # Removing URLs (dot escaped so only a literal "www." prefix matches)
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove mentions and hashtags. This has to run before punctuation is
    # stripped: '@' and '#' are part of string.punctuation, so afterwards
    # there would be nothing left for these patterns to match.
    text = re.sub(r'@\S*', '', text)
    text = re.sub(r'#\S*', '', text)

    # Collapse characters repeated three or more times ("sooo" -> "so")
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    # Removing numbers
    text = re.sub(r'\d+', '', text)

    # Removing punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenization; str.split() already treats any run of whitespace
    # (spaces, tabs, newlines) as a single separator
    tokens = text.split()

    # Removing stopwords and lemmatizing
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return tokens

def extract_topics(texts, num_topics, no_below=5, no_above=0.5, passes=10,
                   random_state=42, model_path='lda.model'):
    """Fit an LDA model on ``texts`` and return its topics.

    Args:
        texts: Iterable of raw documents. Non-string entries (e.g. NaN from
            missing cells) are skipped.
        num_topics: Number of LDA topics to fit.
        no_below: Drop tokens that appear in fewer than this many documents.
        no_above: Drop tokens that appear in more than this fraction of the
            documents.
        passes: Number of training passes over the corpus.
        random_state: Seed forwarded to ``LdaModel`` so repeated runs give
            the same topics. Pass ``None`` for unseeded training.
        model_path: Where the fitted model is saved. Every call with the
            same path overwrites the previous file; pass ``None`` to skip
            saving.

    Returns:
        list[tuple[int, str]]: ``(topic_id, weighted_terms)`` pairs as
        produced by ``LdaModel.print_topics``.

    Raises:
        ValueError: If no token survives the ``no_below`` / ``no_above``
            filter, which typically happens on very small corpora.
    """
    # Preprocess the texts, skipping missing values that cannot be tokenized
    tokenized_texts = [preprocess_text(text) for text in texts if isinstance(text, str)]

    # Create a dictionary from the tokenized texts
    dictionary = corpora.Dictionary(tokenized_texts)

    # Filter out rare and common tokens
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    if len(dictionary) == 0:
        raise ValueError(
            "No tokens left after filter_extremes(no_below=%r, no_above=%r); "
            "relax the thresholds for small corpora." % (no_below, no_above)
        )

    # Create a corpus (bag of words representation)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]

    # Create the LDA model (seeded for reproducibility)
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    if model_path is not None:
        lda_model.save(model_path)

    # Extract the topics
    return lda_model.print_topics(num_topics=num_topics)

# Fit the topic model on the raw story bodies
dataset = df['body'].copy()

# Ask for ten topics
topics = extract_topics(texts=dataset, num_topics=10)

# Show each (topic id, weighted terms) pair on its own line
for topic in topics:
    print(topic)


(0, '0.031*"police" + 0.026*"say" + 0.016*"man" + 0.014*"year" + 0.013*"old" + 0.011*"happened" + 0.011*"suspect" + 0.010*"car" + 0.010*"shot" + 0.010*"shooting"')
(1, '0.031*"new" + 0.019*"city" + 0.017*"state" + 0.014*"york" + 0.014*"mandate" + 0.011*"governor" + 0.010*"san" + 0.010*"today" + 0.009*"say" + 0.008*"mayor"')
(2, '0.033*"fire" + 0.018*"people" + 0.014*"morning" + 0.012*"firefighter" + 0.012*"building" + 0.011*"subway" + 0.011*"one" + 0.010*"right" + 0.010*"mayor" + 0.010*"new"')
(3, '0.038*"ukraine" + 0.029*"president" + 0.025*"russian" + 0.023*"russia" + 0.018*"u" + 0.014*"ukrainian" + 0.013*"putin" + 0.012*"biden" + 0.010*"today" + 0.009*"troop"')
(4, '0.038*"school" + 0.017*"mask" + 0.014*"say" + 0.014*"student" + 0.014*"new" + 0.013*"case" + 0.012*"county" + 0.012*"covid" + 0.011*"teacher" + 0.010*"week"')
(5, '0.017*"going" + 0.012*"day" + 0.012*"see" + 0.012*"morning" + 0.011*"right" + 0.010*"well" + 0.009*"snow" + 0.009*"get" + 0.008*"area" + 0.008*"like"')
(6, '0

In [None]:
# Load stories_filled.csv from the mounted Drive into `story`.
story =  pd.read_csv('/content/drive/MyDrive/topic extraction/data/stories_filled.csv')

In [None]:
# Display the stories frame.
story

Unnamed: 0.1,Unnamed: 0,first_words,last_words,source_video_id,body
0,0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell ph...",18246,Well knew. This morning police need your help ...
1,1,a call. San Francisco firefighters rescued a man,all the way down to the ocean.,12387,a call. San Francisco firefighters rescued a m...
2,2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conserva...",16859,"Paul. Meanwhile, the state set a record in ene..."
3,3,Emergency crews in Florida continue to search for,"in Florida to more than 850,000 homes.",18246,Emergency crews in Florida continue to search ...
4,4,But even though the state never ordered rolling,feel since their power got cut out needlessly.,16859,But even though the state never ordered rollin...
5,5,"aid. And today, president Joe Biden and first",to view the destruction caused by Hurricane Ian.,18246,"aid. And today, president Joe Biden and first ..."
6,6,"In the last month, there have been numerous",are necessary to crack down on those hackers.,18246,"In the last month, there have been numerous da..."
7,7,and the warriors are playing the Boston Celtics,that. We'll see if they get it tonight.,12387,and the warriors are playing the Boston Celtic...
8,8,And San Leandro police searching for the person,footage to try to piece together more informat...,16859,And San Leandro police searching for the perso...
9,9,The updated Bivalent Coronavirus booster shot ...,on their vaccinations getting severe illness f...,16859,The updated Bivalent Coronavirus booster shot ...


In [None]:
# Fit a fresh 10-topic model on the `story` bodies and display its topics.
# NOTE(review): extract_topics saves to 'lda.model' on every call, so this
# overwrites the model saved by the earlier run on df['body'] — confirm that
# is intended.
# NOTE(review): extract_topics filters the vocabulary with no_below=5 and
# no_above=0.5; if `story` really has only the ten rows displayed above, very
# few tokens can survive that filter — verify the topics are meaningful.
extract_topics(story['body'], 10)

[(0,
  '0.106*"also" + 0.081*"year" + 0.080*"last" + 0.054*"really" + 0.054*"people" + 0.054*"well" + 0.054*"getting" + 0.054*"today" + 0.054*"morning" + 0.054*"police"'),
 (1,
  '0.077*"san" + 0.068*"see" + 0.065*"back" + 0.051*"week" + 0.050*"dont" + 0.050*"well" + 0.048*"going" + 0.042*"official" + 0.039*"go" + 0.038*"getting"'),
 (2,
  '0.029*"happened" + 0.029*"man" + 0.029*"two" + 0.029*"police" + 0.029*"also" + 0.029*"people" + 0.029*"official" + 0.029*"back" + 0.029*"getting" + 0.029*"area"'),
 (3,
  '0.152*"police" + 0.084*"like" + 0.075*"happened" + 0.074*"inside" + 0.074*"right" + 0.067*"two" + 0.047*"information" + 0.036*"people" + 0.035*"know" + 0.034*"morning"'),
 (4,
  '0.029*"year" + 0.029*"two" + 0.029*"like" + 0.029*"thats" + 0.029*"see" + 0.029*"going" + 0.029*"know" + 0.029*"right" + 0.029*"get" + 0.029*"inside"'),
 (5,
  '0.105*"year" + 0.104*"today" + 0.104*"right" + 0.104*"take" + 0.104*"week" + 0.104*"two" + 0.104*"area" + 0.010*"also" + 0.010*"last" + 0.010*"we